1. Obtain the earnings call transcript for Tesla.
2. Parse and format the file into the JSON format described earlier.
3. Each team should read the transcript and label the paragraphs with (+/-/neutral)
sentiments.
4. Pass this file to the final model you selected. How does it perform (Show confusion
matrix)?
5. On November 25th, share your labeled JSON file for the Tesla report with the class.
6. Pass the 4 other files (from other teams) to your model. How does it perform (Show
confusion matrix)?
7. Discuss what you learnt from this exercise.

In [1]:
# -*- coding: utf-8 -*-

import json
import os
import sys

import numpy as np
import pandas as pd
import sklearn.metrics
from keras.datasets import imdb
from keras.initializers import Constant
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.models import Model, load_model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Every tokenised paragraph is padded/truncated to this many word indices
# (passed as `maxlen` to pad_sequences) before it is fed to the model.
MAX_SEQUENCE_LENGTH = 800

# Data folder, derived from the base directory (empty string = current directory).
BASE_DIR = ''
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'data')

def _load_labeled_transcripts(filepath):
    """Read a labelled transcript CSV and return ``(texts, labels)``.

    The CSV must provide a ``text`` column and a ``sentiment`` column whose
    values are ``positive``, ``negative`` or ``neutral`` (case and surrounding
    whitespace are ignored). Neutral rows are dropped; the remaining labels
    are encoded as 1 (positive) and 0 (negative).

    Raises:
        ValueError: if a non-neutral row has a label other than
            positive/negative (including a missing label).
    """
    df_transcripts = pd.read_csv(filepath)

    # Work on a derived Series instead of assigning into a filtered slice of
    # the frame, which triggers pandas' SettingWithCopyWarning.
    sentiment = df_transcripts['sentiment'].str.strip().str.lower()
    keep = sentiment != 'neutral'
    sentiment = sentiment[keep]

    # Explicit mapping: anything that is not exactly positive/negative becomes
    # NaN and is reported, rather than being partially rewritten by a
    # substring replace and failing later inside astype().
    label_ids = sentiment.map({'positive': 1, 'negative': 0})
    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = sorted(set(sentiment[unmapped].astype(str)))
        raise ValueError("Unexpected sentiment label(s) in " + str(filepath)
                         + ": " + ", ".join(unknown))

    texts = df_transcripts.loc[keep, 'text'].tolist()
    labels = label_ids.astype('int32').tolist()
    return texts, labels


def _texts_to_index_sequences(texts, word_index):
    """Tokenise each text and convert its tokens to integer word indices.

    Tokens missing from ``word_index`` are mapped to 0, the same value
    pad_sequences uses for padding by default.
    """
    sequences = []
    for text in texts:
        tokens = text_to_word_sequence(
            text,
            # Same characters as before (tab/newline are not filtered); the
            # backslash is now escaped so the literal is a valid string.
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
            lower=True,  # all words in word_index are in lower case
            split=' ')
        sequences.append([word_index.get(token, 0) for token in tokens])
    return sequences


def _report_metrics(y_true_labels, y_pred_labels):
    """Print the confusion matrix, accuracy, precision and recall."""
    cm = sklearn.metrics.confusion_matrix(y_true_labels, y_pred_labels,
                                          labels=["positive", "negative"])
    print("Confusion Matrix: ")
    print("|TP:"+ str(cm[0][0]) + " | FN:"+str(cm[0][1])+'|')
    print("|FP:"+ str(cm[1][0]) + " | TN:"+str(cm[1][1])+'|')

    # Support-weighted averages over the classes. Note that weighted recall
    # is mathematically equal to accuracy, so those two printed values match.
    precision = sklearn.metrics.precision_score(y_true_labels, y_pred_labels, average='weighted')
    recall = sklearn.metrics.recall_score(y_true_labels, y_pred_labels, average='weighted')
    accuracy = sklearn.metrics.accuracy_score(y_true_labels, y_pred_labels)
    print()
    print("Accuracy: " + str(accuracy))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))


def test_on_transcripts(filepath):
    """Evaluate the saved LSTM sentiment model on one labelled transcript CSV.

    Steps: load the non-neutral paragraphs and their labels, tokenise them,
    map the tokens to IMDB word indices, pad to MAX_SEQUENCE_LENGTH, predict
    with the model stored in 'EX2_RNN(LSTM)_trained_model', then print the
    confusion matrix and the accuracy / precision / recall.

    Args:
        filepath: path of a CSV with 'text' and 'sentiment' columns.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: if the CSV contains an unexpected sentiment label.
    """
    # Load the data first so a bad or empty file fails fast, before the
    # (slower) word index and model are loaded.
    transcripts_x, transcripts_y = _load_labeled_transcripts(filepath)
    if not transcripts_x:
        print("No positive/negative rows found in " + str(filepath) + "; nothing to evaluate.")
        return

    # NOTE(review): imdb.load_data() shifts word indices by index_from=3 by
    # default, while get_word_index() returns the unshifted values. If the
    # model was trained on load_data() defaults, the indices built here are
    # off by 3 - confirm against the training notebook. Likewise, indices
    # above the model's vocabulary size are not clipped here - TODO confirm.
    word_index = imdb.get_word_index()

    # load pre-trained model (compiled already)
    model = load_model('EX2_RNN(LSTM)_trained_model')

    sequences = _texts_to_index_sequences(transcripts_x, word_index)
    transcripts_x_index = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # Predicted class = index of the largest output per row.
    y_prob = model.predict(transcripts_x_index)
    y_classes = y_prob.argmax(axis=-1)

    # Translate integer classes to label names. The true labels are already
    # integer class ids, so no one-hot round trip is needed.
    labels_index_2 = {0:'negative',1:'positive',2:'neutral'}
    y_val_labels = [labels_index_2[i] for i in transcripts_y]
    y_classes_labels = [labels_index_2[i] for i in y_classes]

    _report_metrics(y_val_labels, y_classes_labels)

## Testing

In [6]:
# Folder (relative to the working directory) that holds the labelled transcript CSVs.
test_data_base_path = 'test data/'

### Tesla

In [3]:
# Our own team's labelled Tesla transcript; os.path.join avoids depending on
# a trailing separator in the base path.
tesla_path = os.path.join(test_data_base_path, 'TESLA_earnings_call_transcript.csv')

test_on_transcripts(tesla_path)

Confusion Matrix: 
|TP:54 | FN:49|
|FP:18 | TN:13|

Accuracy: 0.5
Precision: 0.625
Recall: 0.5


### Google

In [7]:
# os.path.join avoids depending on a trailing separator in the base path.
google_path = os.path.join(test_data_base_path, 'google.csv')
test_on_transcripts(google_path)

Confusion Matrix: 
|TP:43 | FN:33|
|FP:6 | TN:3|

Accuracy: 0.5411764705882353
Precision: 0.7934573829531812
Recall: 0.5411764705882353


### Amazon

In [8]:
# os.path.join avoids depending on a trailing separator in the base path.
amazon_path = os.path.join(test_data_base_path, 'amazon.csv')
test_on_transcripts(amazon_path)

Confusion Matrix: 
|TP:12 | FN:10|
|FP:1 | TN:3|

Accuracy: 0.5769230769230769
Precision: 0.8165680473372782
Recall: 0.5769230769230769


### Netflix

In [9]:
# os.path.join avoids depending on a trailing separator in the base path.
netflix_path = os.path.join(test_data_base_path, 'netflix.csv')
test_on_transcripts(netflix_path)

Confusion Matrix: 
|TP:38 | FN:23|
|FP:6 | TN:7|

Accuracy: 0.6081081081081081
Precision: 0.7529074529074529
Recall: 0.6081081081081081


### Microsoft

In [10]:
# os.path.join avoids depending on a trailing separator in the base path.
microsoft_path = os.path.join(test_data_base_path, 'microsoft.csv')
test_on_transcripts(microsoft_path)

Confusion Matrix: 
|TP:55 | FN:36|
|FP:2 | TN:1|

Accuracy: 0.5957446808510638
Precision: 0.9349797726057524
Recall: 0.5957446808510638
