In [174]:
# -*- coding: utf-8 -*-
#!/usr/bin/env python3

import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils import to_categorical


In [175]:
def read_train(data_path,nrows=None):
    df = pd.read_csv(data_path,nrows = nrows)
    df['textID']=df['textID'].astype(str)
    df['text']=df['text'].astype(str)
    df['selected_text']=df['selected_text'].astype(str)
    df['sentiment']=df['sentiment'].astype(str)
    return df
train_df = read_train("C:\\Users\\PANGPENGHUI\\Desktop\\ESG\\ESG5BD\\machine_learning\\tweet_tp3\\train.csv",nrows=100)

train_df.shape


(100, 4)

In [176]:
words_set = set()
def get_words_index(documents,build=True, save=True, index_path= "index.pickle"):
    '''
    documents: pd.Series, string
    '''
    
    tknzr = TweetTokenizer()    
    def build_index(row):
        global words_set
        tokens = tknzr.tokenize(row.lower())
        words_set = words_set.union(set(tokens))
        return tokens
    
    if build:
        documents.apply(build_index)
        words_index = {k+1:v for k,v in enumerate(words_set)}
        if save:            
            with open(index_path, 'wb') as out:
                pickle.dump(words_index, out)
    
    else:
         words_index = pickle.load(out, encoding = "utf-8")
                
    return words_index


In [177]:
words_index = get_words_index(train_df['text'], build=True, save=True)
print(words_index)



In [178]:
class_index = {k:v for k,v in enumerate(train_df['sentiment'].unique()) }

print(class_index)

{0: 'neutral', 1: 'negative', 2: 'positive'}


In [179]:
def build_sequences (documents, index):
    tknzr = TweetTokenizer()
    sequences = []
    for doc in documents:
        sequences.append([index[w] for w in tknzr.tokenize(doc.lower())])
    return sequences

In [180]:
reverse_word_index = {v:k for k, v in words_index.items()}
sequences = build_sequences(train_df['text'], reverse_word_index)
print(sequences)

[[504, 440, 173, 470, 634, 269, 614, 504, 441, 268], [142, 133, 504, 594, 584, 347, 413, 610, 73, 507, 552, 552, 552], [84, 340, 168, 598, 272, 407], [464, 628, 552, 59, 272, 381], [471, 579, 570, 570, 570, 269, 323, 213, 440, 371, 450, 11, 603, 317, 37, 483, 188, 599, 486], [612, 129, 100, 202, 87, 499, 37, 366, 433, 633, 317, 279], [176, 28, 499, 37, 152, 282, 263, 235, 183, 168, 143, 58, 261, 521], [61, 55], [357, 579, 347], [222, 552, 49, 158, 407, 199, 509, 230, 580, 333, 462, 407, 623, 168, 20, 547, 552, 49, 81], [223, 398, 223, 504, 331, 380, 240, 602, 269, 504, 495, 37, 512, 282, 105, 290, 504, 440, 329, 498, 629, 354, 84, 124, 261, 339], [504, 259, 259, 32, 37, 359, 331, 578, 412, 436, 443], [84, 44, 168, 313, 596, 63, 317, 416], [504, 89, 380, 131, 380, 225, 621, 193, 504, 270, 84, 192, 333], [382, 382, 273, 37, 360, 565, 308], [233, 19, 269, 504, 510, 543], [426, 440, 281, 269, 516, 380, 620, 114, 223, 188, 456, 570, 524, 570], [504, 440, 455, 581, 423, 499, 37, 43, 52, 517,

In [181]:
def read_embeddings(data_path, index, build=True, save=True):
    embeddings = np.zeros((len(index) + 1, 25))
    reverse_index = {v: k for k, v in index.items()}
    if build:
        with open(data_path, "r", encoding="utf-8") as file:
            for line in file:
                tokens = line.split(" ")
                w = tokens[0]
                zw = tokens[1:]
                try:
                    embeddings[reverse_index[w]] = np.array([float(zj) for zj in zw])
                except KeyError :
                    continue
        if save:
            with open (data_path, 'wb') as out:
                pickle.dump(embeddings, out)
    else:
        with open(data_path,'rb') as out:
            embeddings = pickle.load(out)
    return embeddings

In [271]:
def read_test(data_path,nrows=None):
    test=pd.read_csv(data_path,nrows = nrows)
    test['text']=test['text'].astype(str)
    test['sentiment']=test['sentiment'].astype(str)
    return test
test_df= read_test("C:\\Users\\PANGPENGHUI\\Desktop\\ESG\\ESG5BD\\machine_learning\\tweet_tp3\\test.csv",nrows=100)



In [272]:
embeddings_matrix = read_embeddings("C:\\Users\\PANGPENGHUI\\Desktop\\ESG\\ESG5BD\\machine_learning\\tweet_tp3\\glove.twitter.27B.25d.txt",index=words_index, build=False, save=False)
embeddings_matrix.shape

(635, 25)

In [273]:
T= 8
x = pad_sequences(sequences,maxlen=T,padding="post")


print(x.shape)


(100, 8)


In [274]:

test_df["sentiement_index"] = test_df["sentiment"].map({"neutral":0,"negative":1,"positive":2})

In [275]:
from sklearn.model_selection import train_test_split
y = test_df['sentiement_index']
X_train, X_test, Y_train, Y_test = train_test_split(x, y, stratify=y, test_size=0.2)
X_train.shape
Y_train.shape

(80,)

In [276]:

model_LSTM=Sequential()
model_LSTM.add(Embedding(635,100,input_length=T))
model_LSTM.add(LSTM(64,dropout=0.2,return_sequences=True))
model_LSTM.add(LSTM(32,dropout=0.3,return_sequences=False))
model_LSTM.add(Dense(num_classes,activation='softmax'))

In [277]:
print(model_LSTM.summary())

Model: "sequential_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, 8, 100)            63500     
_________________________________________________________________
lstm_53 (LSTM)               (None, 8, 64)             42240     
_________________________________________________________________
lstm_54 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_26 (Dense)             (None, 3)                 99        
Total params: 118,255
Trainable params: 118,255
Non-trainable params: 0
_________________________________________________________________
None


In [283]:

model_LSTM.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_LSTM.fit(X_train,Y_train,batch_size=50, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1ac8fe70518>

In [301]:
Y_pred_LSTM  = model_LSTM.predict(X_test)
print(y_pred_LSTM)

[[0.22490026 0.53285474 0.24224502]
 [0.6009771  0.04960369 0.34941927]
 [0.65724725 0.22837064 0.11438217]
 [0.68571454 0.02076572 0.29351968]
 [0.8063541  0.10113939 0.09250654]
 [0.2918426  0.45949364 0.24866377]
 [0.6073806  0.12549911 0.2671202 ]
 [0.66176623 0.01595982 0.32227394]
 [0.61313224 0.16402133 0.22284646]
 [0.62844515 0.07234304 0.29921174]
 [0.74614114 0.15771978 0.09613916]
 [0.45799434 0.08019165 0.46181393]
 [0.93636423 0.00598844 0.05764726]
 [0.63055176 0.02350519 0.3459431 ]
 [0.33633843 0.2945846  0.36907697]
 [0.87740934 0.08004987 0.04254085]
 [0.3525549  0.4209472  0.22649792]
 [0.13797563 0.74731207 0.11471224]
 [0.39025906 0.22135492 0.38838607]
 [0.62139    0.07746754 0.30114248]]


In [302]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(Y_test, Y_pred_test))
print(accuracy_score(Y_test, Y_pred_test))

[[6 2 1]
 [5 1 0]
 [3 1 1]]
0.4
