In [2]:
import numpy as np
import pandas as pd
import pickle

from keras.models import load_model
from keras.models import Sequential

from keras.layers import LSTM
from keras.layers import Dense

from keras.callbacks import EarlyStopping

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
# convert an array of values into a data_set matrix
def create_data_set(_data_set, _look_back=1, look_forward=10):
    """Cut a series into (input window, target window) pairs.

    The window starting at position ``s`` uses the ``_look_back`` values
    ``_data_set[s : s + _look_back]`` as input and the ``look_forward``
    values that immediately follow them as target.

    Args:
        _data_set: indexable, sliceable sequence of values.
        _look_back: number of values in each input window.
        look_forward: number of values in each target window.

    Returns:
        Tuple ``(data_x, data_y)`` of numpy arrays with one row per window.
    """
    window_starts = range(len(_data_set) - _look_back - look_forward)
    inputs = [_data_set[s:s + _look_back] for s in window_starts]
    targets = [
        [_data_set[s + _look_back + offset] for offset in range(look_forward)]
        for s in window_starts
    ]
    return np.array(inputs), np.array(targets)

# get the internal representation of the LSTM while predicting
def get_internal_representation(sentence):
    """Return the LSTM hidden states for the final input window of `sentence`.

    The window is the slice ``sentence[-look_back-look_forward:-look_forward]``.
    It is reshaped to ``(1, look_back, 1)`` and passed to
    ``sequenceModel.predict``; the first (and only) batch element of the
    prediction is returned.

    Reads the module-level names ``look_back``, ``look_forward`` and
    ``sequenceModel``.

    Args:
        sentence: sequence of values, long enough to hold one full window.

    Returns:
        ``sequenceModel.predict(window)[0]``.

    Raises:
        ValueError: if the window cannot be reshaped to ``(1, look_back, 1)``,
            e.g. because `sentence` is too short.
    """
    begin = sentence[-look_back-look_forward:-look_forward]
    try:
        new = np.array(begin).reshape(1, look_back, 1)
    except ValueError as exc:
        # Previously a bare `except` printed the input and carried on, which
        # then failed on the unbound name `new` and hid the real cause.
        raise ValueError(
            'cannot build a window of look_back={} from a sequence of length {} '
            '(look_forward={})'.format(look_back, len(sentence), look_forward)
        ) from exc
    seq = sequenceModel.predict(new)
    return seq[0]

In [10]:
# variables
# These four settings select which saved model and data files are loaded below
# and fix the shapes used when rebuilding the LSTM.
look_back = 10  # length of the input window fed to the LSTM
look_forward = 1  # number of trailing values skipped after the input window
units = 50  # number of LSTM units (size of each hidden state)
smoothing = '_s'  # suffix appended to the model and data file names

In [11]:
# model = load_model('../LSTM/lb200_lf50_u200_e200_s.h5'.format(look_back, look_forward, units))

In [12]:
# load model
# The file name is assembled from look_back, look_forward, units and smoothing;
# with the values set in this notebook that is '../LSTM/Models/lb10_lf1_u50_s.h5'.
model = load_model('../LSTM/Models/lb{}_lf{}_u{}{}.h5'.format(look_back, look_forward, units, smoothing))

In [13]:
# load data
def _load_split(split_name):
    """Unpickle one split file, e.g. ``split_name='Train_x'``.

    The path is '../Datapreprocessing/Data/Smoothing/<split_name>_LB:<look_back><smoothing>'.
    """
    path = '../Datapreprocessing/Data/Smoothing/{}_LB:{}{}'.format(split_name, look_back, smoothing)
    # NOTE: pickle.load can execute arbitrary code. Only load files that were
    # written by this project's own preprocessing step.
    with open(path, 'rb') as fp:
        return pickle.load(fp)


train_x = _load_split('Train_x')
val_x = _load_split('Val_x')
test_x = _load_split('Test_x')

train_y = _load_split('Train_y')
val_y = _load_split('Val_y')
test_y = _load_split('Test_y')

In [14]:
# normalize the data_set
scaler = MinMaxScaler(feature_range=(0, 1))


def _scale_each(sequences):
    """Min-max scale every non-empty sequence on its own, as a column vector.

    The shared `scaler` is re-fitted for each sequence, so every sequence is
    mapped to [0, 1] using its own minimum and maximum.
    """
    # NOTE(review): empty sequences are dropped here but the matching *_y
    # lists are not filtered; later cells look labels up by position.
    # Confirm that no split actually contains an empty sequence.
    return [
        scaler.fit_transform(np.array(seq).reshape(-1, 1))
        for seq in sequences
        if len(seq) != 0
    ]


train_x = _scale_each(train_x)
val_x = _scale_each(val_x)
test_x = _scale_each(test_x)

In [15]:
# adjusting the model to return internal states
# Rebuild only the recurrent layer with return_sequences=True so that predict()
# yields the hidden state at every time step, not just the last one.
sequenceModel = Sequential([
    LSTM(units, input_shape=(look_back, 1), return_sequences=True),
])

# Copy the trained parameters from the loaded model.
# NOTE(review): `weights` holds every array of the loaded model; presumably it
# has layers after the LSTM whose arrays are not needed here. Confirm that
# set_weights in the installed Keras version tolerates the extra entries.
weights = model.get_weights()
sequenceModel.set_weights(weights)

In [16]:
# Replace every sufficiently long training sequence by its LSTM hidden states.
# NOTE(review): shorter sequences are dropped here while train_y is left as is;
# labels are later looked up by position, so confirm nothing is filtered out.
train_x = np.array([
    get_internal_representation(seq)
    for seq in train_x
    if len(seq) > look_forward + look_back + 1
])

In [17]:
# sanity check: (sequences, time steps, LSTM units)
train_x.shape

(2775, 10, 50)

In [18]:
# hidden states of a single sequence: (time steps, LSTM units)
train_x[0].shape

(10, 50)

In [19]:
# Flatten (sequence, time step) into individual samples: one row per hidden
# state, each paired with the label of the sequence it came from.
# NOTE(review): train_y is indexed by position in the already-filtered train_x;
# if any sequence was dropped by the earlier filters the labels are shifted.
tmp_x = [state for seq_states in train_x for state in seq_states]
tmp_y = [
    train_y[seq_idx]
    for seq_idx, seq_states in enumerate(train_x)
    for _ in seq_states
]

In [20]:
# Stack the per-time-step hidden states into one 2-D array, stored as float16
# (presumably to save memory; half precision keeps about 3 significant digits).
train_x = np.array(tmp_x, dtype='float16')

(27750, 50)

In [21]:
# Labels repeated once per time step, in the same order as the rows of train_x.
train_y = np.array(tmp_y, dtype='float16')

In [23]:
# Replace every sufficiently long validation sequence by its LSTM hidden states.
# NOTE(review): shorter sequences are dropped here while val_y is left as is;
# labels are later looked up by position, so confirm nothing is filtered out.
val_x = [
    get_internal_representation(seq)
    for seq in val_x
    if len(seq) > look_forward + look_back + 1
]

In [24]:
# Flatten (sequence, time step) into individual samples: one row per hidden
# state, each paired with the label of the sequence it came from.
# NOTE(review): val_y is indexed by position in the already-filtered val_x;
# if any sequence was dropped by the earlier filters the labels are shifted.
tmp_x = [state for seq_states in val_x for state in seq_states]
tmp_y = [
    val_y[seq_idx]
    for seq_idx, seq_states in enumerate(val_x)
    for _ in seq_states
]

In [27]:
# Stack the flattened validation samples and their repeated labels as float16
# arrays (presumably to save memory; half precision is low-resolution).
val_x = np.array(tmp_x, dtype='float16')
val_y = np.array(tmp_y, dtype='float16')

In [28]:
# Replace every sufficiently long test sequence by its LSTM hidden states.
# NOTE(review): shorter sequences are dropped here while test_y is left as is;
# labels are later looked up by position, so confirm nothing is filtered out.
test_x = [
    get_internal_representation(seq)
    for seq in test_x
    if len(seq) > look_forward + look_back + 1
]

In [29]:
# Flatten (sequence, time step) into individual samples: one row per hidden
# state, each paired with the label of the sequence it came from.
# NOTE(review): test_y is indexed by position in the already-filtered test_x;
# if any sequence was dropped by the earlier filters the labels are shifted.
tmp_x = [state for seq_states in test_x for state in seq_states]
tmp_y = [
    test_y[seq_idx]
    for seq_idx, seq_states in enumerate(test_x)
    for _ in seq_states
]

In [30]:
# Stack the flattened test hidden states into one 2-D float16 array.
test_x = np.array(tmp_x, dtype='float16')

In [31]:
# Labels repeated once per time step, in the same order as the rows of test_x.
test_y = np.array(tmp_y, dtype='float16')

In [32]:
# Number of test samples per label column, and the share of column 1 in percent.
# test_y is float16, and a float16 running total stops growing at 2048
# (2048 + 1 rounds back to 2048), so the column sums are accumulated in
# float64 instead of adding the float16 entries one by one.
t1, t2, t3 = (
    float(total)
    for total in np.asarray(test_y)[:, :3].sum(axis=0, dtype=np.float64)
)
print(t1, t2, t3)
sum([1 for i in test_y if i[1] == 1.])/len(test_y)*100

870.0 920.0 1720.0


26.21082621082621

In [33]:
# if look_back == 100:
#     val_x = np.append(val_x[:17400], val_x[17900:26600], axis = 0)
#     val_y = np.append(val_y[:17400], val_y[17900:26600], axis = 0)
#     train_x = np.append(train_x[:63600], train_x[79800:], axis = 0)
#     train_y = np.append(train_y[:63600], train_y[79800:], axis = 0)
#     test_x = np.append(test_x[:17400], test_x[17900:26600], axis = 0)
#     test_y = np.append(test_y[:17400], test_y[17900:26600], axis = 0)

In [34]:
# if look_back == 10:
#     val_x = np.append(val_x[:420], val_x[440:650], axis = 0)
#     val_y = np.append(val_y[:420], val_y[440:650], axis = 0)
#     train_x = np.append(train_x[:3460], train_x[3580:5310], axis = 0)
#     train_y = np.append(train_y[:3460], train_y[3580:5310], axis = 0)
#     test_x = np.append(test_x[:420], test_x[440:650], axis = 0)
#     test_y = np.append(test_y[:420], test_y[440:650], axis = 0)

In [35]:
# if look_back == 100:
#     val_x = np.append(val_x[:4200], val_x[4400:6500], axis = 0)
#     val_y = np.append(val_y[:4200], val_y[4400:6500], axis = 0)
#     train_x = np.append(train_x[:34600], train_x[35800:53100], axis = 0)
#     train_y = np.append(train_y[:34600], train_y[35800:53100], axis = 0)
#     test_x = np.append(test_x[:4200], test_x[4400:6500], axis = 0)
#     test_y = np.append(test_y[:4200], test_y[4400:6500], axis = 0)

In [36]:
# if look_back == 200:
#     val_x = np.append(val_x[:8400], val_x[8800:13000], axis = 0)
#     val_y = np.append(val_y[:8400], val_y[8800:13000], axis = 0)
#     train_x = np.append(train_x[:69200], train_x[71600:106200], axis = 0)
#     train_y = np.append(train_y[:69200], train_y[71600:106200], axis = 0)
#     test_x = np.append(test_x[:8400], test_x[8800:13000], axis = 0)
#     test_y = np.append(test_y[:8400], test_y[8800:13000], axis = 0)

In [37]:
# train_x = train_x.reshape(train_x.shape[0], train_x.shape[1], 1)
# train_y = train_y.reshape(train_y.shape[0], train_y.shape[1])

# val_x = val_x.reshape(val_x.shape[0], val_x.shape[1], 1)
# val_y = val_y.reshape(val_y.shape[0], val_y.shape[1])

# test_x = test_x.reshape(test_x.shape[0], test_x.shape[1], 1)
# test_y = test_y.reshape(test_y.shape[0], test_y.shape[1])

In [38]:
# sanity check after flattening: (samples, LSTM units)
train_x.shape

(27750, 50)

In [47]:
# Stop training once validation accuracy has not improved for 5 epochs.
callback = [EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='auto')]

# Diagnostic classifier: a small MLP that maps one LSTM hidden state
# (`units` values) to three class scores.
diagnostic_classifier = Sequential()
diagnostic_classifier.add(Dense(200, input_dim=units, activation='relu'))
# Only the first layer needs an input size; the second hidden layer takes the
# 200 outputs of the first, so the former `input_dim=units` here was redundant
# and misleading.
diagnostic_classifier.add(Dense(200, activation='relu'))
# Linear outputs trained with mean squared error against the label vectors.
diagnostic_classifier.add(Dense(3))
diagnostic_classifier.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
diagnostic_classifier.fit(
    train_x,
    train_y,
    epochs=50,
    verbose=1,
    batch_size=512,
    callbacks=callback,
    validation_data=(val_x, val_y),
)

Train on 27750 samples, validate on 3530 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


<keras.callbacks.callbacks.History at 0x7fd286622550>

In [73]:
# Score the diagnostic classifier on the flattened test hidden states.
test_predict = diagnostic_classifier.predict(test_x)


def _class_accuracy(class_idx):
    """Percentage of samples labelled `class_idx` whose top score is that class.

    Returns NaN when the test set holds no sample of the class, instead of
    failing with a division by zero.
    """
    is_class = test_y[:, class_idx] == 1.
    n_class = int(is_class.sum())
    if n_class == 0:
        return float('nan')
    class_scores = test_predict[is_class]
    hits = class_scores[:, class_idx] == class_scores.max(axis=1)
    return int(hits.sum()) / n_class * 100


accuracy_trump = _class_accuracy(0)
accuracy_sarcasm = _class_accuracy(1)
accuracy_baldwin = _class_accuracy(2)

# The average previously referenced the undefined name `accuracy_satire`,
# which only worked when a stale variable was left in the kernel.
accuracy_avg = (accuracy_trump + accuracy_sarcasm + accuracy_baldwin) / 3

# Class index = position of the largest entry (first one wins on ties).
y_true = np.argmax(test_y, axis=1).tolist()
y_pred = np.argmax(test_predict, axis=1).tolist()

target_names = ['Trump',  'Sarcasm','Baldwin']
# Ask for the report as a dict rather than slicing the printed text by token
# position; `labels` keeps all three rows present even if a class is missing.
report = classification_report(
    y_true, y_pred, labels=[0, 1, 2], target_names=target_names, output_dict=True
)


def _report_row(row_name):
    """Precision, recall and f1 of one report row, rounded like the text report."""
    return [round(report[row_name][metric], 2) for metric in ('precision', 'recall', 'f1-score')]


trump = _report_row('Trump') + [round(accuracy_trump, 1)]
sarcasm = _report_row('Sarcasm') + [round(accuracy_sarcasm, 1)]
baldwin = _report_row('Baldwin') + [round(accuracy_baldwin, 1)]
# The last row uses the support-weighted average, the same row the previous
# text parsing picked up.
avg = _report_row('weighted avg') + [round(accuracy_avg, 1)]

# Rows: Trump, Sarcasm, Baldwin, weighted average.
# Columns: precision, recall, f1-score, per-class accuracy in percent.
final = np.array([trump, sarcasm, baldwin, avg], dtype='float')
print(final)

[[ 0.23  0.06  0.09  5.5 ]
 [ 0.27  0.56  0.36 56.3 ]
 [ 0.52  0.41  0.46 40.6 ]
 [ 0.38  0.36  0.34 34.2 ]]
