In [None]:
'''
1.) Add Print Statements to preprocess function
    Finish implementing serve_model_results (finished testing preprocess tweets, just need to serve results)
    Move environment file to this directory

https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
'''

In [3]:
import sys

In [5]:
# Add the sibling utils directory to the import path so the project-local
# ml_functions module imported in the next cell can be resolved.
# NOTE(review): the path is relative to the kernel's working directory —
# confirm the notebook is always launched from its own folder.
sys.path.append("../utils/")

In [6]:
import ml_functions as mlf

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivalakshmanan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [8]:
# Load the raw tweet dataset from the project's data directory.
tweets_csv_path = '../data/ExtractedTweets.csv'
df = pd.read_csv(tweets_csv_path)

There are 42068 Democratic tweets and there are 44392 Republican tweets  
There are three columns: Party, Handle, and Tweet

In [None]:
# Run the project's preprocessing helper and unpack its four results:
# training inputs, validation inputs, training labels, validation labels.
# NOTE(review): later cells read mlf.unique_tokens, mlf.num_unique_tokens and
# mlf.max_tokenized_tweet_length — presumably populated by this call; confirm
# in ml_functions.
xTr, xVal, yTr, yVal = mlf.preprocess(df)

In [None]:
# Build the reverse lookup (index -> vocab entry) by swapping each
# key/value pair of mlf.unique_tokens.
unique_tokens_inv = dict(
    (index, token) for token, index in mlf.unique_tokens.items()
)

In [None]:
# Start from an empty Sequential model; the following cells append its layers
# one at a time.
model = tf.keras.Sequential()

In [None]:
# Embedding layer: maps each token index to a learned 8-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=mlf.num_unique_tokens,
    output_dim=8,
    input_length=mlf.max_tokenized_tweet_length,
)
model.add(embedding_layer)

In [None]:
# Collapse the per-token embedding vectors into a single flat feature vector
# per tweet so the Dense layer can consume it.
model.add(tf.keras.layers.Flatten())

In [None]:
# Single sigmoid unit: emits one score in [0, 1] per tweet.
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')
model.add(output_layer)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Build with an explicit input shape (any batch size, padded tweet length)
# instead of relying on Keras to infer it from the Embedding layer: a bare
# build() has no shape to work from when that inference is unavailable, which
# leaves the model unbuilt for the summary below.
model.build(input_shape=(None, mlf.max_tokenized_tweet_length))

In [None]:
# Summarize the model. summary() prints the layer table itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
model.summary()

In [None]:
# Train the model, holding out 10% of the training data for validation
# metrics after every epoch.
train_inputs = tf.convert_to_tensor(xTr)
history = model.fit(
    train_inputs,
    yTr,
    validation_split=0.1,
    epochs=50,
    batch_size=1000,
)

In [None]:
# Plot per-epoch accuracy: training curve first, validation curve second
# (the legend labels below rely on that order).
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# Test Model

In [None]:
# Score the held-out validation inputs; each score is the output of the
# model's final sigmoid unit.
scores = model.predict(xVal)

In [None]:
# ROC curve and AUC for the validation scores
from sklearn.metrics import roc_auc_score, roc_curve

# area under the ROC curve
auc = roc_auc_score(yVal, scores)
# ROC points together with the score threshold behind each point
fpr, tpr, thresholds = roc_curve(yVal, scores)

# dashed diagonal as the reference line
diagonal = np.arange(0, 1.01, 0.01)
plt.plot(diagonal, diagonal, linestyle='--', label='Baseline')
# the model's own curve
plt.plot(fpr, tpr, marker='.', label='Embedding')

# axis labels and the AUC annotation
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.text(0.55, 0.01, 'Embedding AUC Score=%.3f' % auc)

# legend, then render
plt.legend()
plt.show()

In [None]:
# Choose the decision threshold from the ROC points using the project helper.
# NOTE(review): the selection criterion lives in
# ml_functions.find_optimal_thresholds and is not visible here — confirm it
# returns a single scalar threshold.
optimal_threshold = mlf.find_optimal_thresholds(fpr,tpr,thresholds)

In [None]:
# Display the chosen threshold as the cell output.
optimal_threshold

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# scikit-learn's confusion matrix has true labels as rows and predicted
# labels as columns, so for binary labels it is laid out as
#[[tn, fp],
# [fn, tp]]
# Scores above the chosen threshold are counted as the positive class.
conf_matrix = confusion_matrix(yVal, (scores > optimal_threshold).astype('float'))

In [None]:
# Display the confusion matrix as the cell output.
conf_matrix

In [None]:
# Accuracy = correctly classified tweets (both diagonal cells) / all tweets.
true_negatives = conf_matrix[0, 0]
true_positives = conf_matrix[1, 1]
print("Accuracy: ", (true_negatives + true_positives) / conf_matrix.sum())

In [None]:
# Recall = true positives / all actual positives (second row of the matrix).
true_positives = conf_matrix[1, 1]
actual_positives = conf_matrix[1, :].sum()
print("Recall: ", true_positives / actual_positives)

In [None]:
# Precision = true positives / all predicted positives (second column).
true_positives = conf_matrix[1, 1]
predicted_positives = conf_matrix[:, 1].sum()
print("Precision: ", true_positives / predicted_positives)

In [None]:
# Save the trained model under the project's src directory.
# NOTE(review): the target has no file extension; some Keras versions only
# accept paths ending in `.keras` or `.h5` — confirm against the installed
# version and whatever code loads this model.
model.save('../src/demsvsreps_embedding')

# Save Model Parameters to config file

In [None]:
import json

In [None]:
# Values to persist in the config file: the decision threshold plus the
# vocabulary and length settings read from ml_functions.
parameters = {
    # np.float32 cannot be JSON-serialized, np.float64 can
    'optimal_threshold': np.float64(optimal_threshold),
    'num_unique_tokens': mlf.num_unique_tokens,
    'unique_tokens': mlf.unique_tokens,
    'max_tokenized_tweet_length': mlf.max_tokenized_tweet_length,
}

In [None]:
# Serialize the parameters dictionary to the project's JSON config file,
# overwriting any previous contents.
with open('../config/config.json', 'w') as config_file:
    json.dump(parameters, fp=config_file)