# Import necessary dependencies

In [None]:
import numpy as np
from keras.layers import Dropout, Activation, Dense
from keras.models import Sequential
import model_evaluation_utils as meu
import utils
import matplotlib.pyplot as plt
from sklearn import metrics
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM
import xgboost as xgb


# Compact NumPy array printing: two decimal places, lines wrapped at 80 characters.
np.set_printoptions(linewidth=80, precision=2)


# Load normalized data

In [None]:
# Review texts and their sentiment labels for the train and test splits,
# read back from disk through the project helper `utils.readFromDisk`.
train_reviews, train_sentiments = (utils.readFromDisk('train_reviews'),
                                   utils.readFromDisk('train_sentiments'))
test_reviews, test_sentiments = (utils.readFromDisk('test_reviews'),
                                 utils.readFromDisk('test_sentiments'))

# Tokenized versions of the same reviews (used for the LSTM vocabulary below).
tokenized_train, tokenized_test = (utils.readFromDisk('tokenized_train'),
                                   utils.readFromDisk('tokenized_test'))

# Encoded sentiment labels (used as targets for the Keras models).
train_sentiments_encoded, test_sentiments_encoded = (
    utils.readFromDisk('train_sentiments_encoded'),
    utils.readFromDisk('test_sentiments_encoded'))

# Pre-computed GloVe document features for both splits.
train_glove_features, test_glove_features = (
    utils.readFromDisk('train_glove_features'),
    utils.readFromDisk('test_glove_features'))


# Traditional Supervised Machine Learning Models
## Model Training, Prediction and Performance Evaluation

In [None]:

from sklearn.linear_model import SGDClassifier, LogisticRegression

# L2-regularised logistic regression (default solver is deterministic).
lr = LogisticRegression(penalty='l2', max_iter=500, C=1)
# Linear SVM trained with SGD (hinge loss). SGD shuffles the training data each
# epoch, so the seed is fixed to make the reported metrics reproducible,
# matching the random_state used for the XGBoost model further down.
svm = SGDClassifier(loss='hinge', max_iter=500, random_state=42)

In [None]:
# Logistic Regression on the GloVe features: train/predict through the
# project helper, then report the evaluation metrics on the test split.
lr_predictions = meu.train_predict_model(
    classifier=lr,
    train_features=train_glove_features,
    train_labels=train_sentiments,
    test_features=test_glove_features,
    test_labels=test_sentiments,
)
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=lr_predictions,
    classes=[1, 0],
)

In [None]:
# Linear SVM (SGD, hinge loss) on the GloVe features: train/predict through
# the project helper, then report the evaluation metrics on the test split.
svm_predictions = meu.train_predict_model(
    classifier=svm,
    train_features=train_glove_features,
    train_labels=train_sentiments,
    test_features=test_glove_features,
    test_labels=test_sentiments,
)
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_predictions,
    classes=[1, 0],
)

# Newer Supervised Deep Learning Models



## Modeling with deep neural networks 

### Building Deep neural network architecture

In [None]:
def construct_deepnn_architecture(num_input_features):
    """Build and compile a fully connected two-class classifier.

    The network is three blocks of Dense(512, relu) + Dropout(0.2) followed by
    a Dense(2) layer and a softmax activation. It is compiled with categorical
    cross-entropy, the Adam optimizer and the accuracy metric.

    Args:
        num_input_features: length of each input feature vector; used as the
            input shape of the first Dense layer.

    Returns:
        The compiled Keras Sequential model.
    """
    hidden_units = 512
    dropout_rate = 0.2

    # First hidden block carries the input shape; the next two are identical
    # apart from that.
    layer_stack = [
        Dense(hidden_units, activation='relu', input_shape=(num_input_features,)),
        Dropout(dropout_rate),
    ]
    for _ in range(2):
        layer_stack.append(Dense(hidden_units, activation='relu'))
        layer_stack.append(Dropout(dropout_rate))
    # Two output units turned into class probabilities by a softmax.
    layer_stack.append(Dense(2))
    layer_stack.append(Activation('softmax'))

    dnn_model = Sequential()
    for layer in layer_stack:
        dnn_model.add(layer)

    dnn_model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    return dnn_model

### Model Training, Prediction and Performance Evaluation


In [None]:
# Size the input layer from the feature matrix itself instead of hard-coding
# the embedding width, so the model always matches the loaded GloVe features.
# assumes train_glove_features is 2-D (samples x features) — TODO confirm
glove_dnn = construct_deepnn_architecture(num_input_features=np.shape(train_glove_features)[1])



In [None]:
batch_size = 100  # samples per gradient update
# Train the DNN on the GloVe features for 5 epochs, with shuffling enabled and
# 10% of the training data held out for validation.
glove_dnn.fit(
    x=train_glove_features,
    y=train_sentiments_encoded,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Label encoder saved earlier; used to map class indices back to sentiment labels.
le = utils.readFromDisk('label_encoder')
# `Sequential.predict_classes` has been removed from Keras. Taking the argmax
# over the softmax outputs is its documented replacement and yields the same
# class indices on every Keras version.
y_pred = np.argmax(glove_dnn.predict(test_glove_features), axis=-1)
dnn_predictions = le.inverse_transform(y_pred)

In [None]:
# Report the evaluation metrics for the DNN predictions on the test split.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=dnn_predictions,
    classes=[1, 0],
)


# XGBoost

In [None]:
# Gradient-boosted trees for binary classification: 500 trees of depth <= 5,
# with a fixed seed.
xgc = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    max_depth=5,
    base_score=0.5,
    random_state=42,
)
xgc.fit(train_glove_features, train_sentiments)

In [None]:
# The classifier was fitted on the GloVe feature matrix, so it must predict on
# the matching test features (not on the raw `test_reviews`).
xgc_predictions = xgc.predict(test_glove_features)



In [None]:
# Report the evaluation metrics for the XGBoost predictions on the test split.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=xgc_predictions,
    classes=[1, 0],
)


# LSTM


# Build Vocabulary Mapping (word to index)

In [None]:
from collections import Counter

# Count every token of the training corpus, then give each distinct token an
# integer id starting at 1 (in first-seen order).
token_counter = Counter(token for review in tokenized_train for token in review)
vocab_map = {token: token_id for token_id, token in enumerate(token_counter, start=1)}
max_index = np.max(list(vocab_map.values()))
# Reserved entries: id 0 for padding, and one id past the largest token id for
# tokens that are not in the vocabulary.
vocab_map.update({'PAD_INDEX': 0, 'NOT_FOUND_INDEX': max_index+1})
vocab_size = len(vocab_map)
# Show the vocabulary size and a small slice of the mapping.
print('Vocabulary Size:', vocab_size)
sample_entries = list(vocab_map.items())[10:20]
print('Sample slice of vocabulary map:', dict(sample_entries))

# Encode and Pad datasets & Encode prediction class labels

In [None]:
# Length (in tokens) of the longest training review; both splits are padded
# (or truncated) to this length.
review_lengths = [len(review) for review in tokenized_train]
max_len = np.max(review_lengths)

## Train reviews data corpus
# Every training token is in the vocabulary, so a direct lookup is safe.
train_X = []
for tokenized_review in tokenized_train:
    train_X.append([vocab_map[token] for token in tokenized_review])
train_X = sequence.pad_sequences(train_X, maxlen=max_len)

## Test reviews data corpus
# Tokens missing from the vocabulary (or mapped to the falsy id 0) fall back
# to the NOT_FOUND_INDEX id.
test_X = []
for tokenized_review in tokenized_test:
    test_X.append([vocab_map.get(token) or vocab_map['NOT_FOUND_INDEX']
                   for token in tokenized_review])
test_X = sequence.pad_sequences(test_X, maxlen=max_len)

# Report the padded length and the resulting matrix shapes.
print('Max length of train review vectors:', max_len)
print('Train review vectors shape:', train_X.shape, ' Test review vectors shape:', test_X.shape)

# Build the LSTM Model Architecture

In [None]:
EMBEDDING_DIM = 128 # dimension for dense embeddings for each token
LSTM_DIM = 64 # total LSTM units

# Embedding -> spatial dropout -> single LSTM layer -> 2-way classifier.
lstm = Sequential()
lstm.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_len))
lstm.add(SpatialDropout1D(0.2))
lstm.add(LSTM(LSTM_DIM, dropout=0.2, recurrent_dropout=0.2))
# The model is trained on `train_sentiments_encoded`, the same targets the
# DNN above fits with a 2-unit softmax and categorical cross-entropy
# (presumably one-hot with two columns — confirm). Two mutually exclusive
# classes call for a softmax output with categorical cross-entropy; two
# independent sigmoids with binary cross-entropy do not produce a probability
# distribution and make Keras report element-wise binary accuracy.
lstm.add(Dense(2, activation="softmax"))

lstm.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [None]:
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
lstm.summary()

# Visualize model architecture

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Convert the model to a Graphviz graph laid out left-to-right (shapes shown,
# layer names hidden) and display it inline as an SVG image.
lstm_graph = model_to_dot(lstm, rankdir='LR', show_shapes=True, show_layer_names=False)
SVG(lstm_graph.create(prog='dot', format='svg'))

# Train the model

In [None]:
batch_size = 100  # samples per gradient update
# Train the LSTM on the padded index sequences for 5 epochs, with shuffling
# enabled and 10% of the training data held out for validation.
lstm.fit(
    x=train_X,
    y=train_sentiments_encoded,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

# Predict and Evaluate Model Performance

In [None]:
# `Sequential.predict_classes` has been removed from Keras. For a model with
# two output units it was equivalent to the argmax over the predicted scores.
lstm_pred_test = np.argmax(lstm.predict(test_X), axis=-1)
# Map class indices back to sentiment labels with the label encoder `le`
# loaded in the DNN prediction cell.
lstm_predictions = le.inverse_transform(lstm_pred_test.flatten())

In [None]:
# Report the evaluation metrics for the LSTM predictions on the test split.
meu.display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=lstm_predictions,
    classes=[1, 0],
)



# Model Evaluation

# ROC curves

In [None]:
# All ROC curves are drawn on figure 1 (the figure `metricsAndROC` selects);
# clear that same figure so re-running the cell starts from a clean canvas.
plt.figure(1).clf()

color = ['blue', 'orange', 'red', 'green', 'coral',
             'grey', 'indigo', 'gold', 'lime', 'olive',
             'pink', 'navy', 'magenta', 'yellow', 'tomato',
             'turquoise', 'yellowgreen', 'maroon', 'lightblue']
# Per-model metric lists, each filled by `metricsAndROC` in the order
# [F1, precision, accuracy, recall].
mlr=[]
msvm=[]
mdnn=[]
mxgc=[]
mlstm=[]

def metricsAndROC(pred,metricsArray,rocTitle,colorIndex):
    """Record summary metrics for one model and add its ROC curve to figure 1.

    Args:
        pred: predictions for the test split, compared against the global
            `test_sentiments`.
        metricsArray: list that receives, in order, the F1, precision,
            accuracy and recall scores.
        rocTitle: legend label; it is passed through `str.format` with the AUC,
            so include a placeholder such as '{:.3f}' to display the AUC.
        colorIndex: index into the global `color` list (wraps around).

    NOTE(review): the DNN/LSTM/XGBoost inputs are predicted class labels rather
    than scores (LR/SVM presumably too), so each curve has a single interior
    point; pass probabilities or decision scores for a full ROC curve.
    """
    fpr, tpr, _ = metrics.roc_curve(test_sentiments, pred)
    auc = metrics.roc_auc_score(test_sentiments, pred)
    metricsArray.extend([
        metrics.f1_score(test_sentiments, pred),
        metrics.precision_score(test_sentiments, pred),
        metrics.accuracy_score(test_sentiments, pred),
        metrics.recall_score(test_sentiments, pred),
    ])

    plt.figure(1)
    plt.plot(fpr, tpr, color=color[colorIndex % len(color)], label=rocTitle.format(auc))


# Chance-level diagonal, drawn once for reference.
plt.plot([0, 1], [0, 1], 'k--')

# One curve per model, each with its own metrics list, colour and AUC label.
metricsAndROC(lr_predictions, mlr, 'LR (AUC = {:.3f})', 0)
metricsAndROC(svm_predictions, msvm, 'SVM (AUC = {:.3f})', 1)
metricsAndROC(dnn_predictions, mdnn, 'DNN (AUC = {:.3f})', 2)
metricsAndROC(xgc_predictions, mxgc, 'XGBoost (AUC = {:.3f})', 3)
metricsAndROC(lstm_predictions, mlstm, 'LSTM (AUC = {:.3f})', 4)


#show the roc curve now
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
# show the legend
plt.legend(loc='best')
plt.show()

# Metrics comparison

In [None]:
# Grouped bar chart: one group per metric, one bar per model.
pltLabels=['F1','PRECISION','ACCURACY','RECALL']
n_groups = len(pltLabels)
index = np.arange(n_groups)
bar_width = .1

# Metric lists collected in the ROC cell, paired with their legend labels.
model_metrics = [('LR', mlr), ('SVM', msvm), ('DNN', mdnn),
                 ('XGBoost', mxgc), ('LSTM', mlstm)]
# Only plot models whose four metrics were actually collected; a list of the
# wrong length would make plt.bar fail with a shape mismatch.
model_metrics = [(model_name, values) for model_name, values in model_metrics
                 if len(values) == n_groups]

for position, (model_name, values) in enumerate(model_metrics):
    plt.bar(index + position * bar_width, values, bar_width,
            color=color[position % len(color)], label=model_name)

plt.xlabel('Model Metrics', fontweight='bold')
plt.ylabel('Value')
# Centre each tick under its group of bars.
group_center = bar_width * (len(model_metrics) - 1) / 2
plt.xticks(index + group_center, pltLabels)

# Create legend & Show graphic
plt.legend(frameon=False,ncol=3, loc='lower left')
plt.show()
