# Import Necessary Dependencies

In [None]:
%%capture --no-display

## import operating system module
import os

## import module to measure the time
from time import time

## linear algebra package
import numpy as np

## data manipulation package
import pandas as pd

## visualizations packages
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
## the magic word for inline visualizations in Jupyter notebook
% matplotlib inline

## import module to process regular expressions
import re

## modules to split the data into training and testing sets
from sklearn.model_selection import StratifiedShuffleSplit

## package to upload and find the stopwords
from nltk.corpus import stopwords
## the list of stopwords to be used
stop = stopwords.words('english')

## tokenizer package
from nltk.tokenize import word_tokenize

## import stemmer
from nltk.stem.snowball import SnowballStemmer
## define a stemmer
stemmer = SnowballStemmer('english')

## modules to split the data into training and testing sets
from sklearn.model_selection import StratifiedShuffleSplit

## import the sentiment analyzer utility from nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## import utilities to evaluate metrics
from sklearn import metrics

## import deep learning libraries and packages
import tensorflow as tf
import keras
from keras import regularizers

## packages to work with word vectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## package that contains the Word2Vec embedding
import gensim

In [None]:
## import deep learning packages
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, Flatten
from keras.layers import Dropout
from keras.layers.normalization import BatchNormalization
from keras.initializers import Constant

# Upload the Data

In [None]:
## load the prepared reviews from the csv file into a Pandas dataframe
mobile = pd.read_csv(filepath_or_buffer='mobile_phones_prepared.csv')

## keep the loaded frame untouched and work on an independent copy
df = mobile.copy(deep=True)

## peek at the first two rows
df.head(n=2)

# Text Preprocessing Functions

In [None]:
## function that performs all the steps necessary for pre-processing of the reviews
def clean_data(revseries):
    """Turn a Series of raw review strings into a Series of stemmed token lists.

    Every review is lower-cased, reduced to letters only, filtered against the
    module-level `stop` list, tokenized with `word_tokenize` and stemmed with
    the module-level `stemmer`.

    Parameters
    ----------
    revseries : pandas.Series
        One review string per element.

    Returns
    -------
    pandas.Series
        One list of stemmed tokens per review.
    """
    ## build the stopword lookup once: membership in a set is constant time,
    ## while scanning the `stop` list for every word of every review is not
    stop_words = set(stop)

    def _clean_review(text):
        ## lower case, then replace every run of non-letters by a single space
        letters_only = re.sub("[^a-zA-Z]+", " ", text.lower())
        ## remove stopwords
        kept = ' '.join(word for word in letters_only.split() if word not in stop_words)
        ## tokenize and stem
        return [stemmer.stem(token) for token in word_tokenize(kept)]

    ## a single pass over the reviews instead of one pass per step
    return revseries.apply(_clean_review)

## rewrite the tokenized reviews as strings
def clean_strings(revseries):
    """Join every token list of `revseries` into one space-separated string.

    Parameters
    ----------
    revseries : pandas.Series
        Each element is an iterable of string tokens.

    Returns
    -------
    pandas.Series
        One string per element of `revseries`.
    """
    def _join_tokens(tokens):
        return ' '.join(tokens)

    return revseries.apply(_join_tokens)

# Vader Sentiment Analyzer in NLTK

In [None]:
## build the sentiment-analysis frame: a new frame without the unnecessary columns
## (drop returns a new dataframe, so `mobile` itself is left untouched)
sendata = mobile.drop(['name', 'brand', 'price', 'votes', 'revl'], axis=1)

## add a column with the cleaned reviews, joined back into strings
sendata['revs'] = sendata['review'].pipe(clean_data).pipe(clean_strings)

## check for success
sendata.head(n=2)

In [None]:
## the Vader analyzer shared by all calls below
analyser = SentimentIntensityAnalyzer()

## function that extracts the compound sentiment score
def sentimental_value(review):
    """Return the Vader 'compound' polarity score of `review`, rounded to 2 decimals."""
    scores = analyser.polarity_scores(review)
    return round(scores['compound'], 2)

In [None]:
## record the compound sentiment score of every cleaned review in a new column
sendata['sentiment'] = sendata['revs'].apply(sentimental_value)

## check for success on three random rows
sendata.sample(n=3)

In [None]:
## bin the sentiment scores into a new column: score >= 0 is positive (1), otherwise negative (0)
sendata['sen_bin'] = sendata['sentiment'].map(lambda score: int(score >= 0))

## check for success on three random rows
sendata.sample(n=3)

In [None]:
## compare the labels obtained by binning together the ratings ('feel') with those
## obtained by binning the sentiment analyzer compound scores ('sen_bin')

for label, scorer in (('Accuracy score: ', metrics.accuracy_score),
                      ('Precision score: ', metrics.precision_score),
                      ('Recall score: ', metrics.recall_score),
                      ('F1 score: ', metrics.f1_score),
                      ('AUC score: ', metrics.roc_auc_score)):
    print(label, format(round(scorer(sendata['feel'], sendata['sen_bin']), 2)))

# Split Data

The data is unbalanced. To deal with this issue, I will use the StratifiedShuffleSplit method, which keeps the class proportions the same in both sets. The data will be split into a training set (80% of the data) and a testing set (20% of the data). The features to be investigated are the review texts, while the labels are provided by the 'feel' column, which classifies each review as positive (1) or negative (0).

In [None]:
## features are the review texts, labels are the binary 'feel' column
X, y = df['review'], df['feel']

## stratified splitter producing 5 candidate splits, each with 80% training and 20% testing data
sss = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=5, random_state=42)
## this only displays the lazy split generator; nothing is split yet
sss.split(X, y)

In [None]:
## create the train and the test sets of features and labels
## the splitter yields n_splits candidate splits; as before, the last one is the one kept
train_index, test_index = list(sss.split(X, y))[-1]

## split() returns positional indices, so the rows are selected by position (.iloc);
## plain X[...] selects by label and only agrees with that for a default 0..n-1 index
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
## check for success: first two training features and labels
X_train.head(2), y_train.head(2)

# Tf-Idf Word Embedding 

In [None]:
## prepare the reviews for the supervised learner: clean them, then join the tokens back into strings
Xv_train, Xv_test = (clean_strings(clean_data(part)) for part in (X_train, X_test))

In [None]:
## set up the Tf-Idf vectorizer that will turn the cleaned reviews into a matrix
## of weighted term frequencies (fitted and applied in the following cells)

## import and set the vectorizer with its default settings
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [None]:
## learn the vocabulary on the train data and vectorize it
vectorized_train_data = vectorizer.fit_transform(Xv_train)

## get some information on the output
## newer scikit-learn exposes the vocabulary through get_feature_names_out();
## get_feature_names() is only used as a fallback for older versions
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print('Number of different words: {}'.format(len(feature_names)))
print('Vectorizer shape: {}'.format(vectorized_train_data.shape))

In [None]:
## vectorize the test data with the vocabulary learned on the train data
vectorized_test_data = vectorizer.transform(Xv_test)

## get some information on the output
## newer scikit-learn exposes the vocabulary through get_feature_names_out();
## get_feature_names() is only used as a fallback for older versions
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print('Number of different words: {}'.format(len(feature_names)))
print('Vectorizer shape: {}'.format(vectorized_test_data.shape))

# Benchmark Model: SVM

In [None]:
## import the supervised learning model from sklearn
from sklearn import svm
## initialize the classifier: support vector machine with a linear kernel
clf_svm = svm.SVC(kernel='linear', random_state=42)

## fit the classifier on the Tf-Idf training matrix and measure the training time
## (wall-clock difference between the two time() calls around fit)
start = time() 
clf_svm.fit(vectorized_train_data, y_train)
end=time()
time_fit=end-start
print('Training time is {} sec.'.format(round(time_fit,2)))

In [None]:
## make predictions on the Tf-Idf test matrix and store them in the variable 'pred'
## measure the prediction time as the wall-clock difference around predict

startp = time() 
pred = clf_svm.predict(vectorized_test_data)
endp=time()
time_pred=endp-startp
print('Prediction time is {} sec.'.format(round(time_pred,2)))

In [None]:
## evaluate the SVM predictions against the test labels
for label, scorer in (('Accuracy score: ', metrics.accuracy_score),
                      ('Precision score: ', metrics.precision_score),
                      ('Recall score: ', metrics.recall_score),
                      ('F1 score: ', metrics.f1_score),
                      ('AUC score: ', metrics.roc_auc_score)):
    print(label, format(round(scorer(y_test, pred), 2)))

### The confusion matrix for SVM 

The confusion matrix for the svm_classifier (according to this [thread](https://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels)): 

In [None]:
## build the confusion matrix of the SVM predictions (rows: true labels, columns: predictions)
cmatrix = metrics.confusion_matrix(y_test, pred)
print('The confusion matrix:\n', cmatrix, sep='\n')

In [None]:
## create the figure in which the matrix will be drawn
fig = plt.figure(figsize=(10, 10), dpi=50, linewidth=2, frameon=True)

## add the figure's details
labels=[0,1]
ax = fig.add_subplot(111)
cax = ax.matshow(cmatrix)
#plt.title('Confusion matrix of the svm classifier', fontsize=12)

fig.colorbar(cax)

## pin one tick to every row/column of the matrix before naming the ticks, so the
## labels do not depend on where matplotlib happens to place its automatic ticks
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels, fontsize=16)
ax.set_yticklabels(labels, fontsize=16)

plt.xlabel('Predicted Values', fontsize=16)
plt.ylabel('True Values', fontsize=16)

plt.show()

### ROC curve for SVM

In [None]:
## define the variables and compute the auc (area under curve)
## the curve is built from the continuous decision scores of the classifier:
## hard 0/1 predictions give a single operating point instead of a curve
svm_scores = clf_svm.decision_function(vectorized_test_data)
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, svm_scores)
roc_auc = metrics.auc(false_positive_rate, true_positive_rate)

## create the figure in which the curve will be drawn
fig = plt.figure(figsize=(10, 10), dpi=50, linewidth=2, frameon=True)

## create title
plt.title('Receiver Operating Characteristic Curve', fontsize=18)

## plot the curve
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)

## the dashed diagonal is the reference line of a random classifier
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')

## intervals, ticks and labels for the two axes
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=16)

plt.show()

# Word2Vec Embedding

In [None]:
## prepare the reviews for the unsupervised learner
## keep the cleaned reviews as token lists (no re-joining into strings)
Xu_train, Xu_test = (clean_data(part) for part in (X_train, X_test))

In [None]:
## convert the pre-processed reviews into plain lists of token lists
x_train_corpus = Xu_train.tolist()
x_test_corpus = Xu_test.tolist()

In [None]:
## parameters for the model
## embedding_dim: size of the Word2Vec vectors; the same value is also used as
## maxlen when the sequences are padded and as input_length of the Embedding layers
embedding_dim = 200 
## NOTE(review): max_length is not referenced anywhere else in the visible notebook
max_length = 1000

In [None]:
## train a Word2Vec model on the tokenized training reviews;
## words seen fewer than min_count times are left out of the vocabulary
model = gensim.models.Word2Vec(sentences=x_train_corpus,
                               size=embedding_dim,
                               window=4,
                               min_count=10,
                               workers=4)

## the vocabulary learned by the model
vocabulary = model.wv.vocab

## the list of words in the vocabulary
words = list(vocabulary)
print("Vocabulary size is: {}".format(len(words)))

In [None]:
## show the first ten words of the vocabulary
vocabulary_words = [word for word in model.wv.vocab.keys()]
print(vocabulary_words[0:10])

In [None]:
## test the Word2Vec model: display the words it considers most similar to 'good'
## NOTE(review): presumably 'good' survives the min_count filter; confirm on the data
model.wv.most_similar('good')

In [None]:
## save the word vectors of the model to a text (non-binary) file;
## this file is read back in the next section
model.wv.save_word2vec_format('reviews_word2vec.txt', binary=False)

# Prepare Word2Vec Embedding

In [None]:
## extract the word embeddings from the file
## save the embeddings as a dictionary: word -> vector of floats
embedded_reviews = {}

## 'with' closes the file even if a line fails to parse
with open('reviews_word2vec.txt', encoding='utf-8') as file:
    for line_number, line in enumerate(file):
        values = line.split()
        ## skip blank lines
        if not values:
            continue
        ## the word2vec text format starts with a "<vocabulary size> <dimension>"
        ## header line, which is not a word vector
        if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            continue
        word = values[0]
        ## store the components as numbers, not as the strings read from the file
        components = np.asarray(values[1:], dtype='float32')
        embedded_reviews[word] = components

Convert the tokenized reviews into sequences of integer word indices.

In [None]:
## define a tokenizer and build its word index from the training corpus only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_corpus)

## turn every review into a list of integers, one integer per known word
train_sequences, test_sequences = (tokenizer.texts_to_sequences(corpus)
                                   for corpus in (x_train_corpus, x_test_corpus))

In [None]:
## check for success: display the integer sequence of one training review (position 100)
train_sequences[100]

In [None]:
## the word index built by the tokenizer and its size
train_word_index = tokenizer.word_index
print('There are {} tokens.'.format(len(train_word_index)))

## pad (or truncate) the train and the test sequences to one common length
## NOTE(review): the common length is embedding_dim, not max_length; the Embedding
## layers below use the same value as input_length, so the two must change together
x_train_tensor = pad_sequences(train_sequences, maxlen=embedding_dim)
x_test_tensor = pad_sequences(test_sequences, maxlen=embedding_dim)

print('Shape of train matrix: {}'.format(x_train_tensor.shape))
print('Shape of test matrix: {}'.format(x_test_tensor.shape))

In [None]:
## map embeddings from Word2Vec model for each word to vocabulary
## and create a matrix with word vectors: row i holds the vector of the word with index i

## one extra row because the tokenizer's word indices start at 1 (row 0 stays all-zero)
num_words = len(train_word_index) + 1
embedding_weights = np.zeros((num_words, embedding_dim))

for word, i in train_word_index.items():
    ## look the word up in the Word2Vec embeddings (not in the tokenizer index,
    ## which would only return the integer i itself)
    vector = embedded_reviews.get(word)
    ## words without a Word2Vec vector (e.g. dropped by min_count) keep the all-zero row
    if vector is not None:
        embedding_weights[i] = np.asarray(vector, dtype='float32')

In [None]:
## the embedding matrix has (number of tokens + 1) x embedding_dim dimension
embedding_weights.shape

# Convolutional Neural Network

In [None]:
## the first CNN architecture

## embedding layer initialised with the prepared weight matrix and kept frozen
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=embedding_dim,
                            weights=[embedding_weights],
                            input_length=embedding_dim,
                            trainable=False)

## three convolution blocks with growing filter count and kernel size,
## followed by a single sigmoid unit for the binary label
model_first = Sequential([
    embedding_layer,

    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2, padding='same'),

    Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(0.2),

    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(0.2),

    Flatten(),
    Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)),
])

### print the architecture of the CNN
model_first.summary()

In [None]:
## plot a diagram of the first model and write it to 'model_first_plot.png',
## showing the layer names and the input/output shapes of every layer
from keras.utils.vis_utils import plot_model

plot_model(model_first, to_file='model_first_plot.png',
           show_shapes=True, show_layer_names=True)

In [None]:
## compile the first model: binary cross-entropy loss (single sigmoid output),
## Adam optimizer, accuracy reported as the metric
model_first.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

In [None]:
from time import time

## fit the first model for 4 epochs, holding out 10% of the training data for validation;
## the fit history is kept in model_first_data and the wall-clock training time is printed
start = time()
model_first_data= model_first.fit(x_train_tensor, y_train, batch_size=64, epochs=4, 
          validation_split=0.1, shuffle=True, verbose=2)
end = time()

duration = end-start
print('The first model trained in {} sec.'.format(duration))

In [None]:
## save the trained first CNN; at this point the bare name `model` still refers
## to the Word2Vec model, so the CNN has to be saved through `model_first`
model_first.save("model_first.h5py")

In [None]:
## evaluate the first model on the test set: the result is [loss, accuracy]
test_eval = model_first.evaluate(x_test_tensor, y_test, verbose=2)
for label, value in zip(('Test loss:', 'Test accuracy:'), test_eval):
    print(label, round(value, 2))

In [None]:
## learning curves of the first model
history = model_first_data.history

## depending on the Keras version the accuracy metric is recorded as 'acc' or as 'accuracy'
acc_key = 'acc' if 'acc' in history else 'accuracy'
accuracy = history[acc_key]
val_accuracy = history['val_' + acc_key]
loss = history['loss']
val_loss = history['val_loss']

## one point per epoch: accuracy first, then the loss in a second figure
epochs = range(len(accuracy))
plt.plot(epochs, accuracy, 'bo', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'm', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'm', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
## the updated CNN model
## (from here on the name `model` refers to this network, no longer to the Word2Vec model)

## embedding layer initialised with the prepared weight matrix and kept frozen
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=embedding_dim,
                            weights=[embedding_weights],
                            input_length=embedding_dim,
                            trainable=False)

## five convolution + pooling stages with 100 filters each and kernel sizes 2 to 6,
## then dropout, a dense hidden layer and a single sigmoid unit for the binary label
model = Sequential([
    embedding_layer,

    Conv1D(filters=100, kernel_size=2, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=100, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=3),

    Conv1D(filters=100, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=3),

    Conv1D(filters=100, kernel_size=5, padding='same', activation='relu'),
    MaxPooling1D(pool_size=3),

    Conv1D(filters=100, kernel_size=6, padding='same', activation='relu'),
    MaxPooling1D(pool_size=3),

    Dropout(0.5),
    Flatten(),

    Dense(100, activation='relu'),
    Dropout(0.5),

    Dense(1, activation='sigmoid'),
])

### print the architecture of the CNN
model.summary()

In [None]:
## utility that draws a model as a graph of layers
from keras.utils.vis_utils import plot_model

## plot a diagram of the updated model and write it to 'model_plot.png',
## showing the layer names and the input/output shapes of every layer
plot_model(model, to_file='model_plot.png',
           show_shapes=True, show_layer_names=True)

In [None]:
## compile the updated model: binary cross-entropy loss (single sigmoid output),
## Adam optimizer, accuracy reported as the metric
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

In [None]:
## fit the updated model for 10 epochs, holding out 10% of the training data for validation;
## the fit history is kept in model_data and the wall-clock training time is printed
start = time()
model_data = model.fit(x_train_tensor, y_train, batch_size=50, epochs=10, 
          validation_split=0.1, shuffle=True, verbose=2)
end = time()

duration = end-start
print('The model trained in {} sec.'.format(duration))

In [None]:
## save the trained updated CNN (here `model` is the Sequential network defined above)
model.save("mymodel.h5py")

In [None]:
## evaluate the updated model on the test set: the result is [loss, accuracy]
test_eval_m = model.evaluate(x_test_tensor, y_test, verbose=2)
for label, value in zip(('Test loss:', 'Test accuracy:'), test_eval_m):
    print(label, round(value, 2))

In [None]:
## learning curves of the updated model
history = model_data.history

## depending on the Keras version the accuracy metric is recorded as 'acc' or as 'accuracy'
acc_key = 'acc' if 'acc' in history else 'accuracy'
accuracy = history[acc_key]
val_accuracy = history['val_' + acc_key]
loss = history['loss']
val_loss = history['val_loss']

## one point per epoch: accuracy first, then the loss in a second figure
epochs = range(len(accuracy))
plt.plot(epochs, accuracy, 'bo', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'm', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'm', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()