In [1]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu

np.set_printoptions(precision=2, linewidth=80)

##  Load and normalize data

In [2]:
dataset = pd.read_csv('Movie_review.csv')
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


## Traditional Supervised Machine Learning Models

## Feature Engineering

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)
# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

In [4]:
# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [5]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (35000, 2117227)  Test features shape: (15000, 2117227)
TFIDF model:> Train features shape: (35000, 2117227)  Test features shape: (15000, 2117227)


## Model Training, Prediction and Performance Evaluation

In [6]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
lr = LogisticRegression(penalty='l2', max_iter=100, C=1)
svm = SGDClassifier(loss='hinge', max_iter=100)

In [7]:
lr_bow_predictions = meu.train_predict_model(classifier=lr, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
lr_bow_predictions = ['positive' if val == 1 else 'negative'for val in lr_bow_predictions]
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,
                                      classes=['positive', 'negative'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Model Performance metrics:
------------------------------
Accuracy : 90.59%
Precision : 90.41%
Recall : 90.84%
F1 Score : 90.62%

Model Classification reports:
------------------------------
              precision    recall  f1-score   support

    negative       0.91      0.90      0.91      7490
    positive       0.90      0.91      0.91      7510

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000

Prediction Confusion Matrix : 
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6822      688
        negative        724     6766


In [8]:
lr_tfidf_predictions = meu.train_predict_model(classifier=lr, 
                                               train_features=tv_train_features, train_labels=train_sentiments,
                                               test_features=tv_test_features, test_labels=test_sentiments)
lr_tfidf_predictions = ['positive' if val == 1 else 'negative'for val in lr_tfidf_predictions]
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_tfidf_predictions,
                                      classes=['positive', 'negative'])


Model Performance metrics:
------------------------------
Accuracy : 89.39%
Precision : 88.82%
Recall : 90.16%
F1 Score : 89.49%

Model Classification reports:
------------------------------
              precision    recall  f1-score   support

    negative       0.90      0.89      0.89      7490
    positive       0.89      0.90      0.89      7510

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

Prediction Confusion Matrix : 
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6771      739
        negative        852     6638


In [9]:
svm_bow_predictions = meu.train_predict_model(classifier=svm, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
svm_bow_predictions = ['positive' if val == 1 else 'negative'for val in svm_bow_predictions]
meu.display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_bow_predictions,
                                      classes=['positive', 'negative'])


Model Performance metrics:
------------------------------
Accuracy : 89.76%
Precision : 91.01%
Recall : 88.27%
F1 Score : 89.62%

Model Classification reports:
------------------------------
              precision    recall  f1-score   support

    negative       0.89      0.91      0.90      7490
    positive       0.91      0.88      0.90      7510

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

Prediction Confusion Matrix : 
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6629      881
        negative        655     6835


In [10]:
svm_tfidf_predictions = meu.train_predict_model(classifier=svm, 
                                                train_features=tv_train_features, train_labels=train_sentiments,
                                                test_features=tv_test_features, test_labels=test_sentiments)
svm_tfidf_predictions = ['positive' if val == 1 else 'negative'for val in svm_tfidf_predictions]
meu.display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_tfidf_predictions,
                                      classes=['positive', 'negative'])


Model Performance metrics:
------------------------------
Accuracy : 89.73%
Precision : 88.56%
Recall : 91.29%
F1 Score : 89.90%

Model Classification reports:
------------------------------
              precision    recall  f1-score   support

    negative       0.91      0.88      0.90      7490
    positive       0.89      0.91      0.90      7510

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

Prediction Confusion Matrix : 
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6856      654
        negative        886     6604
