# Rotten Tomatoes Movies Dataset

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import spacy
from pathlib import Path
from matplotlib import pyplot
import numpy as np
import pickle
%matplotlib inline

DATA_PATH=Path('../data')
CLAS_PATH=Path('../data/movies_clas/')
TMP_PATH=Path(CLAS_PATH/'tmp/')
CLAS_PATH.mkdir(exist_ok=True)
TMP_PATH.mkdir(exist_ok=True)

In [2]:
data_train = pd.read_csv(CLAS_PATH/'train.csv', header=None)
data_val = pd.read_csv(CLAS_PATH/'val.csv', header=None)
data_test = pd.read_csv(CLAS_PATH/'test.csv', header=None)
data = pd.concat([data_train, data_val, data_test])
data.columns = ['label', 'body_text']
data['body_text'] = data['body_text'].apply(str)

In [3]:
nlp = spacy.load('en')
def clean_text(text):
#     text = "".join([word.lower() for word in text if word not in string.punctuation])
#     text = "".join([word.lower() for word in text])
    text = nlp(text)
    text = [word.lemma_ for word in text]
    return text

In [4]:
print(clean_text('It was spectacular. I am glad that I had the fish'))

['-PRON-', 'be', 'spectacular', '.', '-PRON-', 'be', 'glad', 'that', '-PRON-', 'have', 'the', 'fish']


## Feature transformation and regularization

## Train/Test Split

In [5]:
len(data)

68000

In [6]:
neg_data = data[data['label']==0]
pos_data = data[data['label']==1]

neg_data = neg_data.sample(n=10000, random_state=10)
pos_data = pos_data.sample(n=10000, random_state=10)

from sklearn.model_selection import train_test_split

data = pd.concat([neg_data, pos_data])

X_train, X_test, y_train, y_test = train_test_split(data['body_text'], data['label'], test_size=0.2, random_state=10, stratify=data['label'])

print('Training set: ')
print(y_train.value_counts())
print('')
print('Testing set')
print(y_test.value_counts())

Training set: 
1    8000
0    8000
Name: label, dtype: int64

Testing set
1    2000
0    2000
Name: label, dtype: int64


## Vectorizing Text

In [7]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text, ngram_range=(1, 3), max_features=50000, max_df=0.99, min_df=1)
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())

X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30115,30116,30117,30118,30119,30120,30121,30122,30123,30124
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
tfidf_vect.vocabulary_

{'poorly': 20708,
 'write': 29667,
 ',': 25,
 'drag': 8576,
 'storyline': 25488,
 'and': 2123,
 'just': 14995,
 'two': 27606,
 'action': 1374,
 'sequence': 23816,
 'to': 26995,
 'impress': 13716,
 '.': 118,
 'the': 26599,
 'rest': 22469,
 'be': 3394,
 'a': 1098,
 'couple': 6601,
 'impressive': 13720,
 'hand': 12503,
 'combat': 5869,
 'with': 29466,
 'donnie': 8472,
 'yen': 29812,
 '...': 129,
 'that': 26576,
 '-PRON-': 101,
 'see': 23623,
 'in': 13749,
 'so': 24687,
 'many': 16652,
 'other': 19353,
 'film': 10677,
 'very': 28535,
 'heart': 12737,
 '-': 88,
 'feel': 10533,
 'entertaining': 9496,
 'engage': 9400,
 'at': 2774,
 'first': 10826,
 'glance': 11855,
 'may': 16871,
 'seem': 23638,
 'like': 15950,
 'an': 2094,
 'ordinary': 19275,
 'movie': 17893,
 'about': 1187,
 'extraordinary': 10165,
 'boy': 4160,
 'but': 4550,
 'much': 18033,
 'broad': 4350,
 'range': 21767,
 'of': 18971,
 'character': 5165,
 'detail': 7770,
 'life': 15895,
 'circumstance': 5505,
 'all': 1822,
 'culminate': 

## Modelling

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
import time

### Random Forest Classifier (RF)

In [16]:
rf = RandomForestClassifier(n_estimators=400, max_depth=None, n_jobs=-1, random_state=10)

In [17]:
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_rf = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 580.97 seconds
Prediction done in 2.026 seconds


In [18]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_rf, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_rf==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.85      0.88      0.86      2000
   positive       0.88      0.84      0.86      2000

avg / total       0.86      0.86      0.86      4000

accuracy: 0.86
ROC AUC Score : 0.86225
F1 Score : 0.8597607533723594


### Support Vector Machine Classifier (SVM)

In [10]:
clf = svm.LinearSVC(random_state=10)

In [11]:
start = time.time()
clf_fit = clf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_svm = clf_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))




Training done in 7.17 seconds
Prediction done in 1.274 seconds


In [12]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_svm, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_svm==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_svm))
print("f1 Score :", f1_score(y_test, y_pred_svm))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.88      0.87      0.87      2000
   positive       0.87      0.88      0.87      2000

avg / total       0.87      0.87      0.87      4000

accuracy: 0.87
ROC AUC Score : 0.872
f1 Score : 0.8727001491795127


### Naive Bayes Classifier (NB)

In [13]:
nbc = MultinomialNB()

In [14]:
start = time.time()
nbc_fit = nbc.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_nb = nbc_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 8.882 seconds
Prediction done in 1.772 seconds


In [15]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_nb, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_nb==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_nb))
print("F1 Score :", f1_score(y_test, y_pred_nb))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.86      0.89      0.87      2000
   positive       0.89      0.85      0.87      2000

avg / total       0.87      0.87      0.87      4000

accuracy: 0.87
ROC AUC Score : 0.8722500000000001
F1 Score : 0.869543017615522
