In [1]:
#import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
#from gensim.models import Word2Vec
#from gensim.test.utils import datapath
#from gensim import utils
#import gensim
import multiprocessing
cores = multiprocessing.cpu_count()
import os
from time import time
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader

In [None]:
# To notify when cell is complete (comment out first line). Add %%notify to the first line of a cell to get notified
# !pip uninstall jupyternotify -y
!pip install git+https://github.com/cphyc/jupyter-notify.git
%reload_ext jupyternotify

In [3]:
# Read in the cleaned CSV
filepath = Path('../massive.csv')
massive = pd.read_csv(filepath)
# Work on a random subset of at most 15000 rows.
# - random_state makes the subset (and everything downstream) reproducible
#   on Restart & Run All.
# - min(...) keeps the cell from raising when the file has fewer rows.
massive = (
    massive
    .sample(n=min(15000, len(massive)), random_state=42)
    .reset_index(drop=True)
)

## Pre-processing

In [4]:
# Remove identifier/metadata columns. audienceScore goes too, because it could
# unfairly help the model predict the delta; tomatoMeter stays to give the
# model a baseline for where the delta could be.
unused_columns = ['id', 'reviewId', 'creationDate', 'isTopCritic', 'reviewState', 'audienceScore']
massive = massive.drop(unused_columns, axis=1)
massive.head(1)

Unnamed: 0,title,tomatoMeter,criticName,publicatioName,reviewText,scoreSentiment,delta
0,ClownTown,9.0,Roger Moore,Movie Nation,"Deathly-dull, but could this no-budget horror ...",NEGATIVE,-19.0


In [5]:
# Bin the delta column into 5 ordered categories (0 = most negative, 4 = most positive).
# pd.cut builds right-closed intervals, so with outer edges of -99/99 a delta of
# exactly -99 (or anything beyond +/-99) fell outside every bin and became NaN,
# which .cat.codes later encodes as -1. Widening the outer edges to +/-100 and
# including the lowest edge keeps every row in a real class.
# NOTE(review): assumes delta is the difference of two 0-100 scores - confirm.
bins = [-100, -26, -11, 10, 25, 100]
labels = [0, 1, 2, 3, 4]
massive['delta_category'] = pd.cut(
    massive['delta'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [6]:
# Missing-value count per column
massive.isna().sum()

title             0
tomatoMeter       0
criticName        0
publicatioName    0
reviewText        0
scoreSentiment    0
delta             0
delta_category    0
dtype: int64

In [7]:
# Keep only the first occurrence of each review text, then renumber rows from 0
massive = (
    massive
    .drop_duplicates(subset='reviewText', keep='first')
    .reset_index(drop=True)
)
# Per-column count of repeated values (reviewText should now report 0)
dup_df = massive.apply(pd.Series.duplicated).sum()
dup_df

title              7215
tomatoMeter       14882
criticName        11916
publicatioName    13485
reviewText            0
scoreSentiment    14980
delta             14860
delta_category    14977
dtype: int64

## BERT Vectorizer

In [8]:
# Load the tokenizer and the encoder from the same Hugging Face checkpoint
checkpoint = "activebus/BERT_Review"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at activebus/BERT_Review were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
%%notify
# Text preprocessing function
def preprocess(text):
    """Replace user mentions and links in ``text`` with placeholder tokens.

    The text is split on single spaces. A token longer than one character that
    starts with '@' becomes '@user'; a token that starts with 'http' becomes
    'http'. Every other token is kept, and the tokens are re-joined with
    single spaces.
    """
    def _mask(token):
        # A lone '@' is not a mention and is left untouched
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
# Function to vectorize text with BERT
def get_embedding(text):
    """Return a single mean-pooled embedding vector for ``text``.

    The text is normalized with ``preprocess``, tokenized with the module-level
    ``tokenizer`` and passed through the module-level ``model``. The token
    vectors of the first model output are averaged into one 1-D numpy array.

    Inputs longer than the encoder's position-embedding limit are truncated,
    because the model cannot process longer sequences.
    """
    text = preprocess(text)
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        features = model(**encoded_input)
    # A single text is encoded per call, so take the first (only) sequence
    # and average over its tokens
    token_vectors = features[0][0].numpy()
    features_mean = np.mean(token_vectors, axis=0)
    return features_mean
# Embed every review; each row's vector is stored in the new embeddings column
massive['embeddings'] = massive['reviewText'].map(get_embedding)
massive.head()

Unnamed: 0,title,tomatoMeter,criticName,publicatioName,reviewText,scoreSentiment,delta,delta_category,embeddings
0,ClownTown,9.0,Roger Moore,Movie Nation,"Deathly-dull, but could this no-budget horror ...",NEGATIVE,-19.0,1,"[0.08950369, 0.5656611, 0.2512408, 0.053721197..."
1,Maleficent,54.0,Ann Hornaday,Washington Post,"For all its limitations, ""Maleficent"" manages ...",POSITIVE,-16.0,1,"[-0.3613663, 0.5549257, 0.020555448, -0.069974..."
2,Cosmopolis,66.0,Gary Thompson,Philadelphia Daily News,"Here we have a brainy, sometimes impenetrable ...",NEGATIVE,35.0,4,"[-0.33737096, 0.4118671, 0.16659085, -0.024679..."
3,Frank Miller's Sin City: A Dame to Kill For,43.0,Jennifer Heaton,Alternative Lens,It often feels like the table scraps of the fi...,NEGATIVE,-1.0,2,"[-0.2537944, 0.2904615, 0.081751816, -0.033366..."
4,Hostiles,70.0,Cary Darling,Houston Chronicle,"Along with another 2017 film, ""Wind River,"" ""H...",POSITIVE,-5.0,2,"[-0.37430876, 0.44088465, -0.19314666, -0.1849..."


<IPython.core.display.Javascript object>

In [10]:
# Stack the per-review embedding vectors into one 2-D float array
vectors = np.asarray(list(massive['embeddings']), dtype='float')
vectors[:10]

array([[ 0.08950369,  0.56566107,  0.25124079, ..., -0.22535168,
         0.4612647 , -0.28839523],
       [-0.3613663 ,  0.55492568,  0.02055545, ..., -0.05430287,
         0.3483009 ,  0.34693852],
       [-0.33737096,  0.41186711,  0.16659085, ..., -0.39889804,
         0.33647817,  0.14008528],
       ...,
       [ 0.0393588 ,  0.79363269, -0.12832816, ..., -0.18065843,
         0.3424049 ,  0.22751403],
       [-0.11100719,  0.31592789,  0.00539729, ..., -0.16562848,
         0.29397234, -0.53371745],
       [-0.37595558,  0.35926038, -0.04925755, ..., -0.299943  ,
         0.29099593,  0.09980157]])

In [11]:
# Turn the embedding matrix into a dataframe with one named column per dimension (col1..colN)
embedding_cols = [f'col{k}' for k in range(1, vectors.shape[1] + 1)]
v_df = pd.DataFrame(vectors, columns=embedding_cols)
v_df.head(3)

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,...,col759,col760,col761,col762,col763,col764,col765,col766,col767,col768
0,0.089504,0.565661,0.251241,0.053721,0.242632,-0.06203,0.034294,0.557072,-0.226973,0.128724,...,-0.03486,-0.283966,-0.049585,0.067217,-0.17056,0.059303,-0.018557,-0.225352,0.461265,-0.288395
1,-0.361366,0.554926,0.020555,-0.069974,0.005973,0.076818,0.208793,0.45562,-0.131073,-0.084887,...,-0.101248,-0.404666,0.158484,-0.05587,-0.132956,-0.038175,0.4524,-0.054303,0.348301,0.346939
2,-0.337371,0.411867,0.166591,-0.02468,0.375604,0.084084,-0.157392,0.503688,-0.128764,-0.1752,...,-0.059233,-0.332006,0.078353,-0.067937,-0.362091,-0.048901,-0.014837,-0.398898,0.336478,0.140085


In [12]:
# reviewText is now vectorized and embeddings has been expanded into v_df;
# delta is dropped as well, since its binned form delta_category is the target
massive = massive.drop(['reviewText', 'embeddings', 'delta'], axis=1)

In [13]:
# Missing-value count for the first (up to) 9 columns
massive.isna().sum().iloc[:9]

title             0
tomatoMeter       0
criticName        0
publicatioName    0
scoreSentiment    0
delta_category    0
dtype: int64

## Encoding categorical columns

In [14]:
# Critics with fewer than 16 reviews are relabelled 'Other' so that they are
# later encoded as one and the same critic
counts = massive['criticName'].value_counts()
rare_critics = counts[counts < 16].index
threshold = massive['criticName'].isin(rare_critics)
massive['criticName'] = massive['criticName'].mask(threshold, 'Other')
massive['criticName'].value_counts()

Other                  8936
Dennis Schwartz          98
Roger Ebert              88
Jeffrey M. Anderson      85
Roger Moore              79
                       ... 
James Rocchi             16
John Serba               16
Philip Martin            16
JoBlo                    16
Jon Popick               16
Name: criticName, Length: 217, dtype: int64

In [15]:
# Same treatment for publications: those with fewer than 12 reviews become 'Other'
counts = massive['publicatioName'].value_counts()
rare_publications = counts[counts < 12].index
threshold = massive['publicatioName'].isin(rare_publications)
massive['publicatioName'] = massive['publicatioName'].mask(threshold, 'Other')
massive['publicatioName'].value_counts()

Other                 4088
New York Times         212
Variety                192
Los Angeles Times      138
Hollywood Reporter     133
                      ... 
Columbus Alive          12
FilmStew.com            12
Three Movie Buffs       12
rachelsreviews.net      12
Black Girl Nerds        12
Name: publicatioName, Length: 337, dtype: int64

In [16]:
# Columns to one-hot encode
categorical_cols = ['title', 'criticName', 'publicatioName', 'scoreSentiment']
# Replace each listed column with its indicator (dummy) columns
massive = pd.get_dummies(data=massive, columns=categorical_cols)

## Final pre-processing, splitting, scaling, PCA

In [17]:
# Put the encoded dataframe and the embedding dataframe side by side
# (column-wise concat aligns rows on the index)
combined = pd.concat((massive, v_df), axis='columns')
combined.head(1)

Unnamed: 0,tomatoMeter,delta_category,title_$9.99,title_'71,title_(500) Days of Summer,title_... And They Lived Happily Ever After,title_...And Justice for All,title_1 Day,title_10,title_10 Cloverfield Lane,...,col759,col760,col761,col762,col763,col764,col765,col766,col767,col768
0,9.0,1,0,0,0,0,0,0,0,0,...,-0.03486,-0.283966,-0.049585,0.067217,-0.17056,0.059303,-0.018557,-0.225352,0.461265,-0.288395


In [18]:
# Features: every column except the binned target
X = combined.drop('delta_category', axis=1)
# Target: the Category dtype converted to its integer codes
y = combined['delta_category'].cat.codes

In [19]:
# Total number of missing values across the whole feature matrix
X.isna().sum().sum()

0

In [20]:
# First four target codes
y.iloc[:4]

0    1
1    1
2    4
3    2
dtype: int8

In [21]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [22]:
%%notify
# Standardize X: the scaler is fitted on the training split only and then
# applied to both splits
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

<IPython.core.display.Javascript object>

In [23]:
%%notify
# PCA for dimensionality reduction: projects the scaled features onto their
# leading principal components (it does not select original features).
# - n_components may not exceed min(n_samples, n_features), so the requested
#   6000 is capped to keep the cell runnable on smaller samples.
# - random_state makes the result reproducible when PCA picks its randomized
#   SVD solver.
n_components = min(6000, *X_train_sc.shape)
pca = PCA(n_components=n_components, random_state=42)
X_train_p = pca.fit_transform(X_train_sc)
X_test_p = pca.transform(X_test_sc)

<IPython.core.display.Javascript object>

## Gradient Boosted Tree Classifier

In [None]:
%%notify
# NOTE(review): g_cores is not used in this cell; presumably it is meant for a
# later cell - confirm. It is <= 0 on machines with 14 or fewer cores.
g_cores = cores-14
# Series.ravel() is deprecated in recent pandas; convert the targets to 1-D
# numpy arrays once instead of on every fit/score call.
y_train_arr = y_train.to_numpy()
y_test_arr = y_test.to_numpy()
# Sweep the learning rate with all other hyper-parameters fixed and report
# train and held-out accuracy for each setting
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    GBmodel = GradientBoostingClassifier(n_estimators=100,
                                         learning_rate=learning_rate,
                                         max_depth=3,
                                         random_state=69)
    GBmodel.fit(X_train_p, y_train_arr)
    print("Learning rate: ", learning_rate)
    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        GBmodel.score(X_train_p, y_train_arr)))
    print("Accuracy score (validation): {0:.3f}".format(
        GBmodel.score(X_test_p, y_test_arr)))
    print()