In [1]:
#import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils
import gensim
import multiprocessing
import os
from time import time
# Number of CPUs reported by the OS.
# NOTE(review): `cores` is not referenced by any cell visible here — confirm it is still needed.
cores = multiprocessing.cpu_count()

In [None]:
# To notify when cell is complete (may need to comment out first line)
# !pip uninstall jupyternotify -y
!pip install git+https://github.com/cphyc/jupyter-notify.git
%reload_ext jupyternotify

In [3]:
# Read the cleaned CSV and keep a random subset of 15,000 reviews.
filepath = Path('../massive.csv')
massive = pd.read_csv(filepath)
# Seed the sampler so every run draws the same rows; an unseeded sample makes
# all downstream numbers change from run to run.
massive = massive.sample(n=15000, random_state=42)

<IPython.core.display.Javascript object>

In [4]:
# Show a single row to see which columns the sampled frame carries.
massive.head(1)

Unnamed: 0,id,title,audienceScore,tomatoMeter,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,delta
759017,1075984-iron_monkey,Iron Monkey,86.0,91.0,256086,2001-10-16,James Berardinelli,True,fresh,ReelViews,"[Yuen's] fights are innovative and intense, wh...",POSITIVE,5.0


In [4]:
# Remove identifier/metadata columns. audienceScore goes too, because it could
# unfairly help the model predict the delta; tomatoMeter stays so the model has
# a baseline for where the delta could fall.
massive = massive.drop(
    columns=[
        'id',
        'reviewId',
        'creationDate',
        'isTopCritic',
        'reviewState',
        'audienceScore',
    ]
)

In [5]:
# Count missing values per remaining column.
massive.isnull().sum()

title             0
tomatoMeter       0
criticName        0
publicatioName    0
reviewText        0
scoreSentiment    0
delta             0
dtype: int64

In [6]:
# Keep only the first row for each distinct review text.
massive = massive.drop_duplicates(subset=['reviewText'])
# For every column, count the values that repeat an earlier row.
dup_df = massive.apply(lambda column: column.duplicated()).sum()
dup_df

title              7267
tomatoMeter       14884
criticName        11902
publicatioName    13487
reviewText            0
scoreSentiment    14982
delta             14868
dtype: int64

In [18]:
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader

# Hub checkpoint shared by the tokenizer and the encoder, so the two cannot drift apart.
BERT_CHECKPOINT = "activebus/BERT_Review"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)


Some weights of the model checkpoint at activebus/BERT_Review were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# loaded_model = torch.load('../BERT_Review/pytorch_model.bin', map_location=torch.device('cpu'))

In [19]:
def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'; every other token is kept as is. The tokens are
    then re-joined with single spaces, so runs of spaces are preserved.
    """
    def _mask(token):
        # Mentions are checked first, matching the original replacement order.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def get_embedding(text):
    """Return one mean-pooled embedding vector for a single review.

    The text is normalised with ``preprocess``, tokenised with the
    module-level ``tokenizer``, and passed through the module-level ``model``.
    The first model output for the single input sequence is averaged over
    its first axis (one row per token).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    numpy.ndarray
        1-D array holding the token-wise mean of the model's first output.
    """
    text = preprocess(text)
    # Cap the input at the encoder's position limit when the config exposes one,
    # so an unusually long review is truncated instead of failing in the model.
    max_len = getattr(model.config, 'max_position_embeddings', None)
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_len,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        features = model(**encoded_input)
    # features[0] is the first model output; [0] selects the only sequence in the batch.
    token_states = features[0][0].numpy()
    return np.mean(token_states, axis=0)

# Embed every review; each entry of the new column holds one vector.
massive['embeddings'] = massive['reviewText'].map(get_embedding)
massive.head()

Unnamed: 0,title,tomatoMeter,criticName,publicatioName,reviewText,scoreSentiment,delta,embeddings
326995,Superman II,83.0,Jesús Fernández Santos,El Pais (Spain),"Like in each of these films, the best part are...",POSITIVE,7.0,"[-0.31772605, 0.5285138, 0.3135373, -0.4036947..."
584625,Chasing Amy,87.0,Radheyan Simonpillai,AskMen.com,The third installment of Kevin Smith's New Jer...,POSITIVE,4.0,"[-0.0777253, 0.18772165, 0.051908623, -0.30558..."
676483,Machuca,87.0,Jonathan Curiel,San Francisco Chronicle,It's a sensitively wrought work that reveals a...,POSITIVE,-4.0,"[-0.31736574, 0.39004192, -0.3942195, -0.11205..."
867120,A Sunday Affair,50.0,John Serba,Decider,A Sunday Affair is a decent technical achievem...,NEGATIVE,-4.0,"[-0.34689125, 0.61465055, 0.1722262, -0.104893..."
661243,Scary Stories to Tell in the Dark,77.0,Jason Fraley,"WTOP (Washington, D.C.)",A PG-13 adaptation that delivers intermittent ...,NEGATIVE,5.0,"[-0.41672167, 0.5564149, 0.0064520207, -0.0507..."


In [21]:
# Turn the column of per-review vectors into one float64 array.
vectors = np.asarray(massive['embeddings'].tolist(), dtype=np.float64)

# Features are the embeddings; the target is the delta column.
X = vectors
y = massive['delta']

In [None]:
# Creating a dataframe made of the vectorized review text for the linear model
# massive_features = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())

In [245]:
# Dropping review text now that vectorized words are all columns
# NOTE(review): `combined` is not created in any cell visible here, and this
# cell's execution count (245) is out of sequence with its neighbours. Confirm
# the cell that builds `combined` still exists before relying on Restart & Run All.
combined = combined.drop(columns=['reviewText'])

In [238]:
# Fill the nulls in all of the word columns with 0
# (fillna(0) is applied to the whole frame, so nulls in any other column are
# replaced with 0 as well.)
combined = combined.fillna(0)

In [None]:
# Spot-check the null counts of the first nine columns.
combined.isnull().sum().head(9)

title_            0
tomatoMeter       0
criticName        0
publicatioName    0
scoreSentiment    0
delta             0
abandon           0
ability           0
able              0
dtype: int64

In [None]:
# Critics with fewer than 16 reviews are relabelled 'Other' so that they are
# encoded as a single category later.
counts = combined['criticName'].value_counts()
threshold = combined['criticName'].isin(counts[counts < 16].index)
combined.loc[threshold, 'criticName'] = 'Other'
combined['criticName'].value_counts()

Other              8954
Dennis Schwartz     111
David Nusair         94
Roger Ebert          89
Roger Moore          81
                   ... 
Ella Taylor          16
Scott Foundas        16
Dann Gire            16
David Noh            16
Kim Newman           16
Name: criticName, Length: 214, dtype: int64

In [None]:
# Same treatment for publications: those with fewer than 12 reviews are
# relabelled 'Other'.
counts = combined['publicatioName'].value_counts()
threshold = combined['publicatioName'].isin(counts[counts < 12].index)
combined.loc[threshold, 'publicatioName'] = 'Other'
combined['publicatioName'].value_counts()

Other                 4148
New York Times         180
Variety                166
Los Angeles Times      138
Slant Magazine         134
                      ... 
The Ringer              12
Three Movie Buffs       12
Film Companion          12
Baltimore Sun           12
culturevulture.net      12
Name: publicatioName, Length: 333, dtype: int64

In [271]:
# Inspect one row of the feature table.
# NOTE(review): the saved output already contains one-hot columns, although the
# get_dummies cell appears further down — the cells were likely run out of order.
combined.head(1)

Unnamed: 0,tomatoMeter,delta,abandon,ability,able,abrams,absence,absolute,absolutely,absorbing,...,publicatioName_culturevulture.net,publicatioName_eFilmCritic.com,publicatioName_eye WEEKLY,publicatioName_film-authority.com,publicatioName_indieWire,publicatioName_jackiekcooper.com,publicatioName_rachelsreviews.net,publicatioName_rec.arts.movies.reviews,scoreSentiment_NEGATIVE,scoreSentiment_POSITIVE
328556,22.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# One-hot encode the categorical columns; get_dummies replaces each listed
# column with one indicator column per distinct value.
categorical_cols = ['title_', 'criticName', 'publicatioName', 'scoreSentiment']
combined = pd.get_dummies(data=combined, columns=categorical_cols)

In [273]:
# Features are every column except the target; the target is `delta`.
# NOTE(review): this rebinds X and y, which an earlier cell built from the BERT
# embeddings of `massive` — confirm which feature set the model is meant to use.
X = combined.drop(columns=['delta'])
y = combined['delta']

In [274]:
# Total number of missing values across the whole feature matrix.
X.isnull().sum().sum()

0

In [284]:
# First five target values.
y[:5]

328556   -10.0
477873    19.0
709010     9.0
340594     8.0
720402   -31.0
Name: delta, dtype: float64

In [292]:
# Split the data into training and testing sets
# 25% of the rows are held out for testing; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [293]:
# Total number of missing values in the training features.
X_train.isnull().sum().sum()

0

In [283]:
# Standardize the features. The scaler is fit on the training split only and
# then applied to both splits, so no test-set statistics are used for fitting.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [339]:
# Project the standardized features onto 2000 principal components.
# PCA is fit on the training split only; the test split is projected with the
# same components. random_state pins the randomized SVD solver that
# scikit-learn may select for data of this size, so re-runs give the same
# projection and the same downstream scores.
pca = PCA(n_components=2000, random_state=42)
X_train_p = pca.fit_transform(X_train_sc)
X_test_p = pca.transform(X_test_sc)

In [340]:
# Fit a linear regression (not a classifier) on the PCA-reduced training features.
# NOTE(review): this rebinds `model`, which earlier held the BERT encoder that
# get_embedding() calls; re-running the embedding cell after this one will fail.
model = LinearRegression()
model.fit(X_train_p, y_train)

LinearRegression()

In [341]:
# Score of the fitted regression on the training split (scikit-learn reports R^2 here).
model.score(X_train_p, y_train)

0.5509262446941581

In [342]:
# Same score on the held-out test split; compare with the training score to gauge overfitting.
model.score(X_test_p, y_test)

0.3442473356373843