In [33]:
#import functions / classes from helper and text_pipeline script 
from helpers import prepare_df, calc_scores, check_class_balance, downsample, hard
from text_pipeline import TextPrep

#all appropriate imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
import pickle 
import tensorflow as tf


In [6]:
#read the raw Yelp reviews CSV (its first column is used as the index) and pass it
#straight to the imported prepare_df helper, which cleans the dataframe and builds
#the three classes (ranked 1-3) used throughout the rest of the notebook
reviews = prepare_df(pd.read_csv('../data/yelp.csv', index_col=0))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
#next, split the entire dataframe into train and holdout
#random_state is fixed so that the split -- and therefore the class counts and every
#downstream score -- is identical after a kernel restart and "Run All"
df_train, df_holdout = train_test_split(reviews, random_state=42)
#use the imported check_class_balance function to report how many rows fall in each class
labels = [1, 2, 3]
check_class_balance(df_train, 'stars', labels=labels)
#in the recorded output class 3 ('Good') greatly outnumbers the other two

{'total': 118818, 1: 19705, 2: 19771, 3: 79342}

In [15]:
#balance the training frame on the 'stars' column with the imported downsample function
#NOTE(review): presumably this undersamples the larger classes down to the size of the
#smallest one (the recorded output shows 19705 rows per class) -- confirm in helpers,
#and check whether its sampling is seeded
df_train = downsample(df_train, 'stars')
#sanity check: every class should now report the same count
check_class_balance(df_train, 'stars', labels=labels)

{'total': 59115, 1: 19705, 2: 19705, 3: 19705}

In [20]:
#instantiate the imported TextPrep class, which sets up an NLP pipeline for a dataframe
#and provides a vectorize method that takes a column name and a vectorization method
pipeline = TextPrep(df_train)
#vectorize the 'text' column with the 'token' method and keep the result as X
X = pipeline.vectorize('text', method='token')
#the labels are the 'stars' column of the same training frame
y = df_train['stars'].values
#the pipeline also has a documents attribute which holds a list of the raw documents

In [26]:
#create another pipeline for the holdout set, which has NOT been undersampled
#NOTE(review): this builds a second TextPrep from df_holdout rather than reusing the
#training pipeline; if vectorize fits a vocabulary/tokenizer on the frame it was given,
#the holdout token ids would not match the ones the model was trained on -- confirm
#in text_pipeline
pipeline_holdout = TextPrep(df_holdout)
X_holdout = pipeline_holdout.vectorize('text', method='token')
#labels for the holdout rows
y_holdout = df_holdout['stars'].values


In [27]:
#now split the vectorized training data into train and test portions
#random_state is fixed so the same rows land in each portion on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [28]:
#first: establish a baseline, i.e. the scores obtained if every prediction were random
#(each of the three classes drawn with equal probability)
#a seeded generator keeps the baseline scores reproducible across runs, and drawing all
#len(y) labels in a single call replaces the element-by-element Python loop
rng = np.random.default_rng(42)
baseline = rng.choice([1, 2, 3], size=len(y))
#then use the imported calc_scores function, which returns a list of precision,
#accuracy, and recall scores
baseline_scores = calc_scores(baseline, y)

Precision : 0.33340099805463924 
Accuracy : 0.33340099805463924 
Recall: 0.33340099805463924


In [30]:
#next load in the lstm model. It has already been trained with the following architecture:
# model = Sequential()
# model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_lstm.shape[1]))
# model.add(SpatialDropout1D(0.2))
# model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(3, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
#load the previously trained LSTM model from disk through Keras
#NOTE(review): the '.pkl' extension is unusual here -- load_model expects a Keras /
#SavedModel / HDF5 artifact rather than a pickle file; confirm how 'lstm_model.pkl'
#was saved
#NOTE(review): the path is relative to the notebook's working directory, unlike the
#'../data/' location used for the CSV -- confirm the file lives next to the notebook
model = tf.keras.models.load_model('lstm_model.pkl')