In [1]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

In [7]:
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [9]:
from scipy import sparse

In [16]:
from sklearn import preprocessing

In [2]:
full_preprocessed_train = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'full_preprocessed_train.csv', sep=','))
full_preprocessed_val = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'full_preprocessed_dev.csv', sep=','))

full_preprocessed_train = full_preprocessed_train.replace(np.nan, " ")
full_preprocessed_val = full_preprocessed_val.replace(np.nan, " ")

In [3]:
y_training_data = full_preprocessed_train['label']
y_val_val = full_preprocessed_val['label']

# LDA

In [4]:
lda_train = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'lda_train.csv', sep=',', header=None))
lda_val = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'lda_val.csv', sep=',', header=None))

In [5]:
lda_train.shape

(250874, 20)

In [6]:
lda_val.shape

(35918, 20)

### LDA without Additional Features (aka just vectorizer)

In [8]:
LDA_model = BernoulliNB()
LDA_model.fit(lda_train, y_training_data)
y_pred = LDA_model.predict(lda_val)
print("LDA with out additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

LDA with out additional features
Accuracy: 0.898435324906732
F1: 0.8503697992239174


### LDA with Additional Features (aka preprocessed + vectorizer)

In [10]:
num_feats = full_preprocessed_train[['user_id', 'prod_id', 'rating',  'length',
       'numbers', 'caps', 'num_sent', 'avg_words', 'perc_tot_user_reviews',
       'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
       'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
       'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
       'WP', 'WP$', 'WRB']].values

In [11]:
val_num_feats = full_preprocessed_val[['user_id', 'prod_id', 'rating',  'length',
       'numbers', 'caps', 'num_sent', 'avg_words', 'perc_tot_user_reviews',
       'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
       'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
       'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
       'WP', 'WP$', 'WRB']].values

In [12]:
LDA_X_training_data = sparse.hstack((num_feats, lda_train))
LDA_X_val_data = sparse.hstack((val_num_feats, lda_val))

In [13]:
LDA_X_training_data

<250874x64 sparse matrix of type '<class 'numpy.float64'>'
	with 9660326 stored elements in COOrdinate format>

In [14]:
LDA_X_val_data

<35918x64 sparse matrix of type '<class 'numpy.float64'>'
	with 1382973 stored elements in COOrdinate format>

In [15]:
LDA_model = BernoulliNB()
LDA_model.fit(LDA_X_training_data, y_training_data)
y_pred = LDA_model.predict(LDA_X_val_data)
print("LDA with additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

LDA with additional features
Accuracy: 0.8189765577147948
F1: 0.8272471328069518


### LDA just vectorizer and normalized

In [17]:
x = lda_train.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
lda_train_N = pd.DataFrame(x_scaled)

x = lda_val.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
lda_val_N = pd.DataFrame(x_scaled)

In [18]:
LDA_model = BernoulliNB()
LDA_model.fit(lda_train_N, y_training_data)
y_pred = LDA_model.predict(lda_val_N)
print("LDA with out additional features and Normalized")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

LDA with out additional features and Normalized
Accuracy: 0.8983239601314105
F1: 0.8503142729508569


# Doc2Vec

In [19]:
doc2vec_train = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'dv_train.csv', sep=',', header=None))
doc2vec_val = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'dv_val.csv', sep=',', header=None))

In [20]:
doc2vec_train.shape

(250874, 100)

In [21]:
doc2vec_val.shape

(35918, 100)

### Doc2Vec without Additional Features (aka just vectorizer)

In [22]:
dv_model = BernoulliNB()
dv_model.fit(doc2vec_train, y_training_data)
y_pred = dv_model.predict(doc2vec_val)
print("DOC2VEC with out additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

DOC2VEC with out additional features
Accuracy: 0.8252129851328025
F1: 0.8277094500929553


### Doc2Vec with Additional Features (aka preprocessed + vectorizer)

In [23]:
DOC2VEC_X_training_data = sparse.hstack((num_feats, doc2vec_train))
DOC2VEC_X_val_data = sparse.hstack((val_num_feats, doc2vec_val))

In [24]:
DOC2VEC_X_training_data

<250874x144 sparse matrix of type '<class 'numpy.float64'>'
	with 29730246 stored elements in COOrdinate format>

In [25]:
DOC2VEC_X_val_data

<35918x144 sparse matrix of type '<class 'numpy.float64'>'
	with 4256413 stored elements in COOrdinate format>

In [26]:
dv_model = BernoulliNB()
dv_model.fit(DOC2VEC_X_training_data, y_training_data)
y_pred = dv_model.predict(DOC2VEC_X_val_data)
print("DOC2VEC with additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

DOC2VEC with additional features
Accuracy: 0.7603430035079904
F1: 0.7938734692738963


### Doc2Vec just vectorizer and normalized

In [27]:
x = doc2vec_train.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
doc2vec_train_N = pd.DataFrame(x_scaled)

x = doc2vec_val.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
doc2vec_val_N = pd.DataFrame(x_scaled)

In [28]:
dv_model = BernoulliNB()
dv_model.fit(doc2vec_train_N , y_training_data)
y_pred = dv_model.predict(doc2vec_val_N)
print("DOC2VEC with out additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

DOC2VEC with out additional features
Accuracy: 0.8981290717745977
F1: 0.8502170862949754


# TFIDF

In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

#TfidfVectorizer
binary_Tfidf_vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 2))
binary_Tfidf_vectorizer.fit(full_preprocessed_train['review'])

#stores data in sparse matrix
X_train_binary_Tfidf = binary_Tfidf_vectorizer.transform(full_preprocessed_train['review'])
X_val_binary_Tfidf = binary_Tfidf_vectorizer.transform(full_preprocessed_val['review'])

In [30]:
X_train_binary_Tfidf

<250874x4309860 sparse matrix of type '<class 'numpy.float64'>'
	with 27181974 stored elements in Compressed Sparse Row format>

In [31]:
X_val_binary_Tfidf

<35918x4309860 sparse matrix of type '<class 'numpy.float64'>'
	with 3451428 stored elements in Compressed Sparse Row format>

### TFIDF without Additional Features (aka just vectorizer)

In [32]:
tfidf_model = BernoulliNB()
tfidf_model.fit(X_train_binary_Tfidf, y_training_data)
y_pred = tfidf_model.predict(X_val_binary_Tfidf)
print("TFIDF with out additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

TFIDF with out additional features
Accuracy: 0.898435324906732
F1: 0.8503697992239174


### TFIDF with Additional Features (aka preprocessed + vectorizer)

In [33]:
Tfidf_X_training_data = sparse.hstack((num_feats, X_train_binary_Tfidf))
Tfidf_X_val_data = sparse.hstack((val_num_feats, X_val_binary_Tfidf ))

In [34]:
tfidf_model = BernoulliNB()
tfidf_model.fit(Tfidf_X_training_data, y_training_data)
y_pred = tfidf_model.predict(Tfidf_X_val_data)
print("TFIDF with additional features")
print("Accuracy:", accuracy_score(y_val_val, y_pred))
print("F1:", f1_score(y_val_val, y_pred, average='weighted'))

TFIDF with additional features
Accuracy: 0.898435324906732
F1: 0.8503697992239174


### TFIDF just vectorizer and normalized

MinMaxScaler does not support sparse input and crashed computer when converted to dense. 

# Imbalanced Classes Compare