## BOW - Bag of Words

In [None]:
# Imports
import pandas as pd

from xgboost import XGBRegressor

from sklearn import feature_extraction, naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import pipeline
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import KFold, cross_val_predict, StratifiedKFold

In [None]:
# Load the filtered reviews; the column labels are supplied here through
# `names` instead of being taken from the file.
# NOTE(review): if the CSV carries its own header row, that row would be
# read as a data record - confirm the file is header-less.
data = pd.read_csv(
    filepath_or_buffer="data/filtered_reviews.csv",
    names=["Review Text", "Stars", "Polarity"],
)
data

### Create BOW Vectorizer

In [None]:
# Creating a BOW vectorizer. The object built here is a CountVectorizer with
# its default settings, not a TF-IDF vectorizer.
# NOTE(review): this comment used to describe the vectorizer as Tf_idf; if
# TF-IDF weighting was the intent, confirm and switch the vectorizer class.
vectorizer = feature_extraction.text.CountVectorizer()

In [None]:
# Fit the vectorizer on every review text (cast to unicode strings first).
# The fit call is the cell's last expression, so the notebook displays its result.
corpus = data["Review Text"].values.astype('U')
vectorizer.fit(raw_documents=corpus)

### Create Classifiers

In [None]:
# One estimator per model family being compared.
# The forest, tree and boosting estimators use random numbers while fitting,
# so each gets a fixed seed; without it their evaluation reports change from
# one run of the notebook to the next.
SVM_classifier = svm.SVC()
NB_classifier = naive_bayes.MultinomialNB()
RF_classifier = RandomForestClassifier(random_state=42)
DT_classifier = DecisionTreeClassifier(random_state=42)
AB_classifier = AdaBoostClassifier(random_state=42)
# NOTE(review): disabled; XGBRegressor is a regression estimator, so a
# classifier class would be needed before enabling this - confirm intent.
# XGB_classifier = XGBRegressor()

### Create model pipelines

In [None]:
# Each model is a two-step pipeline: text vectorization followed by a
# classifier. Every pipeline is handed the same `vectorizer` object.
NB_model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", NB_classifier),
])
SVM_model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", SVM_classifier),
])
RF_model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", RF_classifier),
])
DT_model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", DT_classifier),
])
AB_model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", AB_classifier),
])
# XGB_model = pipeline.Pipeline([("vectorizer", vectorizer),("classifier", XGB_classifier)])

### Cross-validation for model evaluation

In [None]:
# Shared splitter: 10 stratified folds, passed as `cv` to every
# cross_val_predict call in the evaluation cells below.
cv = StratifiedKFold(n_splits=10)

In [None]:
# Naive Bayes, star prediction: collect cross-validated predictions, then
# print the classification report and the confusion matrix.
nb_stars_pred = cross_val_predict(
    estimator=NB_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Stars"],
    cv=cv,
)
print(classification_report(y_true=data["Stars"], y_pred=nb_stars_pred))
print(confusion_matrix(y_true=data["Stars"], y_pred=nb_stars_pred))

In [None]:
# Naive Bayes, polarity prediction: collect cross-validated predictions, then
# print the classification report and the confusion matrix.
nb_pol_pred = cross_val_predict(
    estimator=NB_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Polarity"],
    cv=cv,
)
print(classification_report(y_true=data["Polarity"], y_pred=nb_pol_pred))
print(confusion_matrix(y_true=data["Polarity"], y_pred=nb_pol_pred))

In [None]:
# SVM, star prediction: collect cross-validated predictions, then print the
# classification report and the confusion matrix.
svm_stars_pred = cross_val_predict(
    estimator=SVM_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Stars"],
    cv=cv,
)
print(classification_report(y_true=data["Stars"], y_pred=svm_stars_pred))
print(confusion_matrix(y_true=data["Stars"], y_pred=svm_stars_pred))

In [None]:
# SVM, polarity prediction: collect cross-validated predictions, then print
# the classification report and the confusion matrix.
svm_pol_pred = cross_val_predict(
    estimator=SVM_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Polarity"],
    cv=cv,
)
print(classification_report(y_true=data["Polarity"], y_pred=svm_pol_pred))
print(confusion_matrix(y_true=data["Polarity"], y_pred=svm_pol_pred))

In [None]:
# Random Forest, star prediction: collect cross-validated predictions, then
# print the classification report and the confusion matrix.
rf_stars_pred = cross_val_predict(
    estimator=RF_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Stars"],
    cv=cv,
)
print(classification_report(y_true=data["Stars"], y_pred=rf_stars_pred))
print(confusion_matrix(y_true=data["Stars"], y_pred=rf_stars_pred))

In [None]:
# Random Forest, polarity prediction: collect cross-validated predictions,
# then print the classification report and the confusion matrix.
rf_pol_pred = cross_val_predict(
    estimator=RF_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Polarity"],
    cv=cv,
)
print(classification_report(y_true=data["Polarity"], y_pred=rf_pol_pred))
print(confusion_matrix(y_true=data["Polarity"], y_pred=rf_pol_pred))

In [None]:
# Decision Tree, star prediction: collect cross-validated predictions, then
# print the classification report and the confusion matrix.
dt_stars_pred = cross_val_predict(
    estimator=DT_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Stars"],
    cv=cv,
)
print(classification_report(y_true=data["Stars"], y_pred=dt_stars_pred))
print(confusion_matrix(y_true=data["Stars"], y_pred=dt_stars_pred))

In [None]:
# Decision Tree, polarity prediction: collect cross-validated predictions,
# then print the classification report and the confusion matrix.
dt_pol_pred = cross_val_predict(
    estimator=DT_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Polarity"],
    cv=cv,
)
print(classification_report(y_true=data["Polarity"], y_pred=dt_pol_pred))
print(confusion_matrix(y_true=data["Polarity"], y_pred=dt_pol_pred))

In [None]:
# AdaBoost, star prediction: collect cross-validated predictions, then print
# the classification report and the confusion matrix.
ab_stars_pred = cross_val_predict(
    estimator=AB_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Stars"],
    cv=cv,
)
print(classification_report(y_true=data["Stars"], y_pred=ab_stars_pred))
print(confusion_matrix(y_true=data["Stars"], y_pred=ab_stars_pred))

In [None]:
# AdaBoost, polarity prediction: collect cross-validated predictions, then
# print the classification report and the confusion matrix.
ab_pol_pred = cross_val_predict(
    estimator=AB_model,
    X=data["Review Text"].values.astype('U'),
    y=data["Polarity"],
    cv=cv,
)
print(classification_report(y_true=data["Polarity"], y_pred=ab_pol_pred))
print(confusion_matrix(y_true=data["Polarity"], y_pred=ab_pol_pred))