# Predictions for part 5

In [1]:
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [2]:
# Read both splits from CSV files in the working directory
train_data = pd.read_csv('Training.csv')
test_data = pd.read_csv('Test.csv')

# Summary statistics for every training column (numeric and non-numeric);
# as the cell's last expression this is what gets displayed.
train_data.describe(include='all')

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style,category
count,29189.0,29189,29189,29189,29189,29185,29189,29184,29189.0,6047.0,600,15365,29189
unique,,2,4866,27588,24102,23305,28807,21768,,,600,6104,6
top,,True,"01 3, 2017",AA7918E9410D650A076221C7B2934A09,6E46168E5DFA83876F2E968A187B9F7A,760C63E8E5E8DC3FAA01878D37BA5678,works great,Five Stars,,,['https://images-na.ssl-images-amazon.com/imag...,{'Format:': ' Audio CD'},CDs
freq,,22996,40,32,36,1049,25,1476,,,1,4296,4912
mean,2.971907,,,,,,,,1409661000.0,7.295353,,,
std,1.410878,,,,,,,,111509900.0,18.74946,,,
min,1.0,,,,,,,,894758400.0,2.0,,,
25%,2.0,,,,,,,,1390781000.0,2.0,,,
50%,3.0,,,,,,,,1439770000.0,4.0,,,
75%,4.0,,,,,,,,1476317000.0,7.0,,,


In [3]:
import re

# Preprocessing data

# Stop-word lists. All three are currently empty, so no word is filtered out;
# populate them to drop words from both fields (generic) or from one field only.
GENERIC_STOP_WORDS = []

STOP_WORDS_SUMMARY = []

STOP_WORDS_REVIEW_TEXT = []

# Compiled once and shared by both cleaners. Alternatives are tried left to
# right at each position of the text:
#   @[A-Za-z0-9]+      an "@mention" (the "@" plus the alphanumerics after it)
#   [^0-9A-Za-z \t]    any character that is not an ASCII letter, digit, space or tab
#   \w+:\/\/\S+        a URL with an explicit scheme, up to the next whitespace
#   ^rt                a leading "rt"
#   http.+?            a remaining "http" plus the single character after it
_UNWANTED_PATTERN = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")


def _clean_text(content, stopwords):
    """Lower-case `content`, delete unwanted characters and drop stop words.

    Parameters
    ----------
    content : str
        Raw text to clean.
    stopwords : iterable of str
        Words to remove after cleaning (compared against lower-cased words).

    Returns
    -------
    str
        The surviving words joined by single spaces ("" if nothing survives).
    """
    # step 1 - convert the text to only lower case
    lowered = content.lower()

    # step 2 - remove unwanted and unicode characters.
    # Matches are deleted, not replaced by a space, so "well-made" -> "wellmade".
    stripped = _UNWANTED_PATTERN.sub("", lowered)

    # step 3 - remove stop words (set membership is O(1) per word)
    blocked = set(stopwords)
    return " ".join(word for word in stripped.split() if word not in blocked)


def edit_summary(content):
    """Clean a review summary.

    Applies the shared cleaning steps and removes the words listed in
    GENERIC_STOP_WORDS and STOP_WORDS_SUMMARY (read at call time).
    """
    return _clean_text(content, GENERIC_STOP_WORDS + STOP_WORDS_SUMMARY)


def edit_review_text(content):
    """Clean a review body.

    Applies the shared cleaning steps and removes the words listed in
    GENERIC_STOP_WORDS and STOP_WORDS_REVIEW_TEXT (read at call time).
    """
    return _clean_text(content, GENERIC_STOP_WORDS + STOP_WORDS_REVIEW_TEXT)

# create categories for vote label


def assign_vote_label(i):
    """Bucket a vote count into a coarse label.

    Upper bounds are inclusive:
        i <= 2        -> 'low'
        2 < i <= 10   -> 'medium'
        10 < i <= 50  -> 'good'
        i > 50        -> 'high'

    A value for which every comparison is False (e.g. NaN) yields None.
    """
    upper_bounds = ((2.0, 'low'), (10.0, 'medium'), (50.0, 'good'))

    # Bounds are ascending, so the first one that is not exceeded wins.
    for bound, label in upper_bounds:
        if i <= bound:
            return label

    if i > 50.0:
        return 'high'

    return None


def preprocess(data):
    """Build the model's feature frame from a raw review DataFrame.

    The input frame is left untouched: the work is done on a copy of the
    selected columns, so the function can be called repeatedly on the same
    frame (re-running the cell is safe).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'reviewText', 'summary', 'verified',
        'vote' and 'category'.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly those five columns, where
        - 'reviewText' / 'summary' are cleaned by edit_review_text / edit_summary
          (missing text becomes an empty string),
        - 'vote' is replaced by the label from assign_vote_label
          (missing votes are treated as 0),
        - missing 'category' values become 'Unknown'.
    """
    # filter out only important columns
    X_cols = ['reviewText', 'summary', 'verified', 'vote', 'category']

    # Copy so the assignments below never write into the caller's frame.
    X = data[X_cols].copy()

    # editing text of "reviewText" & "summary"
    # Fill missing text first: astype(str) alone would turn NaN into the
    # literal word "nan", which would then be vectorised as a real token.
    X['reviewText'] = X['reviewText'].fillna('').astype(str).apply(
        edit_review_text)
    X['summary'] = X['summary'].fillna('').astype(str).apply(edit_summary)

    # editing vote - assigning NAN values to 0, then bucketing into labels
    X['vote'] = X['vote'].fillna(0).apply(assign_vote_label)

    X['category'] = X['category'].fillna('Unknown')

    return X


In [4]:

# Target: the 'overall' column of the training split, cast to int
y = train_data['overall'].astype(int)

# Feature frames for the training and test splits (same preprocessing for
# both; the training split is processed first, then the test split)
X_main_train, X_main_test = (
    preprocess(split) for split in (train_data, test_data)
)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder

# Column-wise feature extraction: TF-IDF for the two text columns and an
# ordinal (integer) encoding for the three categorical columns.
# Text columns are selected with a plain string so the vectorizer receives a
# 1-D column of documents; encoder columns are selected with a list (2-D).
ct = ColumnTransformer(
    transformers=[
        (
            "vect_summary",
            TfidfVectorizer(),
            'summary',
        ),
        (
            "vect_reviewText",
            TfidfVectorizer(),
            'reviewText',
        ),
        (
            "encd_verified",
            OrdinalEncoder(dtype=int),
            ['verified'],
        ),
        (
            # categories not seen during fit are encoded as 99
            "encd_category",
            OrdinalEncoder(
                dtype=int,
                handle_unknown='use_encoded_value',
                unknown_value=99,
            ),
            ['category'],
        ),
        (
            "encd_votes",
            OrdinalEncoder(dtype=int),
            ['vote'],
        ),
    ]
)


In [6]:
from sklearn.naive_bayes import MultinomialNB

# Feature extraction followed by a multinomial naive Bayes classifier
pipe = Pipeline(
    steps=[
        ("transformer", ct),
        ("classifier", MultinomialNB()),
    ]
)

# Hyperparameter grid; every entry holds a single value, so the search
# evaluates exactly one configuration.
params = {
    # TF-IDF on the summary column
    "transformer__vect_summary__ngram_range": [(1, 2)],
    "transformer__vect_summary__max_features": [35000],
    # TF-IDF on the review text column
    "transformer__vect_reviewText__ngram_range": [(1, 2)],
    "transformer__vect_reviewText__max_features": [15000],
    # naive Bayes classifier
    "classifier__alpha": [1],
    "classifier__fit_prior": [False],
}

# 5-fold cross-validated grid search over the pipeline
grid_search = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

grid_search.fit(X_main_train, y)


In [7]:
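# Alternative model (disabled): logistic-regression pipeline kept for reference.
# Uncommenting this cell rebinds `pipe`, `params` and `grid_search`, replacing
# the MultinomialNB search fitted in the previous cell.
# from sklearn.linear_model import LogisticRegression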

# pipe = Pipeline([
#     ("transformer", ct),
#     ("classifier", LogisticRegression())
# ])

# params = {
#     "transformer__vect_summary__ngram_range": [(1, 2)],
#     "transformer__vect_reviewText__ngram_range": [(1, 2)],
#     "transformer__vect_summary__max_features": [32000],
#     "transformer__vect_reviewText__max_features": [5000],
#     "classifier__solver": ['liblinear'],
#     "classifier__C": [5],
#     "classifier__max_iter": [100]
# }

# # perform grid search cross-validation to find the best hyperparameters
# grid_search = GridSearchCV(pipe, params, cv=5)

# grid_search.fit(X_main_train, y)


In [8]:
# Predict the test split with the best estimator found by the grid search.
# As the notebook stands this is the MultinomialNB pipeline (the
# logistic-regression variant is commented out); the `_log` suffix in the
# variable name is kept only so existing references keep working.
y_pred_test_log = grid_search.best_estimator_.predict(X_main_test)

# One row per test example, identified by its label in test_data's index
test_ids = list(test_data.index)
test_submission = pd.DataFrame({'id': test_ids, 'predicted': y_pred_test_log})

test_submission.to_csv('test_submission_part_5.csv', index=False)

# f1score: 0.61413

# Preview of the submission; placed last so the notebook actually displays it
test_submission.head()
