In [1]:
# Import necessary modules
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from sklearn import utils

In [2]:
# Import data 

# Read the CSV file into a DataFrame: df
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

In [3]:
df = df[pd.notnull(df['Reviews'])]
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0
374058,Samsung Galaxy S7 Edge SM-G935F 32GB Factory U...,,593.5,4,except samsung pay everything is good,0.0
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0


In [4]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower() # lowercase text
    text = re.sub(r'[/(){}\[\]\|@,;.#+_]',' ', text) 
    text = re.sub(r'[^0-9a-z ]','', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

In [5]:
df['Reviews'] = df['Reviews'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,good one better samsung iphones quality camera...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,phone needed sim card would nice know,1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,3 months away upgrade stratosphere kept crappi...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,experience want forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,great phone work according expectations,1.0


In [7]:
maxLen = len(max(df['Reviews'], key=len).split())
maxLen

1408

In [8]:
df.shape[0] 

41374

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Rating'], random_state=0, test_size=0.3)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
len(vect.get_feature_names())

21056

In [11]:
import warnings
warnings.filterwarnings("ignore")
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

In [19]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, reg_alpha=4, objective='multi:softmax', subsample=0.8).fit(X_train_vectorized, y_train) 
xgb_prediction = xgb_model.predict(X_test_vectorized)

In [23]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
print('n-gram level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train_vectorized), average='micro'))
print('n-gram level tf-idf test score:', f1_score(y_test, xgb_model.predict(X_test_vectorized), average='micro'))
print(classification_report(y_test, xgb_prediction))

n-gram level tf-idf training score: 0.8972756465591658
n-gram level tf-idf test score: 0.7112704422782566
             precision    recall  f1-score   support

          1       0.68      0.80      0.73      2164
          2       0.59      0.16      0.25       743
          3       0.55      0.22      0.32       957
          4       0.52      0.22      0.31      1872
          5       0.75      0.95      0.84      6677

avg / total       0.68      0.71      0.66     12413

