In [1]:
# Import necessary modules
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument

In [2]:
# Import data 

# Read the CSV file into a DataFrame: df
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

In [3]:
df = df[pd.notnull(df['Reviews'])]
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0
374058,Samsung Galaxy S7 Edge SM-G935F 32GB Factory U...,,593.5,4,except samsung pay everything is good,0.0
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0


In [4]:
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower() # lowercase text
    text = re.sub(r'[/(){}\[\]\|@,;.#+_]',' ', text) 
    text = re.sub(r'[^0-9a-z ]','', text) 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

In [5]:
df['Reviews'] = df['Reviews'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,good one better samsung iphones quality camera...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,phone needed sim card would nice know,1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,3 months away upgrade stratosphere kept crappi...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,experience want forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,great phone work according expectations,1.0


In [7]:
maxLen = len(max(df['Reviews'], key=len).split())
maxLen

1408

In [8]:
df.shape[0] 

41374

In [9]:
def label_sentences(doc, label):
    labeldoc = []
    for i, w in enumerate(doc):
        labels = label + '_' + str(i)
        labeldoc.append(TaggedDocument(w.split(), [labels]))
    return labeldoc

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Rating'], random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
X = X_train + X_test

In [11]:
X[:3]

[TaggedDocument(words=['good', 'phone'], tags=['Train_0']),
 TaggedDocument(words=['ok', 'camara', 'bad'], tags=['Train_1']),
 TaggedDocument(words=['purchased', '2', 'phones', 'really', 'love', 'albeit', 'price', 'dropped', 'since', 'february', '2015', 'okay', 'really', 'think', 'quality', 'phone', 'warranted', 'higherprice', 'based', 'extensive', 'research', 'thing', 'found', 'disappointing', 'description', 'amazon', 'said', 'would', 'take', '64', 'gb', 'sd', 'card', 'fact', 'would', 'support', '34', 'gb', 'card', 'recently', 'delighted', 'blu', 'provided', 'upgrade', 'phone', 'permitting', 'accept', '64', 'gb', 'card', 'without', 'gerryrigging', 'using', 'straight', 'talk', '45', 'mo', '5', 'gb', 'unlimited', 'talk', 'text', 'even', 'bonus', 'originally', '3gbs', 'theyve', 'increased', '5gbs', 'using', 'byop', 'plan', 'market', 'months', 'theyre', 'starting', 'add', 'accessory', 'products', 'ordered', 'nice', 'leather', 'case', 'fits', '6', '0', 'lte', 'version', 'exclusively', 'id'

In [12]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.050, min_alpha=0.020)
model_dbow.build_vocab([x for x in X])

In [15]:
model_dbow.train(utils.shuffle([x for x in X]), total_examples=len(X), epochs=30)

In [16]:
def get_vectors(model, data_size, vectors_size, label):

    vectors = np.zeros((data_size, vectors_size))
    for i in range(0, data_size):
        labeldoc= label + '_' + str(i)
        vectors[i] = model.docvecs[labeldoc]
    return vectors

In [17]:
train_vectors = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# LogisticRegression Algorithm tuning
kfold = KFold(n_splits=5, random_state=7)
model = LogisticRegression()
c_space = np.logspace(-3, 6, 10)
param_grid = {'C': c_space}
logreg_cv = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold)
logresult=logreg_cv.fit(train_vectors, y_train)
print("Best: %f using %s" % (logresult.best_score_, logresult.best_params_))

Best: 0.676634 using {'C': 0.1}


In [21]:
from sklearn.metrics import classification_report
# Predict the labels of the test set: y_pred using LogisticRegression
y_pred2 = logreg_cv.predict(test_vectors)
# Compute and print metrics
print("Accuracy: {}".format(logreg_cv.score(test_vectors, y_test)))
print(classification_report(y_test, y_pred2))
print("Tuned Model Parameters: {}".format(logreg_cv.best_params_))

Accuracy: 0.6747764440505921
             precision    recall  f1-score   support

          1       0.62      0.76      0.68      2164
          2       0.39      0.05      0.09       743
          3       0.35      0.10      0.15       957
          4       0.40      0.15      0.22      1872
          5       0.73      0.95      0.82      6677

avg / total       0.61      0.67      0.61     12413

Tuned Model Parameters: {'C': 0.1}
