# Amazon Cell Phones & Accessories Dataset

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import spacy
from pathlib import Path
from matplotlib import pyplot
import numpy as np
import pickle
%matplotlib inline

DATA_PATH=Path('../data')
CLAS_PATH=Path('../data/cellphones_clas/')
TMP_PATH=Path(CLAS_PATH/'tmp/')
CLAS_PATH.mkdir(exist_ok=True)
TMP_PATH.mkdir(exist_ok=True)

In [2]:
data_train = pd.read_csv(CLAS_PATH/'train.csv', header=None)
data_val = pd.read_csv(CLAS_PATH/'val.csv', header=None)
data_test = pd.read_csv(CLAS_PATH/'test.csv', header=None)
data = pd.concat([data_train, data_val, data_test])
data.columns = ['label', 'body_text']
data['body_text'] = data['body_text'].apply(str)

In [3]:
nlp = spacy.load('en')
def clean_text(text):
#     text = "".join([word.lower() for word in text if word not in string.punctuation])
#     text = "".join([word.lower() for word in text])
    text = nlp(text)
    text = [word.lemma_ for word in text]
    return text

## Feature transformation and regularization

## Train/Test Split

In [4]:
len(data)

50000

In [5]:
neg_data = data[data['label']==0]
pos_data = data[data['label']==1]

neg_data = neg_data.sample(n=10000, random_state=10)
pos_data = pos_data.sample(n=10000, random_state=10)

from sklearn.model_selection import train_test_split

data = pd.concat([neg_data, pos_data])

X_train, X_test, y_train, y_test = train_test_split(data['body_text'], data['label'], test_size=0.2, random_state=10, stratify=data['label'])

print('Training set: ')
print(y_train.value_counts())
print('')
print('Testing set')
print(y_test.value_counts())

Training set: 
1    8000
0    8000
Name: label, dtype: int64

Testing set
1    2000
0    2000
Name: label, dtype: int64


## Vectorizing Text

In [6]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text, ngram_range=(1, 3), max_features=50000, max_df=0.99, min_df=1)
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.toarray())
X_test_vect = pd.DataFrame(tfidf_test.toarray())

X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26271,26272,26273,26274,26275,26276,26277,26278,26279,26280
0,0.099371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
tfidf_vect.vocabulary_

{'the': 23265,
 'charger': 6512,
 'may': 15235,
 'be': 4930,
 'real': 19401,
 'samsung': 20522,
 'but': 5822,
 'hologram': 12339,
 'miss': 15632,
 '.': 155,
 ' ': 0,
 'no': 16402,
 'wall': 25260,
 'and': 3984,
 'a': 3216,
 'counterfeit': 7623,
 'battery': 4861,
 'make': 15046,
 'this': 23470,
 'deal': 8070,
 'amazon': 3887,
 'fail': 10192,
 'love': 14828,
 ',': 44,
 '-PRON-': 110,
 'protect': 18917,
 'entire': 9695,
 'phone': 17819,
 'of': 16750,
 'course': 7640,
 'print': 18665,
 'very': 25050,
 'happy': 11921,
 'with': 25727,
 'one': 16911,
 'buy': 5872,
 'holster': 12341,
 'case': 6240,
 'for': 10793,
 'galaxy': 11154,
 's3': 20412,
 'work': 25798,
 'really': 19415,
 'good': 11450,
 'to': 23742,
 'keep': 14031,
 'can': 6079,
 'clip': 6849,
 'on': 16872,
 'belt': 5065,
 'or': 17050,
 'pant': 17487,
 'little': 14650,
 'big': 5156,
 'compare': 7112,
 'use': 24799,
 'carry': 6219,
 'iphone': 13446,
 'will': 25656,
 'get': 11275,
 'have': 11983,
 'about': 3278,
 '1month': 956,
 'still': 

## Modelling

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
import time

### Random Forest Classifier (RF)

In [9]:
rf = RandomForestClassifier(n_estimators=400, max_depth=None, n_jobs=-1, random_state=10)

In [10]:
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_rf = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 334.424 seconds
Prediction done in 1.979 seconds


In [14]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_rf, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_rf==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.85      0.86      0.85      2000
   positive       0.85      0.85      0.85      2000

avg / total       0.85      0.85      0.85      4000

accuracy: 0.85
ROC AUC Score : 0.851
F1 Score : 0.8501759678230267


### Support Vector Machine Classifier (SVM)

In [15]:
clf = svm.LinearSVC(random_state=10)

In [16]:
start = time.time()
clf_fit = clf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_svm = clf_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))




Training done in 6.026 seconds
Prediction done in 1.084 seconds


In [17]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_svm, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_svm==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_svm))
print("f1 Score :", f1_score(y_test, y_pred_svm))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.87      0.85      0.86      2000
   positive       0.86      0.87      0.86      2000

avg / total       0.86      0.86      0.86      4000

accuracy: 0.86
ROC AUC Score : 0.862
f1 Score : 0.8629592850049653


### Naive Bayes Classifier (NB)

In [18]:
nbc = MultinomialNB()

In [19]:
start = time.time()
nbc_fit = nbc.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)
print("Training done in {} seconds".format(round(fit_time, 3)))

start = time.time()
y_pred_nb = nbc_fit.predict(X_test_vect)
end = time.time()
pred_time = (end - start)
print("Prediction done in {} seconds".format(round(pred_time, 3)))

Training done in 5.135 seconds
Prediction done in 1.051 seconds


In [20]:
print('Classification Report')
print('----------------------')
print(classification_report(y_test, y_pred_nb, labels = [0, 1], target_names=['negative', 'positive']))
print('accuracy: {}'.format(round((y_pred_nb==y_test).sum()/len(y_test), 2)))
print("ROC AUC Score :", roc_auc_score(y_test, y_pred_nb))
print("F1 Score :", f1_score(y_test, y_pred_nb))

Classification Report
----------------------
             precision    recall  f1-score   support

   negative       0.84      0.83      0.84      2000
   positive       0.83      0.85      0.84      2000

avg / total       0.84      0.84      0.84      4000

accuracy: 0.84
ROC AUC Score : 0.8375
F1 Score : 0.8389494549058474
