# Data Science For Business Project

# Step 1

## Preprocessing


In [1]:
# Load file (we keep the git repo as light as possible by only hosting the .gz's)
!rm -f *.json
!gunzip -c amazon_step1.json.gz > amazon_step1.json

import pandas as pd
import numpy as np

# A first look at the data
df1 = pd.read_json('amazon_step1.json', lines=True)
df1.head()

Unnamed: 0,asin,category,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000J4HXUC,Sports_and_Outdoors,"[1, 1]",5,It's a .50 Caliber Ammo Can. That largely sums...,"01 5, 2014",A3QRW0UJPKIAX7,Grant Fritchey,Clean and Exactly as Advertised,1388880000
1,0983393214,Books,"[0, 0]",5,This was a very good book. It kept me excited ...,"06 23, 2013",A2SEIOM4H06WTH,TJ,Great read!,1371945600
2,B003G4FVMY,Grocery_and_Gourmet_Food,"[0, 0]",5,"If you love coconut the way I do, you can't go...","05 19, 2013",A3GDEXMU9587JX,"K. Parsley ""kindlekat""","If you love coconut, get this coffee",1368921600
3,B00F9VRNF0,Cell_Phones_and_Accessories,"[0, 0]",5,I recently switched from the Galaxy S3 to the ...,"04 25, 2014",ASP3J2NEHDN4E,ChriS,Superior Protection!!!,1398384000
4,B00D5OZQUC,Amazon_Instant_Video,"[0, 0]",5,"Good show,looks like the gap from season 2 to ...","11 4, 2013",A1EDBI6TBKP9CO,Grants Book Trade,"Love the show, thanks for putting Season 3 on ...",1383523200


In [2]:
df1.dtypes

asin              object
category          object
helpful           object
overall            int64
reviewText        object
reviewTime        object
reviewerID        object
reviewerName      object
summary           object
unixReviewTime     int64
dtype: object

In [3]:
num_total_samples = len(df1.index)
num_total_features = len(df1.columns)
print("Number of features:", num_total_features)
print("Number of samples:", num_total_samples)

Number of features: 10
Number of samples: 96000


In [4]:
num_valid_entries_per_sample = df1.count(axis=1)

num_complete_samples = num_valid_entries_per_sample.tolist().count(num_total_features)

percentage_damaged_samples = 1 - num_complete_samples/num_total_samples
print('Number of damaged samples:', num_total_samples - num_complete_samples)
print('Percentage of damaged Samples:', np.around(100*percentage_damaged_samples,decimals=1), '%')

Number of damaged samples: 994
Percentage of damaged Samples: 1.0 %


It appears that we have some missing data. <br />
Let's see the number of valid entries for each feature

In [5]:
num_valid_entries_per_feature = df1.count(axis=0).sort_values()
print(num_valid_entries_per_feature)

reviewerName      95006
asin              96000
category          96000
helpful           96000
overall           96000
reviewText        96000
reviewTime        96000
reviewerID        96000
summary           96000
unixReviewTime    96000
dtype: int64


Only `reviewerName` is sometimes missing.<br />
## We now present the preprocessing choices for each features

We will use `reviewText` to predict the `category`, we drop the other features:<br />
TODO: use summary?

In [6]:
from sklearn import preprocessing
df1 = df1.drop(["asin", "helpful", "overall", "reviewTime", "reviewerID", "reviewerName", "summary", "unixReviewTime"], axis=1)

### category

`Category` is the target variable.<br />
We use OneHotEncoding since it is needed for feeding categorical data to many sklearn estimators, notably SVMs , which we are required to use.

In [7]:
one_hot = pd.get_dummies(df1["category"])
#df1 = df1.drop("category", 1)
df1 = df1.join(one_hot)

### reviewText
We add the feature $length(reviewText)$ because we think it may be correlated to the target.<br />
We standardize it: that way, outliers (e.g. very long comments) won't affect too much our model.

In [8]:
def text_length(row):
    return len(row)
df1["reviewTextLength"] = df1["reviewText"].apply(text_length)
df1["reviewTextLength"] = preprocessing.scale(pd.to_numeric(df1["reviewTextLength"]))



we also create features that capture the amount of punctuation that is used (standardized):

In [9]:
def count_char(row, char):
    return row.count(char)
df1["rtCountPoints"] = df1["reviewText"].apply(count_char, args=("."))
df1["rtCountPoints"] = preprocessing.scale(df1["rtCountPoints"])
df1["rtCountExcl"] = df1["reviewText"].apply(count_char, args=("!"))
df1["rtCountExcl"] = preprocessing.scale(df1["rtCountExcl"])
df1["rtCountInterr"] = df1["reviewText"].apply(count_char, args=("?"))
df1["rtCountInterr"] = preprocessing.scale(df1["rtCountInterr"])
df1["rtCountComas"] = df1["reviewText"].apply(count_char, args=(","))
df1["rtCountComas"] = preprocessing.scale(df1["rtCountComas"])



## Preprocessing results
We give below an overview of the final data frame that we will be using.

In [10]:
df1.head()

Unnamed: 0,category,reviewText,Amazon_Instant_Video,Apps_for_Android,Automotive,Baby,Beauty,Books,CDs_and_Vinyl,Cell_Phones_and_Accessories,...,Pet_Supplies,Sports_and_Outdoors,Tools_and_Home_Improvement,Toys_and_Games,Video_Games,reviewTextLength,rtCountPoints,rtCountExcl,rtCountInterr,rtCountComas
0,Sports_and_Outdoors,It's a .50 Caliber Ammo Can. That largely sums...,0,0,0,0,0,0,0,0,...,0,1,0,0,0,-0.106836,0.396455,-0.341411,-0.218245,-0.1877
1,Books,This was a very good book. It kept me excited ...,0,0,0,0,0,1,0,0,...,0,0,0,0,0,-0.627738,-0.470575,-0.341411,-0.218245,-0.56776
2,Grocery_and_Gourmet_Food,"If you love coconut the way I do, you can't go...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,-0.547035,-0.687332,0.215455,-0.218245,-0.441073
3,Cell_Phones_and_Accessories,I recently switched from the Galaxy S3 to the ...,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1.256557,1.155105,1.886052,-0.218245,-0.314387
4,Amazon_Instant_Video,"Good show,looks like the gap from season 2 to ...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,-0.573936,-0.578953,-0.341411,2.760569,-0.1877


## Training, tuning and testing of models

In [12]:
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

# Set up train and test sets
seed = 2017
features = df1[["reviewText"]]
target = df1[["category"]]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=seed)

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=seed)),
                    ])


# TODO: idk how to make this work
text_clf = text_clf.fit(X_train.values.reshape(len(X_train), 1), y_train)
text_clf

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

# Step 2
## Preprocessing

In [None]:
!gunzip -c amazon_step23.json.gz > amazon_step23.json
# TODO
#df2 = pd.read_json('amazon_step23.json.gz', lines=True)
#df2.head()

# Step 3
## Preprocessing