# Amazon reviews classification


In [0]:
import os

import pandas as pd
import gzip
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

from textblob import Word

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

import joblib



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Mount Google Drive when running on Colab; elsewhere, remind the user to
# fetch the data manually.
try:
  from google.colab import drive
except ImportError:
  # Only a missing google.colab package means "not on Colab". Catching just
  # ImportError lets a genuine drive.mount() failure surface instead of being
  # reported with this (then misleading) message.
  print('You are not running on Google Colab. Remember to download the data and set the file paths accordingly.')
else:
  drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def parse(path):
  """Yield one evaluated record per line of a gzip-compressed file.

  Args:
    path: location of the gzip file, opened in binary mode.

  Yields:
    The result of ``eval`` applied to each line of the file, in file order.
  """
  # NOTE(security): eval() executes whatever expression the line contains.
  # Only use this on trusted dataset files; a safer parser should be
  # considered if the input can come from an untrusted source.
  # The `with` block closes the file once the generator is exhausted or
  # closed, instead of leaking the handle.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield eval(l)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their position in the file (0, 1, 2, ...), and each
  key becomes one row of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

In [0]:
basepath = '/content/drive/My Drive/Text Mining & Search PROJECT/'
datapath = os.path.join(basepath, 'TM_Dataset')


def _load_sample(filename):
  """Read one review archive from ``datapath`` and return a seeded 100000-row sample."""
  return getDF(os.path.join(datapath, filename)).sample(n=100000, random_state=1)


# One call per category. The food line previously passed `+ '...'` (a unary
# plus applied to a string) to os.path.join, which raises TypeError; every
# file now goes through the same helper.
beauty = _load_sample('reviews_Beauty_5.json.gz')
food = _load_sample('reviews_Grocery_and_Gourmet_Food_5.json.gz')
pet = _load_sample('reviews_Pet_Supplies_5.json.gz')
baby = _load_sample('reviews_Baby_5.json.gz')
sports = _load_sample('reviews_Sports_and_Outdoors_5.json.gz')

In [0]:
# Gather the per-category samples in a fixed order and print each row count.
df = [beauty, food, pet, baby, sports]
for frame in df:
  n_rows = len(frame)
  print(n_rows)

100000
100000
100000
100000
100000


In [0]:
# Label every row with the position of its source frame in `df`
# (this writes the new column into each sampled frame in place).
for label, frame in enumerate(df):
  frame['category'] = label

# Stack the labelled frames into a single DataFrame.
df_tot = pd.concat(df, sort=False)

In [0]:
# Distinct category labels present in the combined frame.
df_tot['category'].unique()

array([0, 1, 2, 3, 4])

In [0]:
# Show the column labels of the combined frame.
df_tot.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'category'],
      dtype='object')

In [0]:
# Keep only the review text and its category label.
# .copy() makes df_sub an independent frame: the preprocessing cells below
# assign into it, and without the copy pandas may treat those writes as
# writes to a slice of df_tot (SettingWithCopyWarning).
df_sub = df_tot.loc[:, ['reviewText', 'category']].copy()
df_sub.head(5)

Unnamed: 0,reviewText,category
1298,I wanted a coppery orange color and i had dyed...,0
23014,i purchased this item as a gift for my mom who...,0
32981,"This really like this product, my hair is rela...",0
186326,I don't like the taste that most vitamins leav...,0
10876,I don't know why I haven't had one of these br...,0


### 1. Preprocessing del testo

Inizialmente viene svolta una prima fase di preprocessing del testo in modo tale da snellirlo e renderlo più interpretabile durante la fase successiva, quella di classificazione.


**- Rimozione della punteggiatura**

**- Trasformazione del testo in minuscolo**

**- Eliminazione delle stop words**

**- Applicazione della *Lemmatization***

**- Costruzione della matrice tfidf**


In [0]:
# Punctuation removal: drop every character that is neither a word character
# nor whitespace.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged. The raw
# string avoids invalid-escape warnings for \w and \s.
df_sub.loc[:, 'reviewText'] = df_sub.loc[:, 'reviewText'].str.replace(r'[^\w\s]', '', regex=True)
df_sub.loc[:, 'reviewText'].head(5)

1298      I wanted a coppery orange color and i had dyed...
23014     i purchased this item as a gift for my mom who...
32981     This really like this product my hair is relax...
186326    I dont like the taste that most vitamins leave...
10876     I dont know why I havent had one of these brus...
Name: reviewText, dtype: object

In [0]:
# Lowercase the text. Each review is split on whitespace and re-joined with
# single spaces, so runs of whitespace are collapsed as well.
df_sub.loc[:, 'reviewText'] = df_sub.loc[:, 'reviewText'].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)
df_sub['reviewText'].head()

1298      i wanted a coppery orange color and i had dyed...
23014     i purchased this item as a gift for my mom who...
32981     this really like this product my hair is relax...
186326    i dont like the taste that most vitamins leave...
10876     i dont know why i havent had one of these brus...
Name: reviewText, dtype: object

In [0]:
# Stop-word removal.
stop_w = stopwords.words('english')

# Membership is tested once per token across every review, so use a set for
# constant-time lookups; `stop_w` itself is left untouched.
stop_w_set = set(stop_w)

# NOTE(review): punctuation is stripped before this step, so any stop word
# that contains an apostrophe can no longer match its stripped form (the
# output below still contains "dont") - confirm whether that is intended.
df_sub.loc[:, 'reviewText'] = df_sub.loc[:, 'reviewText'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_w_set)
)
df_sub['reviewText'].head(5)

1298      wanted coppery orange color dyed black hair pi...
23014     purchased item gift mom ran nail polish wished...
32981     really like product hair relaxed color treated...
186326    dont like taste vitamins leave mouth saw immed...
10876     dont know havent one brushes long theory clear...
Name: reviewText, dtype: object

In [0]:
# Lemmatization: replace every whitespace-separated token with the result of
# textblob's Word.lemmatize().
df_sub.loc[:, 'reviewText'] = df_sub.loc[:, 'reviewText'].apply(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
df_sub['reviewText'].head(5)

1298      wanted coppery orange color dyed black hair pi...
23014     purchased item gift mom ran nail polish wished...
32981     really like product hair relaxed color treated...
186326    dont like taste vitamin leave mouth saw immedi...
10876     dont know havent one brush long theory clear n...
Name: reviewText, dtype: object

# Text representation - unigram tfidf

In [0]:
# Build the TF-IDF matrix: word-level unigrams, at most 10000 features.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=10000,
)
tfidf_mat = tfidf.fit_transform(df_sub['reviewText'])

In [0]:
# Display the matrix summary (shape, dtype, number of stored elements).
tfidf_mat

<500000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 17465197 stored elements in Compressed Sparse Row format>

In [0]:
# Store the category labels in a separate vector and list its distinct values.
y = df_sub['category']
y.unique()

array([0, 1, 2, 3, 4])

In [0]:
# Persist the TF-IDF matrix together with its labels.
tfidf_out_path = os.path.join(basepath, 'tfidf_matrix_5class.joblib')
joblib.dump([tfidf_mat, y], tfidf_out_path)


## Dimensionality reduction - Truncated singular value decomposition

In [0]:
# Truncated SVD configured for 500 output components, with a fixed seed.
tsvd = TruncatedSVD(
    n_components=500,
    random_state=1,
)

In [0]:
# `tsvd` has only been constructed, never fitted, so calling transform() on it
# would raise NotFittedError. Fit the decomposition on the TF-IDF matrix and
# project it in a single step.
tfidf_mat_svd = tsvd.fit_transform(tfidf_mat)

In [0]:
# Persist the SVD-reduced matrix together with its labels.
svd_out_path = os.path.join(basepath, 'tfidf_matrix_svd.joblib')
joblib.dump([tfidf_mat_svd, y], svd_out_path)