# imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
import re
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from nltk.stem import  WordNetLemmatizer

In [4]:
# Fetch the NLTK data packages used by this notebook. The cell displays the
# return value of the last call.
download('stopwords')  # stop-word lists, read later via stopwords.words('english')
download('wordnet')  # presumably for WordNetLemmatizer (imported above) — not used in the cells visible here
download('punkt')  # presumably for word_tokenize (imported above) — not used in the cells visible here

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the reviews table from Parquet. Later cells read its `sentiment`,
# `reviewLength` and `reviewText` columns.
# NOTE(review): absolute path that looks like a Colab Google Drive mount —
# confirm it exists before running in another environment.
df = pd.read_parquet('/content/drive/MyDrive/Amazon Reviews/AMAZON_FASHION_v4.parquet')

In [8]:
# Keep the first 10,000 rows of each sentiment label (0 first, then 1) and
# stack them into a single frame with a fresh 0..n-1 index.
new = pd.concat(
    [df[df['sentiment'] == label].head(10000) for label in (0, 1)],
    ignore_index=True,
)
# Drop the reference to the full table; only the subset is used from here on.
del df

In [14]:
# 80th percentile of the `reviewLength` column in the sampled subset.
new['reviewLength'].quantile(q=0.8)

50.0

# TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF vectorizer: NLTK's English stop words are removed and the vocabulary
# is capped at 500 terms.
tf_idf = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=500)
# Fit on the review text and report (n_documents, n_terms). The sparse result
# already exposes .shape, so it is not densified just to be inspected.
tf_idf.fit_transform(new['reviewText']).shape

(20000, 500)

### Dimensionality Reduction

In [20]:
from sklearn.decomposition import PCA

In [22]:
# PCA that keeps 50 components. random_state pins the seed for any solver
# that uses randomness, so repeated runs give the same projection.
reducer = PCA(n_components=50, random_state=42)

In [23]:
# Fit PCA on the dense TF-IDF matrix. tf_idf was already fitted in the cell
# that created it, so transform() is enough here; calling fit_transform()
# again would relearn the same vocabulary and IDF weights for no benefit.
reducer.fit(tf_idf.transform(new['reviewText']).toarray())

In [24]:
# Preview the PCA projection of the TF-IDF vectors for the reviews in `new`.
# The already-fitted vectorizer is reused via transform() rather than refitted.
reducer.transform(tf_idf.transform(new['reviewText']).toarray())

array([[-0.03883539, -0.02700771, -0.0115248 , ...,  0.01639365,
        -0.04403369,  0.06483385],
       [-0.04289114, -0.02811076, -0.01243253, ..., -0.00203695,
        -0.07654652,  0.15616914],
       [ 0.11810459, -0.02613972, -0.02361583, ...,  0.02808952,
         0.02926751, -0.02429045],
       ...,
       [-0.06144726, -0.03755727, -0.01425981, ...,  0.04723072,
         0.02915458,  0.09239452],
       [-0.06787141, -0.0344248 , -0.03063971, ...,  0.0124085 ,
        -0.02741725, -0.06128591],
       [-0.05321103, -0.01851637, -0.01526086, ...,  0.05435463,
         0.06106377, -0.01866816]])

In [28]:
# Final feature matrix: TF-IDF vectors projected onto the fitted principal
# components. transform() reuses the vocabulary and IDF weights learned
# earlier instead of refitting the vectorizer a fourth time.
features = reducer.transform(tf_idf.transform(new['reviewText']).toarray())
# Labels, taken from the same frame (and in the same row order) as the text
# that produced `features`.
target = new['sentiment']

In [29]:
# Name one column per principal component (pc1, pc2, ...). The count is read
# from `features` so the labels stay correct if the reducer's n_components
# is ever changed.
columns = [f'pc{i}' for i in range(1, features.shape[1] + 1)]
df = pd.DataFrame(features, columns=columns)
# Column assignment aligns on index: both sides carry a default 0..n-1 index
# (`new` was built with ignore_index=True), so rows match positionally.
df['sentiment'] = target

In [30]:
# Show the first five rows of the reduced feature table as a sanity check.
df.head(n=5)

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,...,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49,pc50,sentiment
0,-0.038835,-0.027008,-0.011525,-0.072345,-0.027155,-0.022539,0.0048,-0.010688,0.019479,-0.039771,...,0.038731,-0.036784,-0.083373,-0.076839,0.081127,-0.053671,0.016394,-0.044034,0.064834,0
1,-0.042891,-0.028111,-0.012433,-0.07306,-0.02974,-0.028217,0.007682,-0.011262,0.014247,-0.027737,...,0.063574,-0.077495,0.029717,-0.096108,-0.007815,-0.127521,-0.002037,-0.076547,0.156169,0
2,0.118105,-0.02614,-0.023616,-0.109601,0.018769,0.02536,-0.001296,-0.016394,0.030086,-0.040411,...,-0.027135,-0.032203,0.018897,0.009921,0.012143,-0.01797,0.02809,0.029268,-0.02429,0
3,-0.014612,-0.021104,-0.001063,-0.0934,-0.038795,0.116989,-0.019004,-0.040316,0.054144,-0.063023,...,0.014849,-0.033066,0.057981,-0.03358,-0.042369,-0.077679,-0.006845,-0.032879,0.094454,0
4,0.103122,0.285692,0.129457,0.056705,-0.00436,-0.103256,0.034043,-0.002988,-0.047971,0.041401,...,-0.000128,-0.037637,0.009139,-0.014279,-0.017376,-0.023573,-0.003879,0.003572,-0.03279,0


### Save

In [32]:
from joblib import dump

In [33]:
# Persist the fitted TF-IDF vectorizer with joblib; the cell displays the
# list of file names that dump() returns.
dump(tf_idf, '/content/drive/MyDrive/Amazon Reviews/TFIDF')

['/content/drive/MyDrive/Amazon Reviews/TFIDF']

In [34]:
# Persist the fitted PCA reducer with joblib, alongside the vectorizer file.
dump(reducer, '/content/drive/MyDrive/Amazon Reviews/reducer')

['/content/drive/MyDrive/Amazon Reviews/reducer']

In [31]:
# Write the reduced feature table (pc1..pcN plus `sentiment`) to Parquet.
# NOTE(review): this cell's execution count (31) is lower than the save cells
# above it (32-34), so it was run out of document order — re-run top to bottom
# to confirm the notebook reproduces.
df.to_parquet('/content/drive/MyDrive/Amazon Reviews/AMAZON_FASHION_v5.parquet')