In [1]:
# Mount Google Drive at /content/drive so the files under
# /content/drive/MyDrive used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the review corpus, parsing only the text and label columns.
# `usecols` skips every other column while reading; the re-selection afterwards
# pins the column order, because `usecols` alone keeps the order found in the file.
review_columns = ['reviewText', 'sentiment']
data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/amazon/FshionProductReviews_V5_NLP.csv',
    usecols=review_columns,
)
data = data[review_columns]
data.head()

Unnamed: 0,reviewText,sentiment
0,agree opening small bent hook expensive earrin...,0
1,tiny opening,0
2,little plastic back work great loosing hook ea...,1
3,mother law wanted present sister d work,0
4,look 100 stretched carefully push case fi...,0


In [4]:
# Print column dtypes and non-null counts to spot missing values before cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671560 entries, 0 to 671559
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   reviewText  664671 non-null  object
 1   sentiment   671560 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.2+ MB


In [5]:
# Number of missing values in each column.
data.isna().sum()

reviewText    6889
sentiment        0
dtype: int64

In [6]:
# Drop every row that contains a missing value.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that was
# produced by column selection, where pandas may emit SettingWithCopyWarning,
# and makes the lineage of `data` explicit.
data = data.dropna(axis=0)

In [7]:
# Re-count missing values per column after the rows with NaN were dropped.
data.isna().sum()

reviewText    0
sentiment     0
dtype: int64

In [8]:
# Number of rows per sentiment label.
data['sentiment'].value_counts()

1    451615
0    213056
Name: sentiment, dtype: int64

# Create a Balanced Dev Set

In [9]:
# Boolean row masks for each sentiment label.
is_label_0 = data['sentiment'] == 0
is_label_1 = data['sentiment'] == 1

# First and last 10,000 rows of each label.
devSet1 = data.loc[is_label_0].head(10000)
devSet2 = data.loc[is_label_1].head(10000)
devSet3 = data.loc[is_label_0].tail(10000)
devSet4 = data.loc[is_label_1].tail(10000)

In [10]:
# Stack the four slices row-wise and renumber the rows from 0.
devSet = pd.concat([devSet1, devSet2, devSet3, devSet4], axis=0, ignore_index=True)

In [11]:
# Print row count, dtypes and non-null counts of the assembled dev set.
devSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  40000 non-null  object
 1   sentiment   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [12]:
# Number of dev-set rows per sentiment label.
devSet['sentiment'].value_counts()

0    20000
1    20000
Name: sentiment, dtype: int64

In [13]:
# The four slices have been concatenated into devSet; drop their names.
del devSet1
del devSet2
del devSet3
del devSet4

# Splitting Data into Train and Test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Shuffle the dev set and hold out 10% of it as the test split (fixed seed 33).
# The features are passed as a one-column DataFrame, the labels as a Series.
train_tokens, test_tokens, train_sentiment, test_sentiment = train_test_split(
    devSet[['reviewText']],
    devSet['sentiment'],
    test_size=0.1,
    shuffle=True,
    random_state=33,
)

In [16]:
# Preview the first five training rows.
train_tokens.head(n=5)

Unnamed: 0,reviewText
13593,love van happy find amazon prime exactly expec...
15785,comfortable flip flop ever
37514,perfect fit great ball cap
27594,wrong knee area low case jean feel current f...
29695,small chest area 12 girl ly


In [17]:
# Preview the first five test rows.
test_tokens.head(n=5)

Unnamed: 0,reviewText
5502,barely opened 7 dollar wouldnt recommend seller
36357,super sexy nice fit little especially top
23479,october 2016 march 2017 started strip handle
19736,comfortable insole one helped heel pain pain gone
29026,soft little trunk itll pregnancy otherwise ...


In [18]:
# (rows, columns) of the training and test frames, shown as a pair.
(train_tokens.shape, test_tokens.shape)

((36000, 1), (4000, 1))

# Feature Extraction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# TF-IDF vectorizer limited to 100 features.
vectorizer = TfidfVectorizer(max_features=100)

In [21]:
# Fit the vectorizer on the training reviews only, then encode the held-out
# reviews with that same fitted vectorizer.
# Using fit_transform on the test set would refit the vectorizer on test data:
# the test columns would no longer correspond to the training columns, and the
# `vectorizer` object left in memory would be the test-fitted one.
train_embeddings = vectorizer.fit_transform(train_tokens.reviewText).toarray()
test_embeddings = vectorizer.transform(test_tokens.reviewText).toarray()

In [None]:
# NOTE(review): the training split has 36000 rows and the vectorizer is built
# with max_features=100, so this is expected to be (36000, 100). The recorded
# output below does not match and looks left over from an earlier run —
# re-execute this cell to refresh it.
train_embeddings.shape

(144000, 200)

In [22]:
# (rows, features) of the dense test feature matrix.
test_embeddings.shape

(4000, 100)

In [23]:
# Largest feature value in each training row.
np.max(train_embeddings, axis=1)

array([0.4482738 , 1.        , 0.65777914, ..., 1.        , 0.        ,
       0.7977945 ])

# Save Vectorizer

In [None]:
from joblib import dump

In [None]:
# Serialize the vectorizer to Drive with joblib, using compression level 1.
dump(
    vectorizer,
    '/content/drive/MyDrive/Colab Notebooks/amazon/TfIdfVectorizer.pkl',
    compress=1,
)

['/content/drive/MyDrive/Colab Notebooks/amazon/TfIdfVectorizer.pkl']

# Save to feature store

In [25]:
# Write the feature matrices and their labels into a single .npz archive,
# one named array per split component.
np.savez(
    '/content/drive/MyDrive/Colab Notebooks/amazon/dataset_V6.npz',
    train_embeddings=train_embeddings,
    train_sentiment=train_sentiment,
    test_embeddings=test_embeddings,
    test_sentiment=test_sentiment,
)