This notebook cleans and preprocesses the dataset so that it is ready for training.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
chunksize = 200000

# Read the JSON-lines file `chunksize` rows at a time. With `chunksize` set,
# read_json returns a reader object rather than a DataFrame; using it as a
# context manager makes sure the underlying file handle is closed once all
# chunks have been collected (or if parsing fails part-way through).
with pd.read_json('../dataset/dataset.json', lines=True, chunksize=chunksize) as reader:
    chunks = list(reader)

# Stitch the pieces back together under one fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)

df.head()

Unnamed: 0,reviewerID,asin,reviewerName,verified,reviewText,overall,reviewTime,summary,unixReviewTime,style,vote,image
0,A1H1DL4K669VQ9,1393774,Judith Paladino,True,Love it!! Great seller!,5,"04 29, 2016",Five Stars,1461888000,,,
1,A3V5XBBT7OZG5G,1393774,gflady,True,One of my very favourite albums from one of my...,5,"02 23, 2016",One of my very favourite albums from one of my...,1456185600,,,
2,A3SNL7UJY7GWBI,1393774,Lady Leatherneck,True,"THank you Jesus Lord God, that brother Green's...",5,"02 11, 2016",Five Stars,1455148800,,,
3,A3478QRKQDOPQ2,1393774,jacki,True,I recall loving his other albums and maybe thi...,5,"11 28, 2015",forgot but I figured on some of these artists ...,1448668800,,,
4,A23M5VTSN2C3H1,1393774,Caliope,True,Keith Green was a pioneer in the field of Chri...,5,"12 16, 2014",and I have loved every album he did,1418688000,,,


In [5]:
# Show the available columns and how many reviews fall into each rating.
# The three pieces are joined with newlines, which yields the same text as
# printing them one after another.
print(
    f"Columns: \n{df.columns}",
    "\n",
    f"Class counts: \n{df.overall.value_counts()}",
    sep="\n",
)

Columns: 
Index(['reviewerID', 'asin', 'reviewerName', 'verified', 'reviewText',
       'overall', 'reviewTime', 'summary', 'unixReviewTime', 'style', 'vote',
       'image'],
      dtype='object')


Class counts: 
overall
5    968755
4    274731
3    110407
2     46367
1     43495
Name: count, dtype: int64


In [6]:
# defining functions here
def clean_data(df):
    """Strip the review frame down to the columns kept for later steps, in place.

    Removes the metadata columns listed below and then drops every row whose
    ``reviewText`` is missing. The frame passed in is mutated directly and
    nothing is returned.

    Metadata columns that are already absent are skipped instead of raising,
    so calling this again on an already-cleaned frame (for example when the
    calling cell is re-run) is a no-op rather than a ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Review data; must contain a ``reviewText`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``reviewText`` column.
    """
    unused_columns = [
        'reviewerName', 'verified', 'reviewTime', 'summary',
        'unixReviewTime', 'style', 'vote', 'image',
    ]
    # errors='ignore' makes the drop tolerant of columns removed by an earlier call.
    df.drop(columns=unused_columns, errors='ignore', inplace=True)
    df.dropna(subset=['reviewText'], inplace=True)
    
def preprocess(data):
    """Replace the ``reviewerID`` and ``asin`` values with integer codes, in place.

    Each of the two columns is label-encoded independently, so every distinct
    identifier is mapped to an integer code. The frame passed in is mutated
    directly and nothing is returned; the fitted encoders are discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``reviewerID`` and ``asin`` columns.
    """
    for column in ('reviewerID', 'asin'):
        # Assign the whole column so it takes on the encoder's integer dtype.
        # Writing through `data.loc[:, column] = ...` sets the values into the
        # existing column instead, which on pandas 2.x leaves it as an object
        # column holding integers.
        data[column] = LabelEncoder().fit_transform(data[column])

In [7]:
# Clean the frame, report how many missing values remain per column,
# then encode the identifier columns and preview the result.
clean_data(df)
missing_per_column = df.isna().sum()
print(missing_per_column)
preprocess(df)
df.head()

reviewerID    0
asin          0
reviewText    0
overall       0
dtype: int64


Unnamed: 0,reviewerID,asin,reviewText,overall
0,14054,0,Love it!! Great seller!,5
1,85186,0,One of my very favourite albums from one of my...,5
2,83064,0,"THank you Jesus Lord God, that brother Green's...",5
3,62917,0,I recall loving his other albums and maybe thi...,5
4,32788,0,Keith Green was a pioneer in the field of Chri...,5


In [None]:
# Score each review with the VADER analyzer and keep its 'compound' value,
# rounded to two decimals, as a new 'so_score' column.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# 'so_score' column may not have been produced in the recorded run - confirm.
so_score = SentimentIntensityAnalyzer()
polarity = list(
    map(
        lambda review: round(so_score.polarity_scores(review)['compound'], 2),
        df['reviewText'],
    )
)
df['so_score'] = polarity
df.head()

KeyboardInterrupt: 

In [4]:
# Drop users with fewer than 5 reviews.
# The per-reviewer row count comes from the built-in group size, broadcast back
# onto every row, and is used as a boolean mask. This keeps the same rows as
# groupby().filter(lambda x: len(x) >= 5) without calling a Python function
# once per reviewer.
df = df[df.groupby('reviewerID')['reviewerID'].transform('size') >= 5]

In [22]:
# Remove the raw review text from the frame. errors='ignore' keeps this cell
# safe to re-run after the column has already been removed.
df.drop(columns=['reviewText'], errors='ignore', inplace=True)

In [6]:
# Write the processed frame to CSV; the row index is not written out.
df.to_csv(path_or_buf='../dataset/dataset.csv', index=False)