In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import gc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Number of rows read_csv hands back per chunk.
chunksize = 200000

# Stream the chunks straight into pd.concat instead of collecting them in a
# module-level list first: such a list stays referenced after the cell
# finishes and keeps a second copy of every row in kernel memory next to `df`.
# The context manager closes the underlying file handle even if parsing fails.
with pd.read_csv('../dataset/dataset.csv', chunksize=chunksize) as reader:
    df = pd.concat(reader, ignore_index=True)

df.head()

Unnamed: 0,reviewerID,asin,overall,so_score
0,14054,0,5,0.88
1,85186,0,5,0.36
2,83064,0,5,0.56
3,62917,0,5,0.95
4,32788,0,5,0.42


In [3]:
# Number of rows for each distinct value of the `overall` column.
df['overall'].value_counts()

overall
5    968485
4    274727
3    110405
2     46366
1     43492
Name: count, dtype: int64

In [8]:
# defining functions here
def clean_data(df):
    """Drop unused metadata columns and rows without review text, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Review table; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be chained.
        Callers that ignore the return value are unaffected.

    Notes
    -----
    Columns that are already absent are skipped instead of raising
    ``KeyError``, so the function can be re-run on a frame that has been
    cleaned before (or that was loaded from an already-cleaned file).
    """
    unused_columns = ['reviewerName', 'verified', 'reviewTime', 'summary',
                      'unixReviewTime', 'style', 'vote', 'image']
    # errors='ignore' drops only the labels that actually exist.
    df.drop(columns=unused_columns, inplace=True, errors='ignore')
    # dropna(subset=...) raises KeyError for a missing column, so guard it.
    if 'reviewText' in df.columns:
        df.dropna(subset=['reviewText'], inplace=True)
    return df
    
def preprocess(data):
    """Replace the ``reviewerID`` and ``asin`` columns with integer codes, in place.

    Each column is encoded independently with its own ``LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``reviewerID`` and ``asin`` columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be chained.
        Callers that ignore the return value are unaffected.
    """
    for column in ('reviewerID', 'asin'):
        # Assign with data[column] = ... rather than data.loc[:, column] = ...:
        # since pandas 2.0 the .loc form writes into the existing column, so
        # the codes would keep the column's original dtype (object for string
        # IDs) instead of becoming a proper integer column.
        data[column] = LabelEncoder().fit_transform(data[column])
    return data

In [9]:
# clean_data mutates `df` in place (drops the unused columns and the rows
# without reviewText); afterwards show the per-column count of missing values.
clean_data(df)
df.isna().sum()

reviewerID    0
asin          0
reviewText    0
overall       0
dtype: int64

In [10]:
# preprocess label-encodes reviewerID and asin on `df` in place;
# preview the first rows to inspect the encoded columns.
preprocess(df)
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall
0,14054,0,Love it!! Great seller!,5
1,85186,0,One of my very favourite albums from one of my...,5
2,83064,0,"THank you Jesus Lord God, that brother Green's...",5
3,62917,0,I recall loving his other albums and maybe thi...,5
4,32788,0,Keith Green was a pioneer in the field of Chri...,5


In [11]:
# Single VADER analyzer instance, reused for every review in the scoring cell.
# NOTE(review): presumably needs the NLTK `vader_lexicon` resource to be
# downloaded beforehand — confirm. The variable shares its name with the
# 'so_score' column created later; it holds the analyzer, not scores.
so_score = SentimentIntensityAnalyzer()

In [12]:
# Score every review text with the analyzer and keep the 'compound' value,
# rounded to two decimals, in row order.
polarity = []
for review in df['reviewText']:
    compound = so_score.polarity_scores(review)['compound']
    polarity.append(round(compound, 2))

# Attach the scores as a new column (positional, one score per row).
df['so_score'] = polarity

KeyboardInterrupt: 

In [13]:
# Preview the first rows, now including the so_score column.
df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,so_score
0,14054,0,Love it!! Great seller!,5,0.88
1,85186,0,One of my very favourite albums from one of my...,5,0.36
2,83064,0,"THank you Jesus Lord God, that brother Green's...",5,0.56
3,62917,0,I recall loving his other albums and maybe thi...,5,0.95
4,32788,0,Keith Green was a pioneer in the field of Chri...,5,0.42


In [4]:
# Keep only reviewers with at least MIN_REVIEWS_PER_USER reviews
# (i.e. drop users with fewer than 5 reviews).
MIN_REVIEWS_PER_USER = 5

# transform('size') gives every row the size of its reviewer group in one
# vectorized pass; groupby(...).filter(lambda ...) would instead call a Python
# function once per reviewer, which is very slow with many distinct reviewers.
# Row order and index labels of the surviving rows are unchanged.
reviews_per_user = df.groupby('reviewerID')['reviewerID'].transform('size')
df = df[reviews_per_user >= MIN_REVIEWS_PER_USER]

In [22]:
# Remove the reviewText column from `df` in place.
# NOTE(review): raises KeyError when reviewText is absent, e.g. if `df` was
# loaded from a CSV that was already saved without it — confirm the intended
# run order of this cell.
df.drop(columns=['reviewText'], inplace=True)

In [6]:
# Save the processed frame as CSV without the index column.
# NOTE(review): this is the same path the load cell reads from, so the input
# file is overwritten with the processed data and the notebook cannot be
# re-run from the raw dataset afterwards — confirm this is intended, or write
# to a separate output file.
df.to_csv('../dataset/dataset.csv', index=False)