In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import gc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Number of rows per chunk yielded by the line-delimited JSON reader.
chunksize = 200000

# With `chunksize` set, read_json returns an iterator of DataFrames instead of
# one frame. Using it as a context manager guarantees the underlying file
# handle is closed even if the read is interrupted part-way through.
with pd.read_json('../dataset/Electronics_5.json', lines=True, chunksize=chunksize) as reader:
    chunks = list(reader)

# Stitch the chunks into a single frame with a fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)

df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5,67,True,"09 18, 1999",AAP7PPBU72QFM,151004714,{'Format:': ' Hardcover'},D. C. Carrad,This is the best novel I have read in 2 or 3 y...,A star is born,937612800,
1,3,5,True,"10 23, 2013",A2E168DTVGE6SV,151004714,{'Format:': ' Kindle Edition'},Evy,"Pages and pages of introspection, in the style...",A stream of consciousness novel,1382486400,
2,5,4,False,"09 2, 2008",A1ER5AYS3FQ9O3,151004714,{'Format:': ' Paperback'},Kcorn,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...,1220313600,
3,5,13,False,"09 4, 2000",A1T17LMQABMBN5,151004714,{'Format:': ' Hardcover'},Caf Girl Writes,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!,968025600,
4,3,8,True,"02 4, 2000",A3QHJ0FXK33OBE,151004714,{'Format:': ' Hardcover'},W. Shane Schmidt,I was taken in by reviews that compared this b...,A dissenting view--In part.,949622400,


In [3]:
# Number of reviews per star rating, most frequent first.
df['overall'].value_counts()

overall
5    4323582
4    1137393
3     504781
1     467158
2     306676
Name: count, dtype: int64

In [5]:
def downsample_ratings(df, random_state=None):
    """Balance the rating classes by downsampling to the smallest class.

    Rows whose ``overall`` value is 1..5 are grouped by rating. Every class
    holding more rows than the smallest non-empty class is randomly reduced
    to that size (sampling without replacement); the smallest class is kept
    as is. Rows are selected by position rather than by index label, so the
    result stays balanced even when ``df`` has repeated index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``overall`` column holding the ratings.
    random_state : int or None, optional
        Seed for a dedicated ``numpy.random.RandomState``. With ``None``
        (the default) NumPy's global random state is used, so an earlier
        ``np.random.seed(...)`` call keeps working.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered by rating class (1 first, 5 last).
        Ratings with no rows at all are skipped instead of forcing the
        target size to zero. An empty frame is returned when no row has a
        rating in 1..5.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Row positions (not index labels) of every rating class.
    positions_by_rating = {
        rating: np.flatnonzero((df['overall'] == rating).to_numpy())
        for rating in range(1, 6)
    }

    # Ignore ratings that do not occur; otherwise the minimum would be 0.
    populated = [pos for pos in positions_by_rating.values() if len(pos) > 0]
    if not populated:
        return df.iloc[0:0]

    min_class_size = min(len(pos) for pos in populated)

    selected = []
    for class_positions in populated:
        if len(class_positions) > min_class_size:
            class_positions = rng.choice(
                class_positions,
                size=min_class_size,
                replace=False,
            )
        selected.append(class_positions)

    # Positional lookup returns exactly one row per chosen position.
    return df.iloc[np.concatenate(selected)]

In [4]:
# defining functions here
def clean_data(df):
    """Drop unused metadata columns and reviews without text, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Review frame; it is modified in place and nothing is returned.
        Must contain a ``reviewText`` column.
    """
    unused_columns = [
        'reviewerName', 'verified', 'reviewTime', 'summary',
        'unixReviewTime', 'style', 'vote', 'image',
    ]
    # errors='ignore': a column that is absent (never present in this
    # dataset, or already removed by an earlier run of the cell) is skipped
    # instead of raising KeyError.
    df.drop(columns=unused_columns, inplace=True, errors='ignore')
    # Rows with no review text are removed.
    df.dropna(subset=['reviewText'], inplace=True)
    
def preprocess(data):
    """Replace reviewer and product identifiers with integer codes, in place.

    ``reviewerID`` and ``asin`` are each passed through
    ``LabelEncoder.fit_transform`` and the column is overwritten with the
    resulting codes. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``reviewerID`` and ``asin`` columns; modified in
        place.
    """
    for column in ('reviewerID', 'asin'):
        # Assign the whole column with data[column] = ... rather than
        # data.loc[:, column] = ...: on pandas >= 2.0 the .loc form writes
        # into the existing column and keeps its object dtype, so the codes
        # would not end up in an integer column.
        # NOTE(review): if `data` is a slice of another frame, this form can
        # emit SettingWithCopyWarning on pandas < 3 - pass a real copy.
        data[column] = LabelEncoder().fit_transform(data[column])

In [6]:
# Strip the unused columns and text-less reviews (mutates df), then show
# how many missing values remain in each column.
clean_data(df)
df.isnull().sum()

overall       0
reviewerID    0
asin          0
reviewText    0
dtype: int64

In [7]:
# Encode the reviewer / product ID columns in place and preview the result.
preprocess(df)
df.head(5)

Unnamed: 0,overall,reviewerID,asin,reviewText
0,5,593888,1,This is the best novel I have read in 2 or 3 y...
1,3,268036,1,"Pages and pages of introspection, in the style..."
2,5,79258,1,This is the kind of novel to read when you hav...
3,5,155813,1,What gorgeous language! What an incredible wri...
4,3,526903,1,I was taken in by reviews that compared this b...


In [11]:
# Single VADER analyzer instance, reused for every review in the scoring cell.
# NOTE(review): presumably requires the NLTK `vader_lexicon` resource to be
# downloaded beforehand - confirm on a fresh environment.
so_score = SentimentIntensityAnalyzer()

In [12]:
def _compound_score(text):
    """Return the VADER compound polarity of one review, rounded to 2 decimals."""
    return round(so_score.polarity_scores(text)['compound'], 2)


# One score per review, in row order, stored as a new column.
polarity = list(map(_compound_score, df['reviewText']))
df['so_score'] = polarity

KeyboardInterrupt: 

In [13]:
# Preview the frame with the new sentiment column.
# NOTE(review): the output saved under this cell shows a different column
# order and different IDs than the preview after preprocess - it looks like
# it came from an earlier session; re-run on a fresh kernel to refresh it.
df.head(5)

Unnamed: 0,reviewerID,asin,reviewText,overall,so_score
0,14054,0,Love it!! Great seller!,5,0.88
1,85186,0,One of my very favourite albums from one of my...,5,0.36
2,83064,0,"THank you Jesus Lord God, that brother Green's...",5,0.56
3,62917,0,I recall loving his other albums and maybe thi...,5,0.95
4,32788,0,Keith Green was a pioneer in the field of Chri...,5,0.42


In [4]:
# Keep only reviewers with at least 5 reviews. The per-row group size is
# computed in one vectorised pass; groupby().filter(lambda ...) would invoke
# a Python callback once per reviewer. Rows keep their original order.
reviews_per_user = df.groupby('reviewerID')['reviewerID'].transform('size')
df = df.loc[(reviews_per_user >= 5).to_numpy()]

In [22]:
# Drop the raw review text. Rebinding with errors='ignore' (instead of an
# in-place drop) lets this cell be re-run without raising KeyError once the
# column is already gone.
df = df.drop(columns=['reviewText'], errors='ignore')

In [6]:
# Persist the final frame as CSV, without writing the index as a column.
output_path = '../dataset/dataset.csv'
df.to_csv(output_path, index=False)