# Imports

In [12]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
import numpy as np
import torch
import kagglehub
from sklearn.model_selection import train_test_split


# Download the Amazon Review Data from Kaggle


In [6]:
# Load the amazon review data from Kaggle
path = kagglehub.dataset_download("kritanjalijain/amazon-reviews")

print("The path to the downloaded data is the following:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kritanjalijain/amazon-reviews?dataset_version_number=2...


100%|██████████| 1.29G/1.29G [02:26<00:00, 9.47MB/s]

Extracting files...





The path to the downloaded data is the following: /Users/sandragedig/.cache/kagglehub/datasets/kritanjalijain/amazon-reviews/versions/2


# Load the Data 

In [29]:
# load the trainingsdata
train_df = pd.read_csv('/Users/sandragedig/.cache/kagglehub/datasets/kritanjalijain/amazon-reviews/versions/2/amazon_review_polarity_csv/train.csv', header=None)

# load the testdata
test_df = pd.read_csv('/Users/sandragedig/.cache/kagglehub/datasets/kritanjalijain/amazon-reviews/versions/2/amazon_review_polarity_csv/test.csv', header = None)

In [27]:
# load readme 
readme = '/Users/sandragedig/.cache/kagglehub/datasets/kritanjalijain/amazon-reviews/versions/2/amazon_review_polarity_csv/readme.txt'

# Open and read the file
with open(readme, 'r') as file:
    for line in file:
        print(line.strip())

Amazon Review Polaridy Dataset

Version 3, Updated 09/09/2015

ORIGIN

The Amazon reviews dataset consists of reviews from amazon. The data span a period of 18 years, including ~35 million reviews up to March 2013. Reviews include product and user information, ratings, and a plaintext review. For more information, please refer to the following paper: J. McAuley and J. Leskovec. Hidden factors and hidden topics: understanding rating dimensions with review text. RecSys, 2013.

The Amazon reviews polarity dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the above dataset. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).


DESCRIPTION

The Amazon reviews polarity dataset is constructed by taking review score 1 and 2 as negative, and 4 and 5 as positive. Samples of score 3 is ignored. In th

# EDA

In [15]:
print("Train Set Length:", train_df.shape[0])
print("Test Set Length:", test_df.shape[0])

Train Set Length: 3599999
Test Set Length: 399999


In [30]:
train_df.head()

Unnamed: 0,0,1,2
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [31]:
test_df.head()

Unnamed: 0,0,1,2
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [36]:
# Since the columns have no names, i will rename them with index, polarity, title and rating:

# Rename the columns in train_df
train_df.columns = ['polarity', 'title', 'review']  # Renaming the columns

# Rename the columns in test_df
test_df.columns = ['polarity', 'title', 'review']  # Renaming the columns

In [38]:
train_df['polarity'].unique()
test_df['polarity'].unique()

array([2, 1])

Since the customer reviews are split into two columns: the title and the review, it makes sense to merge the two and then process them together. 
provides more context and enables a more precise sentiment analysis, as both parts reflect the complete customer opinion.

In [41]:
# Combine the title and the review text in one column
train_df['combined_review'] = train_df['title'] + ": " + train_df['review']
test_df['combined_review'] = train_df['title'] + ": " + train_df['review']

# Drop the title and the review columns
train_df = train_df.drop(columns=['title', 'review'])
test_df = test_df.drop(columns=['title', 'review'])

In [42]:
train_df.head(2)

Unnamed: 0,polarity,combined_review
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...


In [43]:
# Limiting the dataset to 100,000 samples to ensure faster processing and efficient model training while maintaining a sufficiently large and representative sample.
# Randomly selecting 100,000 rows for both training and test datasets to ensure a representative sample.
# This prevents any biases that might result from the original order of the data, such as temporal patterns.
# A validation set is then created by splitting the training set, ensuring better model evaluation during training.

# Randomly sample 100,000 rows from the training and test datasets
train_df = train_df.sample(n=100000, random_state=42)
test_df = test_df.sample(n=100000, random_state=42)

# Creating a validation set from the randomly selected training set
train_df, validation_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Checking the lengths of the datasets
print("Train Set Length:", len(train_df))
print("Validation Set Length:", len(validation_df))
print("Test Set Length:", len(test_df))

Train Set Length: 80000
Validation Set Length: 20000
Test Set Length: 100000
