In [38]:
import random
import re
import ssl

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Make urllib-based downloads use an HTTPS context that does not verify
# certificates, when this Python build exposes such a factory.
# NOTE(review): this replaces the process-wide default HTTPS context, so every
# later HTTPS request made through it is unverified too - confirm that is
# acceptable outside of local experimentation.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Request the NLTK "stopwords" package; the captured output shows NLTK
# reporting it as already up-to-date when it is present locally.
nltk.download('stopwords')
# Corpus reader used when filtering stopwords out of the review text.
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.optim import Adam

# One seed for every random number generator this notebook uses.
# `random` must be imported at the top of the notebook; previously it was
# referenced here without an import and raised NameError on a fresh kernel.
SEED = 42

random.seed(SEED)        # Python standard-library RNG
np.random.seed(SEED)     # NumPy global RNG
torch.manual_seed(SEED)  # PyTorch default generator

[nltk_data] Downloading package stopwords to /Users/jaden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# Read the train and test JSON files into DataFrames.
train_df, test_df = (
    pd.read_json(json_path)
    for json_path in ("../instructions/train.json", "../instructions/test.json")
)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


In [40]:
# clean data
# Patterns are compiled once at definition time instead of on every call.
_URL_RE = re.compile(r'https?://\S+|\bwww\.\S+', flags=re.IGNORECASE)
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')

# Lazily filled cache so the NLTK corpus is read once, not once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Tokenise a raw review into lowercase alphabetic words without stopwords.

    Processing order (the order matters):
      1. Drop URLs and e-mail addresses while their punctuation is still
         intact. Doing this after punctuation has been replaced by spaces
         would only remove the literal "http" token and leave the rest of
         the URL in the text.
      2. Replace '@' with the word "at" (space-padded so neighbouring words
         are not glued together) and delete '#'.
      3. Lowercase and turn every character that is not a-z or whitespace
         into a space.
      4. Split on whitespace and drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text. Empty strings and non-string values (None, NaN, ...)
        yield an empty list.
    stop_words : container of str, optional
        Words to exclude, checked with ``in``. Defaults to NLTK's English
        stopword list, loaded once and cached.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    if not isinstance(text, str) or not text:
        return []

    excluded = _english_stopwords() if stop_words is None else stop_words

    # Strip URLs and e-mail addresses before any punctuation is touched.
    text = _URL_RE.sub(' ', text)
    text = _EMAIL_RE.sub(' ', text)

    # Spell out '@' as a separate word and drop hashtag markers.
    text = text.replace('@', ' at ').replace('#', '')

    # Keep only lowercase letters; everything else becomes a separator.
    text = _NON_ALPHA_RE.sub(' ', text.lower())

    # str.split() with no argument already collapses runs of whitespace.
    return [word for word in text.split() if word not in excluded]

In [41]:
# Tag each frame with its origin so the rows can be separated again later.
train_df['flag'], test_df['flag'] = 'train', 'test'

# Stack both frames under a fresh index, then replace the raw review text
# with the token list produced by clean_text.
combined_df = pd.concat((train_df, test_df), axis=0, ignore_index=True)
combined_df['reviews'] = combined_df['reviews'].apply(clean_text)

In [None]:
# Token lists belonging to the rows flagged as training data.
train_reviews = combined_df.loc[combined_df['flag'] == 'train', 'reviews']

# Every distinct token that occurs in the training reviews.
unique_words = {token for tokens in train_reviews for token in tokens}

# Size of the training vocabulary.
len(unique_words)

13528

In [46]:
# Recover the cleaned train / test partitions from the combined frame.
train_df = combined_df[combined_df['flag'] == 'train'].reset_index(drop=True)
test_df = combined_df[combined_df['flag'] == 'test'].reset_index(drop=True)

# encode train and test data using TfidfVectorizer
# The reviews are already lists of tokens, so the tokenizer and preprocessor
# are identity functions. token_pattern=None tells scikit-learn the default
# regex is intentionally unused, which avoids its "token_pattern will not be
# used since tokenizer is not None" warning.
vectorizer = TfidfVectorizer(
    max_features=5000,
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
)
# NOTE(review): the vocabulary and IDF weights are fitted on all training
# rows, including the ones held out for validation below - confirm that this
# is intended.
X_train = vectorizer.fit_transform(train_df['reviews'])
X_test = vectorizer.transform(test_df['reviews'])

# split the train data into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, train_df['sentiments'], test_size=0.2, random_state=42
)

print(f"Train set size: {X_train_split.shape[0]}")
print(f"Validation set size: {X_val_split.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Train set size: 5920
Validation set size: 1481
Test set size: 1851




In [None]:
# convert data to PyTorch tensors


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 167652 stored elements and shape (7401, 5000)>