In [None]:
# Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Define Project Folder
FOLDERNAME = 'Colab Notebooks/Group_A_Project'
%cd drive/MyDrive/$FOLDERNAME



Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Group_A_Project


In [None]:
# Import libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import math

In [None]:
# Select the compute device: the first CUDA GPU when one is visible, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Reproducibility: seed PyTorch's RNG, then pin cuDNN to deterministic kernels
# and turn off its auto-tuner.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Load the review dataset. The relative path is resolved against the current
# working directory, i.e. the project folder selected by the %cd in the first cell.
raw_data = pd.read_csv('Amazon_Unlocked_Mobile.csv')

In [None]:
# Review text as strings (astype(str) also turns missing values into text).
reviews = raw_data['Reviews'].astype(str)
# Collapse the rating values 1-5 into three class ids: {1, 2} -> 0, {3} -> 1, {4, 5} -> 2.
# replace() without inplace returns a new Series, so raw_data['Rating'] is never
# modified and re-running this cell cannot remap labels that were already remapped
# (which would otherwise push every label to class 0).
labels = raw_data['Rating'].replace({1: 0, 2: 0, 3: 1, 4: 2, 5: 2})

In [None]:
# Substrings stripped from the reviews: HTML line breaks, double dashes and
# common punctuation marks.
patterns = [
    '<br />', '--',
    '.', ',', '!', '?',
    ')', '(', ';', ':',
    '*', '~', '_', "'", '"',
]
# Every pattern is substituted by a single space.
replacements = [' ' for _ in patterns]

In [None]:
def preprocessing(reviews, patterns, replacements):
    """Lower-case each review and apply literal substring replacements.

    Args:
        reviews: Iterable of review strings (list, pandas Series, ...). Items are
            visited in iteration order, so the result stays aligned row-by-row
            with the input even when a Series has a non-default index.
        patterns: Literal substrings to search for (not regular expressions).
        replacements: Replacement strings, paired positionally with ``patterns``.

    Returns:
        list[str]: One cleaned string per input review, in input order.
    """
    # Pair the substitutions once instead of re-zipping for every review.
    substitutions = list(zip(patterns, replacements))

    cleaned_reviews = []
    # Iterate over the values themselves: indexing a Series with reviews[i] is a
    # label lookup, which raises KeyError or silently reorders rows whenever the
    # index is not 0..n-1 (e.g. after filtering or shuffling).
    for review in reviews:
        review = review.lower()
        # Replacements are applied sequentially, in the order given.
        for pattern, replacement in substitutions:
            review = review.replace(pattern, replacement)
        cleaned_reviews.append(review)
    return cleaned_reviews

In [None]:
# Lower-case every review and blank out the markup/punctuation listed in `patterns`.
cleaned_reviews = preprocessing(reviews, patterns, replacements)

In [None]:
# Build the vocabulary from the cleaned reviews, then encode each review as a
# sequence of vocabulary indices (tokenizer.word_index is used later to size the
# embedding table).
# NOTE(review): the tokenizer is fitted on every review, including the rows that
# later become the validation split — confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_reviews)
sequences = tokenizer.texts_to_sequences(cleaned_reviews)


In [None]:
# NOTE(review): this import lives outside the notebook's import cell; consider
# moving it there so all dependencies are visible in one place.
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed length of every encoded review. It must not exceed the `max_len` of the
# PositionalEncoding module, which slices its table to the input length.
sequence_len = 5000
# Bring every sequence to sequence_len, padding at the end ('post').
# NOTE(review): every review is padded here, not only the rows used for
# training/validation, so the array has len(sequences) x 5000 entries — presumably
# several GB for this dataset; verify memory use.
padded_sequences = pad_sequences(sequences, maxlen=sequence_len, padding='post')

In [None]:
# Prepare labels
# Convert the label Series to a NumPy array. np.asarray is used instead of
# `.values` so the cell stays re-runnable: once `labels` is already an ndarray,
# `labels.values` raises AttributeError, whereas np.asarray returns it unchanged.
labels = np.asarray(labels)

In [None]:
# Train/Val split
# Number of rows taken for each split; the rows are sliced from the data in
# file order further below.
# NOTE(review): the split is positional, not shuffled or stratified — if the CSV
# is ordered (e.g. by product), the two splits may have different class balances.
train_size = 100000
val_size = 10000

In [None]:
# Training split: the first `train_size` rows of the inputs and their labels.
train_data, train_labels = padded_sequences[:train_size], labels[:train_size]
# Validation split: the `val_size` rows that immediately follow the training rows.
val_data, val_labels = (
    padded_sequences[train_size:train_size + val_size],
    labels[train_size:train_size + val_size],
)

In [None]:
# Convert to tensors
# All four arrays become int64 (torch.long) tensors: inputs are indices for the
# embedding lookup and labels are class ids for the cross-entropy loss.
train_data_tensor, train_labels_tensor, val_data_tensor, val_labels_tensor = (
    torch.tensor(array, dtype=torch.long)
    for array in (train_data, train_labels, val_data, val_labels)
)


In [None]:
# Dataloaders
batch_size = 32

# Training split: batches are reshuffled every epoch.
train_dataset = TensorDataset(train_data_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Validation split: fixed order.
val_dataset = TensorDataset(val_data_tensor, val_labels_tensor)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Define Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    Args:
        embedding_dim: Size of each token embedding (last dimension of the input).
        dropout: Dropout probability applied after the encoding has been added.
        max_len: Number of positions for which encodings are precomputed; inputs
            must not be longer than this.
    """

    def __init__(self, embedding_dim, dropout=0.1, max_len=6000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd embedding_dim there is one cosine column fewer than sine
        # columns, so trim the frequencies to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(position * div_term[: embedding_dim // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, embedding_dim): broadcasts over the batch
        # Buffer: moves with .to(device) and is saved in the state dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, embedding_dim).

        Returns:
            Tensor of the same shape, after dropout.
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

# Define Transformer Model
class TransformerModel(nn.Module):
    """Transformer-encoder text classifier with masked mean pooling.

    Token ids of shape (batch, seq_len) are embedded, given positional encodings,
    passed through a stack of self-attention layers and averaged over the
    non-padding positions before a linear classification head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Embedding / model width; must be divisible by ``num_heads``.
        hidden_dim: Width of the feed-forward sub-layer inside each encoder layer.
        output_dim: Number of output classes (logits).
        num_heads: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used by the positional encoding and encoder layers.
        padding_idx: Token id used for padding (default 0). Padding positions get
            a zero embedding, are hidden from attention and are excluded from the
            pooled average. Pass ``None`` to treat every position as a real token.

    Note:
        Self-attention cost grows with seq_len ** 2 per sample, so very long
        padded sequences are expensive in memory and time.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_heads, num_layers, dropout, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout)
        # batch_first=True is required: the embedding and PositionalEncoding
        # produce (batch, seq, dim). With the sequence-first default the encoder
        # would read dim 0 as the sequence and attend across the samples of a
        # batch instead of across the tokens of each review.
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, num_heads, hidden_dim, dropout, batch_first=True)
        # The nested-tensor path is disabled so the output always keeps the
        # padded (batch, seq, dim) shape that the masked pooling below relies on.
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers, enable_nested_tensor=False)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Long tensor of token ids with shape (batch, seq_len).

        Returns:
            Float tensor of logits with shape (batch, output_dim).
        """
        if self.padding_idx is None:
            pad_mask = None
        else:
            pad_mask = x == self.padding_idx  # (batch, seq), True at padding
            # A row made only of padding would leave attention without any valid
            # key and produce NaNs; keep its first position visible instead.
            all_pad = pad_mask.all(dim=1)
            pad_mask[:, 0] = pad_mask[:, 0] & ~all_pad

        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoder(x)
        x = self.transformer(x, src_key_padding_mask=pad_mask)

        if pad_mask is None:
            pooled = x.mean(dim=1)  # global average pooling
        else:
            # Average over real tokens only, so padding does not dilute the
            # representation of short reviews.
            hidden = x.masked_fill(pad_mask.unsqueeze(-1), 0.0)
            token_count = (~pad_mask).sum(dim=1, keepdim=True).to(hidden.dtype)
            pooled = hidden.sum(dim=1) / token_count
        return self.fc(pooled)




In [None]:
# Model hyperparameters
embedding_dim, hidden_dim = 128, 256   # embedding width / feed-forward width
num_heads, num_layers = 8, 2           # attention heads per layer / stacked encoder layers
output_dim = 3                         # one logit per label class (0, 1, 2)
dropout = 0.5

# One embedding row per tokenizer word plus one extra row for index 0,
# the value the padded positions carry.
vocab_size = len(tokenizer.word_index) + 1

# Build the classifier and place it on the selected device in one step.
model = TransformerModel(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    num_heads,
    num_layers,
    dropout,
).to(device)



In [None]:
# Loss and optimizer
# Cross-entropy between the model's class logits and the integer class labels.
# NOTE(review): no class weights are set; the logged accuracy hovers around a
# constant value, which may indicate majority-class predictions — verify the
# label distribution.
loss_function = nn.CrossEntropyLoss()
# Adam over all model parameters with a fixed learning rate of 1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training function
def train(model, train_loader, val_loader, device, loss_function, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of (data, target) training batches; must expose
            ``__len__`` and a ``dataset`` attribute with ``__len__``.
        val_loader: Loader handed to ``evaluate`` after every epoch.
        device: Device the batches are moved to before the forward pass.
        loss_function: Callable ``(output, target) -> scalar loss tensor``.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.

    Prints the mean batch loss and the training accuracy of every epoch, followed
    by the validation accuracy reported by ``evaluate``.
    """
    for epoch in range(num_epochs):
        # evaluate() switches the model to eval mode at the end of every epoch,
        # so training mode (dropout active) has to be re-enabled per epoch,
        # not just once before the loop.
        model.train()
        total_loss = 0
        correct = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # Count predictions whose arg-max class matches the target.
            correct += (output.argmax(1) == target).sum().item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}, Accuracy: {correct/len(train_loader.dataset)}')

        evaluate(model, val_loader, device)

# Evaluation function
def evaluate(model, val_loader, device):
    """Measure classification accuracy of ``model`` on ``val_loader``.

    Args:
        model: Module mapping a batch of inputs to class logits.
        val_loader: Iterable of (data, target) batches exposing a ``dataset``
            attribute with ``__len__``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Fraction of samples whose arg-max prediction equals the target.
        The same value is printed, so callers can log it, plot it or use it for
        early stopping instead of parsing the output.

    Note:
        The model is switched to eval mode and left there; callers that keep
        training afterwards must call ``model.train()`` themselves.
    """
    model.eval()
    correct = 0
    # No gradients are needed for scoring.
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            correct += (output.argmax(1) == target).sum().item()
    accuracy = correct / len(val_loader.dataset)
    print(f'Validation Accuracy: {accuracy}')
    return accuracy




In [None]:
# Train the model
# Each epoch prints the mean training loss and training accuracy, followed by
# the validation accuracy.
num_epochs = 10
train(model, train_loader, val_loader, device, loss_function, optimizer, num_epochs)

Epoch [1/10], Loss: 0.7507190199565887, Accuracy: 0.70581
Validation Accuracy: 0.6585
Epoch [2/10], Loss: 0.7443839913463592, Accuracy: 0.70574
Validation Accuracy: 0.6583
Epoch [3/10], Loss: 0.7381377372360229, Accuracy: 0.70571
Validation Accuracy: 0.6581
Epoch [4/10], Loss: 0.7288530111789704, Accuracy: 0.70555
Validation Accuracy: 0.6587
Epoch [5/10], Loss: 0.7157057019138336, Accuracy: 0.70726
Validation Accuracy: 0.6601
Epoch [6/10], Loss: 0.7023661456298829, Accuracy: 0.71259
Validation Accuracy: 0.6638
