### Question 2

In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
# from scrapy.selector import Selector
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
import time
# from tqdm import tqdm
from sklearn.model_selection import train_test_split
import warnings
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
# from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
warnings.filterwarnings("ignore")

In [None]:
# Scrape the IMDb user reviews for title tt1375666, label them from their
# star rating, and save a balanced 1000-review sample to data_q2.csv.
#
# The scraping dependencies are imported in this cell (they are commented out
# in the notebook's import cell) so that the cell runs on its own, while the
# rest of the notebook can still start from the saved CSV on a machine that
# has no Selenium / Scrapy installed.
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from selenium.webdriver.common.by import By
from tqdm import tqdm

# Create instance
# NOTE(review): recent Selenium 4 releases appear to no longer accept the
# driver path as a positional argument — confirm against the installed version.
driver = webdriver.Chrome('chromedriver.exe')

url = 'https://www.imdb.com/title/tt1375666/reviews?ref_=tt_urv'

rating_list = []
review_list = []

try:
    driver.get(url)

    # Extract all reviews: keep clicking the "load more" button until it is
    # gone or can no longer be interacted with.
    while True:
        try:
            load_more_button = driver.find_element(By.ID, 'load-more-trigger')
            driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
            time.sleep(1)
            load_more_button.click()
            time.sleep(1)
        except (NoSuchElementException, ElementNotInteractableException):
            break

    reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')

    for d in tqdm(reviews):
        sel2 = Selector(text=d.get_attribute('innerHTML'))
        # extract_first() returns the default instead of raising when the
        # selector matches nothing, so no try/except is needed here.
        rating = sel2.css('.rating-other-user-rating span::text').extract_first(default=np.nan)
        review = sel2.css('.text.show-more__control::text').extract_first(default=np.nan)

        rating_list.append(rating)
        review_list.append(review)
finally:
    # Always end the browser session, even if scraping failed part-way.
    driver.quit()


data = pd.DataFrame({'Rating': rating_list, 'Review': review_list})

# Derive the sentiment label from the rating BEFORE sampling by label.
# Reviews without a rating or without text cannot be labelled or used for
# training, so they are dropped rather than silently counted as 'Positive'.
data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce')
data = data.dropna(subset=['Rating', 'Review']).reset_index(drop=True)
data['Label'] = np.where(data['Rating'] <= 5, 'Negative', 'Positive')

# Keeping only 500 positive and 500 negative reviews
# (.sample raises ValueError if a class has fewer than 500 reviews).
positive_reviews = data[data['Label'] == 'Positive']
negative_reviews = data[data['Label'] == 'Negative']

positive_reviews_sample = positive_reviews.sample(n=500, random_state=42)
negative_reviews_sample = negative_reviews.sample(n=500, random_state=42)

# Concatenate the two classes and shuffle so they are interleaved.
sampled_reviews = pd.concat([positive_reviews_sample, negative_reviews_sample])
sampled_reviews = sampled_reviews.sample(frac=1, random_state=42).reset_index(drop=True)

# Final data
data = sampled_reviews

# Saving
data.to_csv('data_q2.csv', index=False)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the saved review sample (data_q2.csv) from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/data_q2.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Rating,Review,Label
0,3.0,This is one of those movies who make things un...,Negative
1,1.0,"...then perception becomes reality, I guess.",Negative
2,3.0,Went to see this film recently and was very di...,Negative
3,1.0,"Total waste of time, as simple as that and not...",Negative
4,10.0,"A smart and inventive thriller, this drama is ...",Positive


In [None]:
# Creating label with the help of rating
#
# Ratings are coerced to numbers (unparseable values become NaN). Rows whose
# rating is missing are dropped: NaN fails the `<= 5` comparison, so they would
# otherwise be labelled 'Positive' without any evidence. Ratings of 5 or lower
# are 'Negative', everything above is 'Positive'.
data = (
    data
    .assign(Rating=lambda frame: pd.to_numeric(frame['Rating'], errors='coerce'))
    .dropna(subset=['Rating'])
    .assign(Label=lambda frame: np.where(frame['Rating'] <= 5, 'Negative', 'Positive'))
    .reset_index(drop=True)
)

In [None]:
# Number of reviews per sentiment label.
data['Label'].value_counts()

Negative    501
Positive    499
Name: Label, dtype: int64

In [None]:
# (rows, columns) of the labelled frame.
data.shape

(1000, 3)

In [None]:
# Preview the first rows again now that `Label` has been derived from `Rating`.
data.head()

Unnamed: 0,Rating,Review,Label
0,3.0,This is one of those movies who make things un...,Negative
1,1.0,"...then perception becomes reality, I guess.",Negative
2,3.0,Went to see this film recently and was very di...,Negative
3,1.0,"Total waste of time, as simple as that and not...",Negative
4,10.0,"A smart and inventive thriller, this drama is ...",Positive


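First we scraped data from IMDb, which gave us the reviews and ratings of 4707 users. We then drew a random sample of 1000 reviews — 500 positive and 500 negative — to create a balanced dataset. (After the labels are re-derived from the ratings above, with ratings of 5 or lower counted as negative, the counts are 501 negative and 499 positive, so the dataset is balanced to within one review.)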

<b>Part B:</b><br><br>
Split the dataset into training (at least 160 examples) and test (at least 40 examples) sets.
<br><br>
<b> Solution: </b>


In [None]:
# Check the class balance before splitting.
data['Label'].value_counts()

Negative    501
Positive    499
Name: Label, dtype: int64

In [None]:
# Keep only the text and its label; 'Rating' is not needed once the label has
# been derived from it.
dataset = data.loc[:, ['Review', 'Label']]

In [None]:
# All reviews labelled 'Positive'.
positive_data = dataset.loc[dataset['Label'].eq('Positive')]

In [None]:
# All reviews labelled 'Negative'.
negative_data = dataset.loc[dataset['Label'].eq('Negative')]

In [None]:
# Stack the two classes, then shuffle the rows (fixed seed) and renumber the
# index so the classes are interleaved.
dataset_q2 = (
    pd.concat([positive_data, negative_data], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) of the shuffled two-column dataset.
dataset_q2.shape

(1000, 2)

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(
    dataset_q2,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the training split.
train.shape

(800, 2)

In [None]:
# Encoding
# Fit the encoder on the training labels only and reuse that same mapping for
# the test labels. Re-fitting on the test split could produce a different
# label -> id mapping (e.g. if a class were missing from it).
# `.assign` builds new frames instead of writing into the slices returned by
# train_test_split, which avoids pandas' SettingWithCopy pitfall.
label_encoder = LabelEncoder()
train = train.assign(Label=label_encoder.fit_transform(train['Label']))
test = test.assign(Label=label_encoder.transform(test['Label']))

<b> Part C:</b><br><br>
Fine-tune a pretrained language model capable of generating text (e.g., GPT) that you can take from the Hugging Face Transformers library with the dataset you created (I suggest using this tutorial: https://huggingface.co/docs/transformers/training). Report the test accuracy. Discuss what could be done to improve accuracy.

<br>
<b>Solution:</b>

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize the training reviews straight into PyTorch tensors (no NumPy round
# trip). Reviews longer than the model's maximum length are truncated and the
# rest are padded to a common length.
tokenized_data = tokenizer(train["Review"].tolist(), truncation=True, return_tensors="pt", padding=True)

input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']

# Ensure labels are a PyTorch tensor; CrossEntropyLoss needs int64 class ids,
# so the dtype is pinned instead of relying on the platform's default integer.
labels = torch.tensor(train["Label"].values, dtype=torch.long)

# Print types to confirm
print("Input IDs type:", type(input_ids))
print("Attention Mask type:", type(attention_mask))
print("Labels type:", type(labels))

# Create the TensorDataset
dataset = TensorDataset(input_ids, attention_mask, labels)

# Create a DataLoader
batch_size = 16  # or any batch size suitable for your setup
tokenized_data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


Input IDs type: <class 'torch.Tensor'>
Attention Mask type: <class 'torch.Tensor'>
Labels type: <class 'torch.Tensor'>


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  # transformers' own AdamW is deprecated
import torch

# Fine-tuning attempt with the explicit Bert* classes.
# The inputs are defined here so the cell does not depend on names that were
# never created in the notebook.
texts = train["Review"].tolist()
labels = torch.tensor(train["Label"].values, dtype=torch.long)
num_labels = 2  # binary sentiment: Negative / Positive

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize your data
tokenized_data = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Set up optimizer. No separate loss function is needed: the model returns the
# cross-entropy loss itself when it is called with `labels=`.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tune the model
num_epochs = 3
mini_batch_size = 16  # pushing the whole training set through BERT at once would exhaust memory

model.train()
for epoch in range(num_epochs):
    # Visit the examples in a new random order every epoch.
    order = torch.randperm(len(labels))
    for start in range(0, len(labels), mini_batch_size):
        batch_idx = order[start:start + mini_batch_size]
        batch_inputs = {name: tensor[batch_idx].to(device) for name, tensor in tokenized_data.items()}
        batch_labels = labels[batch_idx].to(device)

        optimizer.zero_grad()
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save or use the fine-tuned model for predictions


In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Wrap the already-tokenized training tensors in a DataLoader.
# Uses the `input_ids`, `attention_mask`, `labels` and `batch_size` created in
# the tokenization cell above (the previous placeholder names did not exist).
dataset = TensorDataset(input_ids, attention_mask, labels)
tokenized_data = DataLoader(dataset, batch_size=batch_size)


In [None]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW  # transformers' own AdamW is deprecated
import torch

# 1. Model Setup
# Two output classes (negative / positive); run on the GPU when one is available.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2. Define Loss Function and Optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=3e-5)

# 3. Training Loop
num_epochs = 3  # Set the number of epochs

for epoch in range(num_epochs):
    model.train()
    for batch in tokenized_data_loader:
        # Move batch to device
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Forward pass: the loss is computed here from the raw logits because
        # no labels are passed to the model.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_function(outputs.logits, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 4. Validation Phase (if you have a validation dataset)
    # model.eval()
    # ... validation steps ...

# 5. Testing and Evaluation
model.eval()
# ... testing steps ...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Switch the model to evaluation mode; as the cell's last expression, the
# returned module is also displayed.
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

#### Fine tuning the model

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize the data
# Capping the sequence length keeps activation memory low enough to train with
# a real mini-batch instead of one example at a time; longer reviews are
# truncated to their first `max_length` tokens.
max_length = 256
tokenized_data = tokenizer(
    train["Review"].tolist(),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="pt",
)

# Prepare labels (int64 class ids, as required by the cross-entropy loss)
labels = torch.tensor(train["Label"].values, dtype=torch.long)

# Create a TensorDataset
dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels)

# Create a DataLoader
# batch_size=1 makes every optimizer step follow a single review, which is very
# noisy for fine-tuning; 16 averages the gradient over a mini-batch.
batch_size = 16  # Adjust as needed
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load a fresh bert-base-cased checkpoint with a two-class classification head.
# This rebinds `model`, replacing whatever model object the name held before.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use PyTorch's AdamW; the AdamW class shipped with transformers is deprecated
# and no longer importable in recent releases. Note that torch's default
# weight_decay is 0.01.
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# 2e-5 is at the conservative end of the learning rates commonly used for
# fine-tuning BERT and is less likely to collapse to a single predicted class.
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_function = CrossEntropyLoss()


In [None]:
import torch

# Ask PyTorch to release the GPU memory it is caching but not currently using,
# before the training run starts.
torch.cuda.empty_cache()


In [None]:
# Fine-tune `model` on `train_dataloader`, reporting the mean loss per epoch.

# Optionally set the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3  # Set the number of epochs
max_grad_norm = 1.0  # clip gradients to this norm to keep updates stable

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Unpack the batch
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: passing `labels` makes the model return the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Accumulate the loss
        total_loss += loss.item()

        # Backward pass, gradient clipping and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

    # Print average loss for the epoch
    print(f"Epoch {epoch} finished, Average Loss: {total_loss / len(train_dataloader)}")


Epoch 0 finished, Average Loss: 0.7242726033367216
Epoch 1 finished, Average Loss: 0.7216562905907631
Epoch 2 finished, Average Loss: 0.7208946232497692


In [None]:
# Build the evaluation DataLoader from the held-out split.

# Raw review texts -> padded / truncated PyTorch tensors
test_texts = test["Review"].tolist()
tokenized_test_data = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

# Integer-encoded ground-truth labels
test_labels = torch.tensor(test["Label"].values)

# Bundle ids, masks and labels so they are batched together
test_dataset = TensorDataset(
    tokenized_test_data['input_ids'],
    tokenized_test_data['attention_mask'],
    test_labels,
)

# No shuffling: batches follow the row order of `test`
test_dataloader = DataLoader(test_dataset, batch_size=16)  # Adjust the batch size as needed


In [None]:
# Score the fine-tuned model on the test split and report its accuracy.
model.eval()

# Per-example logits and ground-truth labels, collected across batches
predictions, true_labels = [], []

with torch.no_grad():  # inference only, no gradients needed
    for batch in test_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Forward pass without labels: only the logits are needed
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Bring the results back to the CPU as NumPy arrays
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        predictions.extend(logits)
        true_labels.extend(label_ids.flatten())

# The predicted class is the index of the largest logit
predicted_classes = np.argmax(predictions, axis=1)

# Accuracy = fraction of examples whose predicted class matches the label
accuracy = np.mean(predicted_classes == np.array(true_labels))
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.5


In [None]:
# Show the first few test reviews with their predicted and true labels, and
# count how many of them were classified correctly.

# Display names indexed by class id. LabelEncoder numbers classes in sorted
# order, so 0 = Negative and 1 = Positive.
class_names = ['negative', 'positive']

# The counters are initialised here so the cell does not rely on leftover
# kernel state and gives the same totals every time it is run.
correct = 0
incorrect = 0

# Flatten the list of arrays into a single array
true_labels_flat = np.concatenate([np.array(label_batch).flatten() for label_batch in true_labels])

# Now, iterate over the predictions and actual labels
for i in range(len(predicted_classes[:5])):  # Adjust the range as needed
    predicted_label_name = class_names[predicted_classes[i]]
    ground_truth_label_name = class_names[true_labels_flat[i]]

    # The test DataLoader is not shuffled, so position i lines up with row i of `test`.
    if predicted_classes[i] == true_labels_flat[i]:
        correct += 1
        print("\nCorrectly classified review: ", test["Review"].iloc[i])
    else:
        incorrect += 1
        print("\nIncorrectly classified review: ", test["Review"].iloc[i])
    print("Predicted label: ", predicted_label_name)
    print("Ground truth label: ", ground_truth_label_name)

print(f"\nNumber of correctly classified examples: {correct}")
print(f"Number of incorrectly classified examples: {incorrect}")



Correctly classified review:  First the good news. The movie contains decent action sequences. The acting is OK, some exceptions left aside.
Predicted label:  negative
Ground truth label:  negative

Incorrectly classified review:  Great intense film, another Christopher Nolan classic
Predicted label:  negative
Ground truth label:  positive

Correctly classified review:  This movie concept is highly copied Paprika anime and doesn't give him credits. COPYCAT NOLAN WORSE DIRECTOR
Predicted label:  negative
Ground truth label:  negative

Incorrectly classified review:  As someone who loves film, this is an outstanding movie. One of the best ever made. 100%
Predicted label:  negative
Ground truth label:  positive

Correctly classified review:  Terrible, overhyped film that people think makes them smart for saying they liked it. A complete waste of time. Nolan does it again. Plus, it's a ripoff of "Dreamscape".
Predicted label:  negative
Ground truth label:  negative

Number of correctly cl

<b>Discuss what could be done to improve accuracy.</b>

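The test accuracy of 0.5 equals chance level on this balanced dataset, and the average training loss (about 0.72) never drops below ln 2 ≈ 0.693, the loss of a coin flip. In the examples shown above the model predicts "negative" every time, so it has not yet learned to separate the two classes. The following could improve accuracy:

1. Increasing the training data, e.g. using more of the scraped reviews instead of only 1000.
2. Hyperparameter tuning: a batch size well above 1 (e.g. 16), a learning rate in the 2e-5 to 5e-5 range with warm-up and decay, gradient clipping, and possibly more epochs.
3. Using ensemble learning techniques, e.g. averaging the predictions of several fine-tuned models.
4. Model fine-tuning: monitoring a validation set during training and keeping the best checkpoint instead of the last one.
5. Applying a different architecture, e.g. another pretrained model such as the generative model (GPT) suggested in the task.

There can be several other ways as well.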