<a href="https://colab.research.google.com/github/samancha/nlp-master/blob/main/mod5/NLP_mod5_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data loading and preprocessing.

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
! pip install transformers

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report
# Download the stopwords and tokenizer from nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

seed_val = 42
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
def preprocess_text(text):
    # Convert text to lowercase
    words = word_tokenize(text)

    # Convert words to lowercase
    words = [word.lower() for word in words]

    # Remove punctuation from words
    words = [word for word in words if word.isalnum()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Remove duplicate words
    unique_words = list(dict.fromkeys(words))

    # Join the words back into a string
    text = ' '.join(unique_words)

    return text


In [6]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/archive.zip', )
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df['processed_review'] = df['review'].apply(preprocess_text)
df.reset_index(drop=True)
display(df.head())

Unnamed: 0,review,sentiment,label,processed_review
0,One of the other reviewers has mentioned that ...,positive,1,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br filming techniq...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...


# Text tokenization and conversion to BERT input features.

In [7]:
# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

REF: https://huggingface.co/docs/transformers/pad_truncation

In [9]:
inputs = df.processed_review.values
labels = df.label.values
print("Train data size ", len(inputs))
print(' Original: ', inputs[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(inputs[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(inputs[0])))

Train data size  50000
 Original:  one reviewers mentioned watching 1 oz episode hooked right exactly happened br first thing struck brutality unflinching scenes violence set word go trust show faint hearted timid pulls punches regards drugs sex hardcore classic use called nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far would say main appeal due fact goes shows dare forget pretty pictures painted mainstream audiences charm romance mess around ever saw nasty surreal could ready watched developed taste got accustomed levels graphic injustice crooked guards sold nickel inmates kill order get away well mannered middle class turned bitches lack street skills experience may become comfortable uncomfortable viewing thats touch darker side
Tokeni

In [10]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in inputs:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        max_length = 64,           # Pad & truncate all sentences.
                        padding='max_length',
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

print('Original: ', inputs[0])
print('Token IDs:', input_ids[0])
print('Tokenized:', tokenizer.decode(input_ids[0][0]))
print('Attention_mask', attention_masks[0])

Original:  one reviewers mentioned watching 1 oz episode hooked right exactly happened br first thing struck brutality unflinching scenes violence set word go trust show faint hearted timid pulls punches regards drugs sex hardcore classic use called nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far would say main appeal due fact goes shows dare forget pretty pictures painted mainstream audiences charm romance mess around ever saw nasty surreal could ready watched developed taste got accustomed levels graphic injustice crooked guards sold nickel inmates kill order get away well mannered middle class turned bitches lack street skills experience may become comfortable uncomfortable viewing thats touch darker side
Token IDs: tensor([[  101,  20

In [12]:
df_input_ids = torch.cat(input_ids, dim=0)
df_attention_masks = torch.cat(attention_masks, dim=0)
df_labels = torch.tensor(labels)

# Bert Input Features
print(type(df_input_ids))
print(type(df_attention_masks))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


  labels = torch.tensor(labels)


<class 'list'>
<class 'torch.Tensor'>
Original:  one reviewers mentioned watching 1 oz episode hooked right exactly happened br first thing struck brutality unflinching scenes violence set word go trust show faint hearted timid pulls punches regards drugs sex hardcore classic use called nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far would say main appeal due fact goes shows dare forget pretty pictures painted mainstream audiences charm romance mess around ever saw nasty surreal could ready watched developed taste got accustomed levels graphic injustice crooked guards sold nickel inmates kill order get away well mannered middle class turned bitches lack street skills experience may become comfortable uncomfortable viewing thats touch dark

# Model definition, training, and evaluation.

#### Sequence Classifacation

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# Tell pytorch to run this model on the GPU.
model.cuda()

### Fine-tune the BERT model on the preprocessed IMDb dataset for sentiment analysis.

### Implement training loops and loss calculation.

In [None]:
batch_size = 64
epochs = 2
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()

for epoch in range(epochs):
    for i in range(0, df_input_ids.size(0), batch_size):
        batch_input_ids = df_input_ids[i:i+batch_size]
        batch_attention_masks = df_attention_masks[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]

        optimizer.zero_grad()

        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


### Evaluation

testing set using accuracy, precision, recall, and F1-score metrics

# Sample movie review predictions and explanations.


In [None]:
# Perform inference
with torch.no_grad():
    outputs = model(df_input_ids[0], attention_mask=df_attention_mask[0]).logits

# Get predicted label
predicted_label = torch.argmax(outputs, dim=1).item()

# Define label names
label_names = [1, 0]

# Print result
print("Text:", input_ids[0])
print("Predicted Label:", label_names[predicted_label])

for text, label in zip(test_texts, predicted_labels):
    print(f'Text: {text}\nPredicted Label: {label.item()}\n')