# Classify the sentiments of IMDB reviews

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from gensim.models import word2vec
import gensim.downloader as api
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import GRU, Dense, Embedding, Dropout, Bidirectional, LSTM
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import warnings


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Data Loading

The dataset is from kaggle which is already divided into train, test and validation data set.

https://www.kaggle.com/datasets/columbine/imdb-dataset-sentiment-analysis-in-csv-format

In [None]:
train_data = pd.read_csv("/content/drive/MyDrive/NLP/Train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/NLP/Test.csv")
valid_data = pd.read_csv("/content/drive/MyDrive/NLP/Valid.csv")

### 2. Data Pre-processing

In [None]:
def preprocess_text(text):

    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)

    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords from tokenized text -- right now i am not removing stopwords as I dont want to remove words like "not" from reviews
    # stop_words = set(stopwords.words('english'))
    # words = [word for word in words if word not in stop_words]

    # Lemmatize the words from tokenized text
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Combine words back into a single string
    preprocessed_text = ' '.join(words)

    return preprocessed_text

In [None]:
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)
valid_data['processed_text'] = valid_data['text'].apply(preprocess_text)

### 3. Vectorization (using BoW, TF IDF , Word2Vec) with SVM

In [None]:
X_train = train_data['processed_text']
y_train = train_data['label']

X_test = test_data['processed_text']
y_test = test_data['label']

X_valid = valid_data['processed_text']
y_valid = valid_data['label']

BoW and TF-IDF with SVM

In [None]:
def train_svm_with_representations(train_data, test_data, representation):
    if representation == 'bow':
        vectorizer = CountVectorizer()
    elif representation == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Choose one representation from'bow' or 'tfidf'.")

    X_train = vectorizer.fit_transform(train_data)
    X_test = vectorizer.transform(test_data)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return y_pred

In [None]:
y_pred_bow = train_svm_with_representations(X_train, X_test, 'bow')
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print("Accuracy of BoW: ", accuracy_bow)

In [None]:
y_pred_tfidf = train_svm_with_representations(X_train, X_test, 'tfidf')
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy of TF-IDF: ", accuracy_tfidf)

Word2Vec with SVM

In [None]:
def get_word2vec_embeddings(data):
    tokenized_sentences = [sentence.split() for sentence in data]
    model = word2vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
    embeddings = np.array([np.mean([model.wv[word] for word in sentence], axis=0) for sentence in tokenized_sentences])

    return embeddings

def train_svm_with_word2vec(train_data, test_data):
    X_train = get_word2vec_embeddings(train_data)
    X_test = get_word2vec_embeddings(test_data)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return y_pred

Word2Vec (using Google News pretrained Word2Vec) with SVM

In [None]:
def get_google_word2vec_embeddings(data):
    # Load the Google News Word2Vec model
    model = api.load("word2vec-google-news-300")

    tokenized_sentences = [sentence.split() for sentence in data]
    embeddings = []

    for sentence in tokenized_sentences:
        sentence_embeddings = []
        for word in sentence:
            if word in model:
                sentence_embeddings.append(model[word])
        if sentence_embeddings:
            embeddings.append(np.mean(sentence_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(300))

    return np.array(embeddings)

def train_svm_with_google_word2vec(train_data, test_data):
    X_train = get_google_word2vec_embeddings(train_data)
    X_test = get_google_word2vec_embeddings(test_data)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return y_pred

In [None]:
# Google News Word2Vec
y_pred_google_word2vec = train_svm_with_google_word2vec(X_train, X_test)
accuracy_google_word2vec = accuracy_score(y_test, y_pred_google_word2vec)
print("Accuracy of word2Vec model:", accuracy_google_word2vec)

Accuracy of word2Vec model: 0.8612


In addition to using this pre trained model from Google News 300, we can also train our model ourselves by choosing most optimized hyper parameters like window, vector_size, workers, min_count, etc).

We can also use any other model other than SVM to see if the accuracy is better.

### 4. RNN based models (Vanilla RNN, LSTM, GRU, Bi-Directional LSTM)

Vanilla RNN (on processed text) - with early stopping

In [None]:
# Tokenize the data
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)


## tokenize
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_valid = tokenizer.texts_to_sequences(X_valid)

## padding
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)

# Create the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, to_categorical(y_train), validation_data=(X_valid, to_categorical(y_valid)),
                    epochs=25, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))

print("Vanila RNN's Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Vanila RNN's Test accuracy: 0.8479999899864197


The epoch stopped at 12 because I have included early stop.

Vanilla RNN (on un processed text) - with early stopping

In [None]:
# Tokenize the data
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['text'])


## tokenize
X_train = tokenizer.texts_to_sequences(train_data['text'])
X_test = tokenizer.texts_to_sequences(test_data['text'])
X_valid = tokenizer.texts_to_sequences(valid_data['text'])

## padding
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)

# Create the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, to_categorical(y_train), validation_data=(X_valid, to_categorical(y_valid)),
                    epochs=25, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))

print("Vanila RNN's Test accuracy for un processed text:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Vanila RNN's Test accuracy for un processed text: 0.8492000102996826


Vanilla RNN has vanishing gradient problem. So to remove this, we use LSTM. The accuracy of Vanilla RNN using un processed text might be more in most cases because the RNN will take care of pre process by ensuring not much information is lost.

LSTM (Long Short Term Memory)

In [None]:
# Tokenize the data
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['text'])

X_train = tokenizer.texts_to_sequences(train_data['text'])
X_test = tokenizer.texts_to_sequences(test_data['text'])
X_valid = tokenizer.texts_to_sequences(valid_data['text'])

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)

# Create the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, to_categorical(y_train), validation_data=(X_valid, to_categorical(y_valid)),
                    epochs=10, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))

print("LSTM test accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
LSTM test accuracy: 0.8528000116348267


Added drop out layer to avoid overfitting

GRU Model

In [None]:
# Tokenize the data
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['text'])

X_train = tokenizer.texts_to_sequences(train_data['text'])
X_test = tokenizer.texts_to_sequences(test_data['text'])
X_valid = tokenizer.texts_to_sequences(valid_data['text'])

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)

# Create the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(GRU(64))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, to_categorical(y_train), validation_data=(X_valid, to_categorical(y_valid)),
                    epochs=25, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))

print("GRU's Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
GRU's Test accuracy: 0.8633999824523926


Bi Directional LSTM

In [None]:
# Tokenize the data
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['text'])

X_train = tokenizer.texts_to_sequences(train_data['text'])
X_test = tokenizer.texts_to_sequences(test_data['text'])
X_valid = tokenizer.texts_to_sequences(valid_data['text'])

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_valid = to_categorical(y_valid)

# Build the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

print("Bi directional LSTM's Test accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Bi directional LSTM's Test accuracy: 0.8697999715805054


### 5. Tranformers (Pre-Trained DistilBERT and RoBERTa Models)

DistilBERT

In [None]:
import transformers
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import InputExample, InputFeatures

In [None]:

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(train_data["text"].tolist(), truncation=True, padding=True)




In [None]:
train_encodings[:2]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [None]:
test_encodings = tokenizer(test_data["text"].tolist(), truncation=True, padding=True)

In [None]:
# Convert the labels to TensorFlow datasets
train_dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_data["label"]
))
test_dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_data["label"]
))

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# # Compile the model
# model.compile(optimizer=Adam,
#               loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#               metrics=tf.metrics.SparseCategoricalAccuracy())

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
print(optimizer.name)

# Compile the model with optimizer and loss function
model.compile(optimizer=optimizer.name,
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Adam


In [None]:
num_train_examples = len(train_dataset_tf)
num_train_batches = num_train_examples // 16
num_train_batches


In [None]:
# Train the model
model.fit(train_dataset_tf.shuffle(100).batch(32), epochs=2, batch_size=32)

Epoch 1/2


ResourceExhaustedError: Graph execution error:

Detected at node gradient_tape/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._5/ffn/Gelu/mul_1/Mul defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-27-49d55cc36d6a>", line 2, in <cell line: 2>

  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1161, in fit

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1804, in fit

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1398, in train_function

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1381, in step_function

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1370, in run_step

  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1641, in train_step

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/optimizers/optimizer.py", line 543, in minimize

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/optimizers/optimizer.py", line 276, in compute_gradients

failed to allocate memory
	 [[{{node gradient_tape/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._5/ffn/Gelu/mul_1/Mul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_33765]

In [None]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Prepare dataset for DistilBERT
train_examples = []
for i, row in train_data.iterrows():
    train_examples.append(
        InputExample(guid=None,
                     text_a=row["text"],
                     text_b=None,
                     label=row["label"]))

In [None]:
# Convert to features
train_features = [InputFeatures(input_ids=tokenizer.encode(example.text_a, add_special_tokens=True),
                                 input_mask=tokenizer.encode(example.text_a, add_special_tokens=True,
                                                                padding='max_length', max_length=512,
                                                                truncation=True),
                                 segment_ids=tokenizer.encode(example.text_a, add_special_tokens=True,
                                                               padding='max_length', max_length=512,
                                                               truncation=True),
                                 label=example.label) for example in train_examples]

TypeError: InputFeatures.__init__() got an unexpected keyword argument 'input_mask'

In [None]:
## Use distilbert to build classifcation model

!pip install transformers
import transformers
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import InputExample, InputFeatures

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Prepare dataset for DistilBERT
train_examples = []
for i, row in train_data.iterrows():
    train_examples.append(
        InputExample(guid=None,
                     text_a=row["text"],
                     text_b=None,
                     label=row["label"]))

# Convert to features
train_features = [InputFeatures(input_ids=tokenizer.encode(example.text_a, add_special_tokens=True),
                                 input_mask=tokenizer.encode(example.text_a, add_special_tokens=True,
                                                                padding='max_length', max_length=512,
                                                                truncation=True),
                                 segment_ids=tokenizer.encode(example.text_a, add_special_tokens=True,
                                                               padding='max_length', max_length=512,
                                                               truncation=True),
                                 label=example.label) for example in train_examples]

# Prepare dataset for RoBERTa
train_examples = []
for i, row in train_data.iterrows():
    train_examples.append(
        InputExample(guid=None,
                     text_a=row["text"],
                     text_b=None,
                     label=row["label"]))

# Convert to features
train_features = [InputFeatures(input_ids=tokenizer.encode(example.text_a, add_special_tokens=True),
                                 input_mask=tokenizer.encode(example.text_a, add_special_tokens=True,
                                                                padding='max_length', max_length=512,
                                                                truncation=True),
                                 segment_ids=tokenizer.encode(example.text_a, add_special_tokens=True,
                                                               padding='max_length', max_length=512,
                                                               truncation=True),
                                 label=example.label) for example in train_examples]
# Create TFAutoModelForSequenceClassification for DistilBERT
model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(train_features, train_data['label'], validation_data=(X_valid, to_categorical(y_valid)),
                    epochs=10, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))

print("DistilBERT's Test accuracy:", accuracy)
# RoBERTa
# Create TFAutoModelForSequenceClassification for RoBERTa
model_checkpoint = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(train_features, train_data['label'], validation_data=(X_valid, to_categorical(y_valid)),
                    epochs=10, batch_size=128, callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))

print("RoBERTa's Test accuracy:", accuracy)


In [None]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
max_len = 512

# Tokenize and prepare training data
X_train = tokenizer(text=train_data['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_train = train_data['label'].values

type(X_train), type(y_train)

(transformers.tokenization_utils_base.BatchEncoding, numpy.ndarray)

In [None]:
# Split training data into train and validation sets
X_train_np = X_train['input_ids'].numpy()
X_train_np, X_val_np, y_train, y_val = train_test_split(X_train_np, y_train, test_size=0.2, random_state=42)


In [None]:
# Create TensorFlow datasets
batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_np, y_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val_np, y_val)).batch(batch_size)

In [None]:
# Load model and compile
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
print(optimizer.name)
# Compile the model with optimizer and loss function
model.compile(optimizer=optimizer.name,
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Adam


In [None]:
# Train the model

# model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)
model.fit(train_dataset, epochs=2, validation_data=val_dataset)

Epoch 1/2


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


ResourceExhaustedError: Graph execution error:

Detected at node gradient_tape/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._5/ffn/Gelu/mul_1/Mul defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-30-94f2ea7abdb1>", line 4, in <cell line: 4>

  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1161, in fit

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1804, in fit

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1398, in train_function

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1381, in step_function

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1370, in run_step

  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1641, in train_step

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/optimizers/optimizer.py", line 543, in minimize

  File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/optimizers/optimizer.py", line 276, in compute_gradients

failed to allocate memory
	 [[{{node gradient_tape/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._5/ffn/Gelu/mul_1/Mul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_16279]

In [None]:
# https://medium.com/@prateekgaurav/nlp-zero-to-hero-part-3-transformer-based-models-conclusion-8191186301a9

def infer_framework():
    # Define the model name and tokenizer
    model_name = 'distilbert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Assuming train_data and test_data are defined
    max_len = 512

    # Tokenize and prepare training data
    X_train = tokenizer(text=train_data['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
    y_train = train_data['label'].values

    # Split training data into train and validation sets
    X_train_np = X_train['input_ids'].numpy()
    X_train_np, X_val_np, y_train, y_val = train_test_split(X_train_np, y_train, test_size=0.2, random_state=42)

    # Create TensorFlow datasets
    batch_size = 32
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train_np, y_train)).batch(batch_size)
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val_np, y_val)).batch(batch_size)

    # Load model and compile
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Define optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    # Compile the model with optimizer and loss function
    model.compile(optimizer=optimizer.name, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Set up early stopping
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Suppress AutoGraph warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Train the model
        epochs = 10
        history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=[early_stop])

    # Test the model
    X_test = tokenizer(text=test_data['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
    y_test = test_data['label'].values

    test_dataset = tf.data.Dataset.from_tensor_slices((X_test['input_ids'], X_test['attention_mask'])).batch(batch_size)

    y_pred = model.predict(test_dataset)
    y_pred_labels = tf.argmax(y_pred.logits, axis=1)
    accuracy = np.sum(y_pred_labels.numpy() == y_test) / len(y_test)
    print(f"DistilBERT's Test accuracy: {accuracy:.2f}")

infer_framework()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/10


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


RoBERTa

In [None]:
# Define the model name and tokenizer
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assuming train_data and test_data are defined
max_len = 512

# Tokenize and prepare training data
X_train = tokenizer(text=train_data['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_train = train_data['label'].values

# Split training data into train and validation sets
X_train_np = X_train['input_ids'].numpy()
X_train_np, X_val_np, y_train, y_val = train_test_split(X_train_np, y_train, test_size=0.2, random_state=42)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_np, y_train)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val_np, y_val)).batch(32)

# Load model and compile
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Compile the model with optimizer and loss function
model.compile(optimizer=optimizer.name, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
epochs = 10
history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=[early_stop])

# Test the model
X_test = tokenizer(text=test_data['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_test = test_data['label'].values

test_dataset = tf.data.Dataset.from_tensor_slices((X_test['input_ids'], X_test['attention_mask'])).batch(32)

y_pred = model.predict(test_dataset)
y_pred_labels = tf.argmax(y_pred.logits, axis=1)
accuracy = np.sum(y_pred_labels.numpy() == y_test) / len(y_test)
print(f"DistilBERT's Test accuracy: {accuracy:.2f}")

NameError: name 'AutoTokenizer' is not defined

Pytorch - BERT classification implementation

In [None]:
import builtins
import collections
import os
import pandas as pd
import numpy as np
import tqdm
import time

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset, Dataset

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [None]:
# Define a custom dataset class
class CustomTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text using the tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add special tokens like [CLS] and [SEP]
            max_length=self.max_len,  # Set the maximum length of the tokenized sequence
            return_token_type_ids=False,  # Do not return token type IDs
            padding='max_length',  # Pad or truncate the sequence to the maximum length
            truncation=True,  # Truncate the sequence if it exceeds the maximum length
            return_attention_mask=True,  # Return attention mask indicating which tokens to attend to
            return_tensors='pt',  # Return PyTorch tensors
        ) #encoding returns a dictionary containing the encoded text, attention mask, and other information like token type IDs


        # Return the encoded text, attention mask, and label as a dictionary
        return {
            'text': text,  # Original text
            'input_ids': encoding['input_ids'].flatten(),  # Flattened tensor of input IDs
            'attention_mask': encoding['attention_mask'].flatten(),  # Flattened tensor of attention mask
            'labels': torch.tensor(label, dtype=torch.long)  # Tensor of label
        }


In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Load the data

df = pd.read_csv('/content/drive/MyDrive/NLP/Train.csv').sample(frac=0.7).reset_index(drop=True) # as dataset is too large, we will use only 70% of it

print(len(df))
df.head()

texts = df['text'].tolist()
labels = df['label'].tolist()

In [None]:
# Split the data
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2)

# Create datasets
train_dataset = CustomTextDataset(train_texts, train_labels, tokenizer, max_len=128) # 128 is the maximum length of the tokenized sequence
test_dataset = CustomTextDataset(test_texts, test_labels, tokenizer, max_len=128) # 128 is the maximum length of the tokenized sequence

# check the length of the dataset and few samples
print(len(train_dataset))
print(len(test_dataset))
print(train_dataset[0])
print(test_dataset[0])

In [None]:
# Create data loaders
batch_size = 128

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# check the length of the dataloader and few samples
print(len(train_loader)) # length of the train loader is the number of batches which can be calculated as total number of samples divided by batch size
print(len(test_loader))

for batch in train_loader:
    print(batch)
    print(type(batch), len(batch), batch.keys())
    print(len(batch['text']), len(batch['input_ids']), len(batch['attention_mask']), len(batch['labels']))
    print(batch['input_ids'].shape)
    #check the len of each of batch's texts
    print(len(batch['text'][0]), len(batch['text'][1]), len(batch['text'][2]))
    break

In [None]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2) # 2 is the number of classes in the dataset

# Move model to GPU or MPS if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


In [None]:
# Training settings
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

n_epochs = 1

# Train the model
model.train()
for epoch in range(n_epochs):
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device) # Move input IDs to GPU or MPS
        attention_mask = batch['attention_mask'].to(device) # Move attention mask to GPU or MPS
        labels = batch['labels'].to(device) # Move labels to GPU or MPS
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward() # Compute the gradients
        optimizer.step() # Update the weights
        # Print the loss for every 100th batch
        if (batch_idx + 1) % 10 == 0:
            print(f'Epoch {epoch+1}, Batch {batch_idx+1}: Loss = {loss.item()}')
    print(f'Epoch {epoch+1} completed')

Epoch 1, Batch 10: Loss = 0.5817462205886841
Epoch 1, Batch 20: Loss = 0.4283701479434967
Epoch 1, Batch 30: Loss = 0.4615080654621124
Epoch 1, Batch 40: Loss = 0.3449352979660034
Epoch 1, Batch 50: Loss = 0.32558831572532654
Epoch 1, Batch 60: Loss = 0.44657081365585327


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Evaluate the model
model.eval()
total = 0
correct = 0

# Initialize variables
batch_accuracy = []
batch_precision = []
batch_recall = []

# Iterate over each batch
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    _, predicted = torch.max(outputs.logits, 1)

    # Calculate metrics for the batch
    batch_accuracy.append(accuracy_score(labels.cpu(), predicted.cpu()))
    batch_precision.append(precision_score(labels.cpu(), predicted.cpu()))
    batch_recall.append(recall_score(labels.cpu(), predicted.cpu()))

    # Print metrics for the batch
    print(f"Batch Accuracy: {batch_accuracy[-1]:.4f}")
    print(f"Batch Precision: {batch_precision[-1]:.4f}")
    print(f"Batch Recall: {batch_recall[-1]:.4f}")
    print("------------------------")

# Calculate cumulative metrics
cumulative_accuracy = sum(batch_accuracy) / len(batch_accuracy)
cumulative_precision = sum(batch_precision) / len(batch_precision)
cumulative_recall = sum(batch_recall) / len(batch_recall)

# Print cumulative metrics
print(f"Cumulative Accuracy: {cumulative_accuracy:.4f}")
print(f"Cumulative Precision: {cumulative_precision:.4f}")
print(f"Cumulative Recall: {cumulative_recall:.4f}")


#### Simple RNN on pytorch

In [None]:
### Lets try simple RNN model

# import data
df = pd.read_csv('/content/drive/MyDrive/NLP/Train.csv').sample(frac=0.7).reset_index(drop=True) # as dataset is too large, we will use only 70% of it

print(len(df))
df.head()

texts = df['text'].tolist()
labels = df['label'].tolist()

# Split the data into train and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# Tokenizer initialization
# basic english tokenizer from torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter

tokenizer = get_tokenizer("basic_english")

# Create a vocabulary
counter = Counter()
for text in train_texts:
    counter.update(tokenizer(text))

vocab = list(counter)
vocab = ['<unk>', '<pad>'] + vocab # Add special tokens for unknown tokens and padding
#vocab size
print(f'vocab size: {len(vocab)}')

# Create a dictionary
word2idx = {word: idx for idx, word in enumerate(vocab)}

# Define a function to convert text to a tensor
def text_to_tensor(text, word2idx, max_len=128):
    tensor = torch.zeros(max_len, dtype=torch.long) # Initialize a tensor of zeros with a maximum length
    tokens = tokenizer(text)
    for idx, token in enumerate(tokens):
        if idx >= max_len:
            break
        tensor[idx] = word2idx.get(token, 0) # Use the index of the token if it exists, otherwise use the index of the unknown token
    return tensor

# Define a function to convert label to a tensor
def label_to_tensor(label):
    return torch.tensor(label, dtype=torch.long)


In [None]:
# Define a custom dataset class
class CustomTextDataset_4(Dataset):
    def __init__(self, texts, labels, word2idx, max_len=128):
        self.texts = texts
        self.labels = labels
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        input_ids = text_to_tensor(text, self.word2idx, self.max_len)
        labels = label_to_tensor(label)
        return {'text': text,  # Original text
                'input_ids': input_ids,
                'labels': labels}

# Create datasets
train_dataset = CustomTextDataset_4(train_texts, train_labels, word2idx)
val_dataset = CustomTextDataset_4(val_texts, val_labels, word2idx)

# check the length of the dataset and few samples
print(len(train_dataset))
print(len(val_dataset))
print(train_dataset[0])
print(val_dataset[0])

In [None]:
# Create data loaders
batch_size = 128

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# check the length of the dataloader and few samples
print(len(train_loader)) # length of the train loader is the number of batches which can be calculated as total number of samples divided by batch size
print(len(val_loader))

for batch in train_loader:
    print(batch)
    print(type(batch), len(batch), batch.keys())
    print(len(batch['text']), len(batch['input_ids']), len(batch['labels']))
    print(batch['input_ids'].shape, batch['labels'].shape)
    #check the len of each of batch's texts
    print(len(batch['text'][0]), len(batch['text'][1]), len(batch['text'][2]))
    # so sequence length is
    print(batch['input_ids'].shape[1])
    break


In [None]:
# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        # print("Embedding shape:", x.shape)
        # [sent_len, batch_size] --> [sent_len, batch_size, emb_dim]
        output, hidden = self.rnn(x)
        # output = [sent_len, batch_size, hid_dim * n_directions]
        # hidden = [n_layers * n_directions, batch_size, hid_dim]
        hidden = hidden[-1, :, :]
        # [n_layers * n_directions, batch_size, hid_dim] --> [batch_size, hid_dim]
        return self.fc(hidden)

In [None]:

# Define the model
input_dim = len(vocab)
embedding_dim = 100
hidden_dim = 20
output_dim = 2
Bidirectional_flag = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNModel(input_dim, embedding_dim, hidden_dim, output_dim)
model.to(device)



# send a batch to the model and check the output, output dimensions
for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    print(input_ids.shape)
    output = model(input_ids)
    print("Output shape:", output.shape)
    print("Output:", output)
    break


In [None]:
# Training settings
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

n_epochs = 1

# Train the model
model.train()
for epoch in range(n_epochs):
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device) # Move input IDs to GPU or MPS
        labels = batch['labels'].to(device) # Move labels to GPU or MPS
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward() # Compute the gradients
        optimizer.step() # Update the weights
        # Print the loss for every 100th batch
        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch {epoch+1}, Batch {batch_idx+1}: Loss = {loss.item()}')
    print(f'Epoch {epoch+1} completed')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Evaluate the model
model.eval()
total = 0
correct = 0

# Initialize variables
batch_accuracy = []
batch_precision = []
batch_recall = []

# Iterate over each batch
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids)
    _, predicted = torch.max(outputs, 1)

    # Calculate metrics for the batch
    batch_accuracy.append(accuracy_score(labels.cpu(), predicted.cpu()))
    batch_precision.append(precision_score(labels.cpu(), predicted.cpu()))
    batch_recall.append(recall_score(labels.cpu(), predicted.cpu()))

    # Print metrics for the batch
    print(f"Batch Accuracy: {batch_accuracy[-1]:.4f}")
    print(f"Batch Precision: {batch_precision[-1]:.4f}")
    print(f"Batch Recall: {batch_recall[-1]:.4f}")
    print("------------------------")

# Calculate cumulative metrics
cumulative_accuracy = sum(batch_accuracy) / len(batch_accuracy)
cumulative_precision = sum(batch_precision) / len(batch_precision)
cumulative_recall = sum(batch_recall) / len(batch_recall)

# Print cumulative metrics
print(f"Cumulative Accuracy: {cumulative_accuracy:.4f}")
print(f"Cumulative Precision: {cumulative_precision:.4f}")
print(f"Cumulative Recall: {cumulative_recall:.4f}")


### LSTM on pytorch

In [None]:
# # Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        # print("Embedding shape:", x.shape)
        # print elements of x
        # print(x)
        # [sent_len, batch_size] --> [sent_len, batch_size, emb_dim]
        output, (hidden, cell) = self.lstm(x)
        # output = [sent_len, batch_size, hid_dim * n_directions]
        # hidden = [n_layers * n_directions, batch_size, hid_dim]
        # cell = [n_layers * n_directions, batch_size, hid_dim]
        hidden = hidden[-1, :, :]
        # [n_layers * n_directions, batch_size, hid_dim] --> [batch_size, hid_dim]
        return self.fc(hidden)

In [None]:

# Define the model
input_dim = len(vocab)
embedding_dim = 256
hidden_dim = 20
output_dim = 2
Bidirectional_flag = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(input_dim, embedding_dim, hidden_dim, output_dim)
model.to(device)


# send a batch to the model and check the output, output dimensions
for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    print(input_ids.shape)
    output = model(input_ids)
    print("Output shape:", output.shape)
    print("Output:", output)
    break

In [None]:
# Training settings
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

n_epochs = 1

# Train the model
model.train()
for epoch in range(n_epochs):
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device) # Move input IDs to GPU or MPS
        labels = batch['labels'].to(device) # Move labels to GPU or MPS
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward() # Compute the gradients
        optimizer.step() # Update the weights
        # Print the loss for every 100th batch
        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch {epoch+1}, Batch {batch_idx+1}: Loss = {loss.item()}')
    print(f'Epoch {epoch+1} completed')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Evaluate the model
model.eval()
total = 0
correct = 0

# Initialize variables
batch_accuracy = []
batch_precision = []
batch_recall = []

# Iterate over each batch
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids)
    _, predicted = torch.max(outputs, 1)

    # Calculate metrics for the batch
    batch_accuracy.append(accuracy_score(labels.cpu(), predicted.cpu()))
    batch_precision.append(precision_score(labels.cpu(), predicted.cpu()))
    batch_recall.append(recall_score(labels.cpu(), predicted.cpu()))

    # Print metrics for the batch
    print(f"Batch Accuracy: {batch_accuracy[-1]:.4f}")
    print(f"Batch Precision: {batch_precision[-1]:.4f}")
    print(f"Batch Recall: {batch_recall[-1]:.4f}")
    print("------------------------")

# Calculate cumulative metrics
cumulative_accuracy = sum(batch_accuracy) / len(batch_accuracy)
cumulative_precision = sum(batch_precision) / len(batch_precision)
cumulative_recall = sum(batch_recall) / len(batch_recall)

# Print cumulative metrics
print(f"Cumulative Accuracy: {cumulative_accuracy:.4f}")
print(f"Cumulative Precision: {cumulative_precision:.4f}")
print(f"Cumulative Recall: {cumulative_recall:.4f}")
