In [2]:
!pip    install    -U    datasets    transformers[torch]    evaluate

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting transformers[torch]
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-no

In [3]:
from datasets import load_dataset

dataset = load_dataset('MrbBakh/Sentiment140')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/663k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

## Text Pre=Processing

In [4]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def tokenize(row):
  tokens = word_tokenize(row['text'])

  tokens = [token.lower() for token in tokens if token.isalpha()]
  return {'tokens': tokens}

dataset = dataset.map(tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens'],
        num_rows: 5000
    })
})

In [6]:
dataset.shape

{'train': (40000, 7), 'validation': (5000, 7), 'test': (5000, 7)}

Remove stop word

In [7]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(row):
  stop_words = set(stopwords.words('english'))
  filtered_tokens = [token for token in row['tokens'] if token not in stop_words]
  return {'filtered_tokens': filtered_tokens}

dataset = dataset.map(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens'],
        num_rows: 5000
    })
})

In [9]:
dataset.shape

{'train': (40000, 8), 'validation': (5000, 8), 'test': (5000, 8)}

### Task 1:

In [10]:
from nltk.stem import PorterStemmer

def stem_tokens(row):
  stemmer = PorterStemmer()
  stemmed_tokens = [stemmer.stem(token) for token in row['filtered_tokens']]
  return {'stemmed_tokens': stemmed_tokens}

dataset = dataset.map(stem_tokens)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens', 'stemmed_tokens'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens', 'stemmed_tokens'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens', 'stemmed_tokens'],
        num_rows: 5000
    })
})

In [12]:
dataset.shape

{'train': (40000, 9), 'validation': (5000, 9), 'test': (5000, 9)}

## Word Embedding

In [13]:
from gensim.models import Word2Vec

word_embedding = Word2Vec(dataset['train']['stemmed_tokens'], vector_size=100,
                          window=5, min_count=1, sg=1, hs=0, negative=10)

In [14]:
word_embedding.save('w2v.model')
word_embedding = Word2Vec.load('w2v.model')

## Average Vector

In [15]:
def filter_tokens(example):
  return {'tokens': [token for token in example['tokens'] if token in word_embedding.wv]}

def mean_vector(example):
  return {'mean': word_embedding.wv[example['tokens']].mean(axis=0)}

dataset = dataset.map(filter_tokens).filter(lambda e: len(e['tokens']) > 0).map(mean_vector)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/39531 [00:00<?, ? examples/s]

Map:   0%|          | 0/4914 [00:00<?, ? examples/s]

Map:   0%|          | 0/4916 [00:00<?, ? examples/s]

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens', 'stemmed_tokens', 'mean'],
        num_rows: 39531
    })
    validation: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens', 'stemmed_tokens', 'mean'],
        num_rows: 4914
    })
    test: Dataset({
        features: ['text', 'date', 'user', 'sentiment', 'query', '__index_level_0__', 'tokens', 'filtered_tokens', 'stemmed_tokens', 'mean'],
        num_rows: 4916
    })
})

In [17]:
dataset.shape

{'train': (39531, 10), 'validation': (4914, 10), 'test': (4916, 10)}

In [18]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array(dataset['train']['mean'])
y = np.array(dataset['train']['sentiment'])

clf = GaussianNB()
clf.fit(X, y)

### Task 2:

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix

X_test = np.array(dataset['test']['mean'])
y_test = np.array(dataset['test']['sentiment'])

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.62
Confusion Matrix:
[[1981  564]
 [1318 1053]]


## LSTM

In [20]:
def vectorize(example):
  return {'vectors': word_embedding.wv[example['tokens']]}

dataset = dataset.map(vectorize)

Map:   0%|          | 0/39531 [00:00<?, ? examples/s]

Map:   0%|          | 0/4914 [00:00<?, ? examples/s]

Map:   0%|          | 0/4916 [00:00<?, ? examples/s]

In [21]:
import torch
import torch.nn as nn

lstm = nn.LSTM(100,200)
sequence = torch.tensor(dataset['train'][0]['vectors'])
output, (hidden, cell) = lstm(sequence)

In [22]:
sequence.shape

torch.Size([8, 100])

In [23]:
output.shape

torch.Size([8, 200])

In [24]:
lstm = nn.LSTM(100, 200, 2, batch_first=True)

batch = [torch.tensor(sequence) for sequence in dataset['train'][0:4]['vectors']]
padded_batch = nn.utils.rnn.pad_sequence(batch)

output, (hidden, cell) = lstm(padded_batch)

In [None]:
def word_to_index(example):
  indices = [word_embedding.wv.key_to_index[token] for token in example['tokens']]
  return {'indices': indices}

dataset = dataset.map(word_to_index)

Map:   0%|          | 0/39531 [00:00<?, ? examples/s]

Map:   0%|          | 0/4914 [00:00<?, ? examples/s]

Map:   0%|          | 0/4916 [00:00<?, ? examples/s]

In [None]:
pad_vector = np.zeros(word_embedding.vector_size)
weights = np.vstack([word_embedding.wv.vectors, pad_vector])
vocab_size, embedding_size = weights.shape
pad_idx = vocab_size - 1

In [None]:
def pad_sequences(batch):
  indices = [torch.tensor(sample, dtype=torch.long) for sample in batch['indices']]
  indices = nn.utils.rnn.pad_sequence(indices, batch_first=True, padding_value=pad_idx)
  return {'indices': indices}

dataset = dataset.map(pad_sequences, batched=True, batch_size=None).with_format('torch')

Map:   0%|          | 0/39531 [00:00<?, ? examples/s]

Map:   0%|          | 0/4914 [00:00<?, ? examples/s]

Map:   0%|          | 0/4916 [00:00<?, ? examples/s]

In [None]:
import torch.nn.functional as F

class SentimentClassifierLSTM(nn.Module):
  def __init__(self, vocab_size, embedding_size, hidden_size, num_layers):
    super().__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(vocab_size, embedding_size)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, 1)

  def forward(self, x):
    embedded = self.embedding(x)
    output, _ = self.lstm(embedded)
    last_output = output[:, -1, :]
    logits = self.fc(last_output)
    out_sigmoid = F.sigmoid(logits)
    return out_sigmoid.squeeze(1)

In [None]:
hidden_size = 128
num_layers = 2

model = SentimentClassifierLSTM(vocab_size, embedding_size, hidden_size, num_layers)

model.embedding.weight = nn.Parameter(torch.FloatTensor(weights))
model.embedding.weight.requires_grad = False

learning_rate = 0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 2048

def to_dataloader(dataset, split, shuffle):
  dataset = TensorDataset(dataset[split]['indices'], dataset[split]['sentiment'])
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = to_dataloader(dataset, 'train', True)
test_dataloader = to_dataloader(dataset, 'test', False)
validation_dataloader = to_dataloader(dataset, 'validation', False)

In [None]:
def train_one_epoch(dataloader):
    model.train()  # Ensure the model is in training mode
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device).float()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


### Task 4: Use  the  train  one  epoch  to  train  the  model  on  20  epochs.   Bonus :   Evaluate the model on the validation set after each epoch and print the validation accuracy.

In [None]:
def evaluate(dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()

            outputs = model(inputs)
            predicted = (outputs >= 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    return correct / total

num_epochs = 20

for epoch in range(num_epochs):
    train_one_epoch(train_dataloader)
    val_accuracy = evaluate(validation_dataloader)

    print(f"Epoch {epoch + 1}/{num_epochs}: Validation Accuracy: {val_accuracy:.4f}")

print("Training Complete")

Epoch 1/20: Validation Accuracy: 0.6980
Epoch 2/20: Validation Accuracy: 0.6952
Epoch 3/20: Validation Accuracy: 0.6952
Epoch 4/20: Validation Accuracy: 0.6943
Epoch 5/20: Validation Accuracy: 0.7004
Epoch 6/20: Validation Accuracy: 0.6962
Epoch 7/20: Validation Accuracy: 0.6964
Epoch 8/20: Validation Accuracy: 0.6986
Epoch 9/20: Validation Accuracy: 0.7009
Epoch 10/20: Validation Accuracy: 0.6974
Epoch 11/20: Validation Accuracy: 0.7007
Epoch 12/20: Validation Accuracy: 0.6691
Epoch 13/20: Validation Accuracy: 0.6978
Epoch 14/20: Validation Accuracy: 0.6998
Epoch 15/20: Validation Accuracy: 0.6958
Epoch 16/20: Validation Accuracy: 0.6996
Epoch 17/20: Validation Accuracy: 0.6998
Epoch 18/20: Validation Accuracy: 0.7021
Epoch 19/20: Validation Accuracy: 0.7019
Epoch 20/20: Validation Accuracy: 0.6994
Training Complete


### Task 5

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score


def evaluate_on_test(dataloader):
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()

            outputs = model(inputs)
            predicted = (outputs >= 0.5).float()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())


    accuracy = accuracy_score(all_labels, all_predictions)

    conf_matrix = confusion_matrix(all_labels, all_predictions)

    return accuracy, conf_matrix


test_accuracy, test_confusion_matrix = evaluate_on_test(test_dataloader)

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Confusion Matrix:")
print(test_confusion_matrix)


Test Accuracy: 0.7065
Test Confusion Matrix:
[[1905  640]
 [ 803 1568]]


### Task 6

- **Average Vector Model:** </br>
Accuracy: 0.62 </br>
Confusion Matrix: </br>
[[1981  564] </br>
 [1318 1053]]

- **20 Epochs Model:** </br>
Accuracy: 0.7065 </br>
Confusion Matrix: </br>
[[1905  640] </br>
 [803 1568]]

Besides the higher accuracy score, with less true positives and higher true negatives, the 2nd model performs better. And it's worth noting as well that it's false positives and false negatives combined are less than the combined values of the Average Vector Model.

## Transformers

In [35]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('lyeonii/bert-mini')

tokenized_dataset = dataset.map(lambda x: tokenizer(
    x['text'],
    padding=True,
    return_tensors='pt',
  ), batched=True, batch_size=None).with_format('torch')

tokenizer_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



Map:   0%|          | 0/39531 [00:00<?, ? examples/s]

Map:   0%|          | 0/4914 [00:00<?, ? examples/s]

Map:   0%|          | 0/4916 [00:00<?, ? examples/s]

In [36]:
tokenized_dataset = tokenized_dataset.rename_column('sentiment', 'labels')

In [37]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('lyeonii/bert-mini',
                                                           num_labels=2)

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/44.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at lyeonii/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='sentiment_analysis',
    num_train_epochs=3,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6018,0.507748
2,0.4964,0.474449
3,0.4682,0.466431


TrainOutput(global_step=234, training_loss=0.5221308031652727, metrics={'train_runtime': 163.171, 'train_samples_per_second': 726.802, 'train_steps_per_second': 1.434, 'total_flos': 367261610682240.0, 'train_loss': 0.5221308031652727, 'epoch': 3.0})

### Task 7


In [None]:
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return accuracy

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4831,0.464328,0.779609
2,0.4527,0.450378,0.787953
3,0.4362,0.444995,0.79304


{'accuracy': 0.7796092796092796}
{'accuracy': 0.787952787952788}
{'accuracy': 0.793040293040293}


TrainOutput(global_step=234, training_loss=0.4573742304092798, metrics={'train_runtime': 168.86, 'train_samples_per_second': 702.316, 'train_steps_per_second': 1.386, 'total_flos': 367261610682240.0, 'train_loss': 0.4573742304092798, 'epoch': 3.0})

### Task 8

In [None]:
accuracy_metric = evaluate.load("accuracy")
test_results = trainer.predict(test_dataset=tokenized_dataset['test'])

logits = test_results.predictions
labels = test_results.label_ids

predictions = torch.tensor(logits).argmax(dim=-1)

accuracy = accuracy_score(labels, predictions)
conf_matrix = confusion_matrix(labels, predictions)

print(f"Test Accuracy: {accuracy:.4f}")
print("Test Confusion Matrix:")
print(conf_matrix)

{'accuracy': 0.8002441008950366}
Test Accuracy: 0.8002
Test Confusion Matrix:
[[2079  466]
 [ 516 1855]]


### Task 9

- **Average Vector Model:** </br>
Accuracy: 0.62 </br>
Confusion Matrix: </br>
[[1981  564] </br>
 [1318 1053]]

- **20 Epochs Model:** </br>
Accuracy: 0.7065 </br>
Confusion Matrix: </br>
[[1905  640] </br>
 [803 1568]]

- **Hugging Face's Trainer:** </br>
Accuracy: 0.8002 </br>
Confusion Matrix: </br>
[[2079  466] </br>
 [516 1855]]

With higher true positives and true negatives, the last models outperforms the first two, and it has significantly less false postives/negatives as well. Along with its higher accuracy (0.8) it is evident that it performs much better.