In [1]:
# Installing libraries
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.7 MB/s[0m eta [36m0:00:0

In [2]:
# importing Libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Mount Drive
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
# Load your dataset
path="/gdrive/MyDrive/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(path, encoding='latin-1', header=None, names=['sentiment', 'id', 'date', 'flag', 'user', 'text'])
data = df[['sentiment','text']]
data['sentiment'] = data['sentiment'].replace(4,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['sentiment'].replace(4,1)


In [4]:
# Making the Train Test Split
train_data, test_data = train_test_split(data.sample(frac=0.2), test_size=0.2, random_state=42)         # Only 20% of dataset takes about 1:30 hrs per epoch
# train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)          # Complete Dataset takes about 8 hours

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False)  # Assuming binary sentiment
model.to(device)      # Moving the model to GPU if available

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [5]:
train_data["sentiment"].value_counts()

1    128393
0    127607
Name: sentiment, dtype: int64

In [6]:
del(df,data)    # Removing redundant data

def preprocess_data(data):
    data['text'] = data['text'].fillna("")  # Replace NaN with empty string
    data['sentiment'] = data['sentiment'].astype(int)  # Make sure sentiment is of integer type
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [7]:
# Tokenize and create data loaders
def tokenize_data(texts, labels):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True
        )
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels)
train_data.reset_index(drop=True, inplace=True)  # Reset index before passing to tokenize_data
test_data.reset_index(drop=True, inplace=True)

train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data['text'], train_data['sentiment'])
test_input_ids, test_attention_masks, test_labels = tokenize_data(test_data['text'], test_data['sentiment'])

train_input_ids = train_input_ids.to(device)
train_attention_masks = train_attention_masks.to(device)
train_labels = train_labels.to(device)

In [8]:
batch_size = 32
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training settings


optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps=1e-8)
epochs = 1      # Multiple or long epochs generally cause cuda failure which can only be corrected by flushing the GPU and restarting from beginning
total_steps = len(train_dataloader)*epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0, num_training_steps=total_steps)       # creates a learning rate for the BERT model so that the training gets a bit optimized

In [9]:
del train_data,train_input_ids, train_attention_masks, train_labels, train_dataset    # Emptying memory to prevent RAM overflow or failure

In [36]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        model.zero_grad()
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Loss: {average_loss:.4f}")


Epoch 1/1:   0%|          | 0/8000 [00:00<?, ?it/s]

Epoch 1/1 - Average Loss: 0.3602


In [43]:
test_input_ids = test_input_ids.to(device)
test_attention_masks = test_attention_masks.to(device)
test_labels = test_labels.to(device)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Evaluation
model.eval()
total_loss = 0
predictions,true_vals = [],[]
for batch in tqdm(test_dataloader):
    input_ids, attention_mask, labels = [b.to(device) for b in batch]
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits = outputs.logits
    total_loss += loss.item()
    logits = logits.detach().cpu().numpy()
    label = labels.cpu().numpy()
    predictions.append(logits)
    true_vals.append(label)
loss = total_loss / len(test_dataloader)
predictions = np.concatenate(predictions,axis=0)
predictions = np.argmax(predictions,axis=1).flatten()
true_vals = np.concatenate(true_vals,axis=0).flatten()
np.sum(predictions==true_vals)/len(true_vals)

  0%|          | 0/2000 [00:00<?, ?it/s]

0.859625

In [38]:
# output_dir = '/gdrive/MyDrive/Bert Analyzer'      # Saving the model and the tokenizer to a directory
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)
# from google.colab import files
# !zip -r Analyzer.zip "Bert Analyzer"/
# files.download('Analyzer.zip')

('/gdrive/MyDrive/Bert Analyzer/tokenizer_config.json',
 '/gdrive/MyDrive/Bert Analyzer/special_tokens_map.json',
 '/gdrive/MyDrive/Bert Analyzer/vocab.txt',
 '/gdrive/MyDrive/Bert Analyzer/added_tokens.json')

In [3]:
output_dir = '/gdrive/MyDrive/Bert Analyzer'
tokenizer = BertTokenizer.from_pretrained(output_dir)
loaded_model = BertForSequenceClassification.from_pretrained(output_dir)


# text="I am depressed"
text="I am delighted"

In [4]:
encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True
        )
input_id = encoded_text['input_ids']

attention_mask = encoded_text['attention_mask']
input_id = torch.LongTensor(input_id)
attention_mask = torch.LongTensor(attention_mask)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)
input_id = input_id.to(device)
attention_mask = attention_mask.to(device)

with torch.no_grad():
    outputs = loaded_model(input_id, token_type_ids=None, attention_mask=attention_mask)

logits = outputs[0]
index = logits.argmax()
print(index)

tensor(1, device='cuda:0')
