In [None]:

!pip install --quiet pytorch-lightning

In [None]:
!pip install torchmetrics



In [None]:
!pip install --quiet transformers
!pip install tensorboard==1.15.0



In [None]:
pip install --upgrade numpy tensorflow


Collecting tensorflow
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow)
  Downloading tensorboard-2.15.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboard, tensorflow
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 1.15.0
    Uninstalling tensorboard-1.15.0:
      Successfully uninstalled tensorboard-1.15.0
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.15.0
    Uninstalling tensorflow-2.15.0:
      Successfully uninstalled tensorflow-2.15.0
Successfully installed tensorboard-2.15.2 tensorflow-2.15.0.post1


In [None]:
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import pytorch_lightning as pl
from transformers import AutoModel
import torchmetrics

In [68]:
import pandas as pd
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer

# Initialize the tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Constants
MAX_LEN = 64
label2id = {id: id+1 for id in range(-1, 2,1)}
id2label = {v: k for k, v in label2id.items()}

def convert_examples_to_features(tweets, labels):
    # Batch encode the tweets, ensuring that we add special tokens, pad and truncate as needed, and return PyTorch tensors
    encoding = bert_tokenizer(tweets, add_special_tokens=True, return_tensors="pt", max_length=MAX_LEN, padding="max_length", truncation=True)

    # Extract the input_ids and attention_mask from the encoding
    input_ids = encoding['input_ids']
    attention_masks = encoding['attention_mask']

    # Convert labels to tensor
    labels = torch.tensor([label2id[label] for label in labels], dtype=torch.long)

    # Create a TensorDataset with input_ids, attention_masks, and labels
    dataset = TensorDataset(input_ids, attention_masks, labels)

    return dataset

In [69]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/sample_data/Twitter_Data.csv")
df = df.dropna(subset=['clean_text', 'category'])




# Ensure all text is string
df['clean_text'] = df['clean_text'].astype(str)

# Map labels to integers (optional, depends on your label format)
#df['category'] = df['category'].map(your_label_mapping_here) if needed

# Now call your function
dataset = convert_examples_to_features(df["clean_text"].tolist(), df["category"].tolist())

train_data, val_data, train_labels, val_labels = train_test_split(
    dataset,
    list(df.category),
    random_state=1234,
    test_size=0.2
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['clean_text'].astype(str)


In [None]:
len(train_data)

130375

In [70]:
class BertSentClassification(pl.LightningModule):
    def __init__(self, dataset, hidden_sz=200, output_sz=3, dropout_prob=0.2):
        super().__init__()
        self.dataset = dataset

        # Load pre-trained model
        self.bert_model = AutoModel.from_pretrained(
              pretrained_model_name_or_path="bert-base-uncased"
        )
        # Add dropout layer
        self.dropout = nn.Dropout(dropout_prob)
        # Add MLP layer
        self.mlp = nn.Sequential(
            nn.Linear(768, hidden_sz), #768 is the size of BERT output
            nn.ReLU(),
            nn.Linear(hidden_sz, output_sz),
            nn.Softmax()
        )

        # define metrics
        self.valid_acc = torchmetrics.Accuracy(num_classes=output_sz, average='macro', task='multiclass')

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        outputs = self.bert_model(
          input_ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids
        )

        # BERT vectors corresponding to the [CLS] token
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.mlp(pooled_output)

        return logits

    def configure_optimizers(self):
        # Define the optimizer here
        optimizer = torch.optim.SGD(self.parameters(), lr=5e-3)
        return optimizer

    def training_step(self, batch, batch_idx):
        # training_step will hold processing corresponding each traning step
        # the epoch loop and batch training loop are abstracted away by
        # PyTorch Lightning
        input_ids, attention_mask, labels = batch

        logits = self(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)
        loss = F.cross_entropy(logits, labels)
        self.log(
            "train_loss",
            loss,
            on_epoch=True,
            on_step=True,
            prog_bar=True,
            logger=True
        )

        return loss

    def validation_step(self, batch, batch_idx):
       # implementation corresponding to processing of validation data
        input_ids, attention_mask, labels = batch

        logits = self(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        self.log(
            "validation_accuracy",
            self.valid_acc(logits, labels),
            on_epoch=True,
            prog_bar=True,
            logger=True
        )

    def train_dataloader(self):
        # dataloader corresponding to training data
        train_sampler = RandomSampler(self.dataset["train"])

        return DataLoader(
            dataset=self.dataset["train"],
            sampler=train_sampler,
            batch_size=64
        )



    def val_dataloader(self):
        # dataloader corresponding to validation data
        val_sampler = SequentialSampler(self.dataset["val"])

        return DataLoader(
            dataset=self.dataset["val"],
            sampler=val_sampler,
            batch_size=64
        )

In [71]:
dataset = {"train": train_data, "val": val_data}

In [72]:
model = BertSentClassification(dataset=dataset)
#uncomment the below if you are running on a GPU (which is ideal, a CPU could take forever)
trainer = pl.Trainer(max_epochs=7)#, gpus="0") # use GPU at index 0
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type               | Params
--------------------------------------------------
0 | bert_model | BertModel          | 109 M 
1 | dropout    | Dropout            | 0     
2 | mlp        | Sequential         | 154 K 
3 | valid_acc  | MulticlassAccuracy | 0     
--------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
438.547   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
  return self._call_impl(*args, **kwargs)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=7` reached.


In [73]:
model.eval()

BertSentClassification(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [82]:

input_text = " they dont care for indias future co,people hate the government"
labels = [-1]
# Convert examples to features
test_dataset = convert_examples_to_features([input_text], labels=[-1])

In [83]:
input_ids, attention_mask, _ = next(iter(test_dataset))
#add a new axis for both attention mask and inpu_ids
input_ids = input_ids.unsqueeze(0)
attention_mask = attention_mask.unsqueeze(0)

In [84]:
prediction = model(input_ids, attention_mask)

In [85]:
prediction = torch.argmax(prediction).item()
id2label[prediction]

-1