<a href="https://colab.research.google.com/github/sam200530/BERT/blob/main/BERT_TRANSFROMER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task 1: BERT encoder implemented from scratch

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
import numpy as np

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used for reproducibility.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # Local import: `random` has not been imported at this point in the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:



!pip install transformers torch pandas scikit-learn numpy tqdm -q


import torch
import torch.nn as nn
from transformers import BertTokenizer
import numpy as np


def set_seed(seed=42):
    """Seed every random number generator used for reproducibility.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # Local import: `random` has not been imported at this point in the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Multi-Head Self-Attention Implementation
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``embed_dim // num_heads``, attended per head,
    re-assembled and passed through a final output projection.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        assert self.head_dim * num_heads == embed_dim, "embed_dim divisible by num_heads"

        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)

    def _split_heads(self, tensor, batch_size, seq_len):
        """Reshape (batch, seq, embed) into (batch, heads, seq, head_dim)."""
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, embed_dim).
            mask: Optional mask in which 0 marks positions that must not be
                attended to. Accepted layouts:
                * (batch, seq_len) key-padding mask, e.g. a tokenizer
                  ``attention_mask``; it is expanded over heads and queries.
                * (batch, seq_len, seq_len) per-query mask, expanded over heads.
                * Any other shape is used as-is and must already broadcast
                  against the (batch, heads, seq_len, seq_len) score tensor.
                A 2-D mask whose first dimension equals the batch size is
                always interpreted as a key-padding mask.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).
        """
        batch_size, seq_len, embed_dim = x.size()

        Q = self._split_heads(self.query(x), batch_size, seq_len)
        K = self._split_heads(self.key(x), batch_size, seq_len)
        V = self._split_heads(self.value(x), batch_size, seq_len)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # Without this expansion a (batch, seq) mask is right-aligned against
            # the (seq, seq) score axes: it only works for batch size 1 and
            # otherwise errors out or masks the wrong entries.
            if mask.dim() == 2 and mask.size(0) == batch_size:
                mask = mask[:, None, None, :]
            elif mask.dim() == 3:
                mask = mask[:, None, :, :]
            # Use the dtype's own minimum so the fill value is representable in
            # half precision too (-1e9 is not).
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, embed).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        output = self.out(context)

        return output

print("Multi-Head Self-Attention Done")


Multi-Head Self-Attention Done


In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear.

    Args:
        embed_dim: Size of the input and output feature dimension.
        ff_dim: Width of the hidden projection. Defaults to 2048.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, embed_dim, ff_dim=2048, dropout=0.1):
        super().__init__()
        # Sub-modules are registered in the same order as before so that
        # parameter initialisation and state-dict layout are unchanged.
        self.linear1 = nn.Linear(embed_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        """Expand to ``ff_dim``, apply GELU and dropout, project back to ``embed_dim``."""
        hidden = self.dropout(self.activation(self.linear1(x)))
        return self.linear2(hidden)

print(" Feed-Forward Layer ")

 Feed-Forward Layer 


In [None]:
class TransformerEncoderLayer(nn.Module):
    """Encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and LayerNorm (post-norm arrangement).

    Args:
        embed_dim: Feature dimension of the hidden states.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer. Defaults to 2048.
        dropout: Dropout probability used in both sub-layers.
    """

    def __init__(self, embed_dim, num_heads, ff_dim=2048, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadSelfAttention(embed_dim, num_heads)
        self.feed_forward = FeedForward(embed_dim, ff_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is forwarded to the attention module."""
        # Sub-layer 1: attention + residual + norm.
        attended = self.dropout1(self.attention(x, mask))
        after_attention = self.norm1(x + attended)

        # Sub-layer 2: feed-forward + residual + norm.
        transformed = self.dropout2(self.feed_forward(after_attention))
        return self.norm2(after_attention + transformed)

print("Transformer Encoder Layer ")

Transformer Encoder Layer 


In [None]:
class BertModel(nn.Module):
    """Small BERT-style encoder with a pooled classification head.

    Token and learned position embeddings are summed, passed through a stack
    of ``TransformerEncoderLayer`` blocks, and the hidden state of the first
    token is pooled (Linear + Tanh) and classified.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Hidden size. Defaults to 768.
        num_heads: Attention heads per encoder layer. Defaults to 12.
        num_layers: Number of encoder layers. Defaults to 2.
        max_seq_len: Number of learned positions, i.e. the longest supported
            sequence. Defaults to 512.
        num_classes: Size of the classifier output. Defaults to 2.
        dropout: Dropout probability. Defaults to 0.1.
    """

    def __init__(self, vocab_size, embed_dim=768, num_heads=12, num_layers=2,
                 max_seq_len=512, num_classes=2, dropout=0.1):
        super(BertModel, self).__init__()

        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.dropout = nn.Dropout(dropout)

        # Feed-forward width is fixed at 4x the hidden size.
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, num_heads, embed_dim * 4, dropout)
            for _ in range(num_layers)
        ])

        self.pooler = nn.Linear(embed_dim, embed_dim)
        self.pooler_activation = nn.Tanh()
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Encode a batch of token ids and classify each sequence.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).
            attention_mask: Optional tensor in which 0 marks positions to
                ignore. A 2-D (batch, seq_len) mask is expanded so that it
                masks key positions for every head and query; masks of any
                other rank are forwarded to the encoder layers unchanged.

        Returns:
            dict with keys:
                'token_embeddings': (batch, seq_len, embed_dim) token lookups.
                'positional_embeddings': (batch, seq_len, embed_dim) position lookups.
                'attention_output': hidden states after the last encoder layer.
                'pooled_output': (batch, embed_dim) pooled first-token state.
                'logits': (batch, num_classes) unnormalised class scores.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of learned positions.
        """
        batch_size, seq_len = input_ids.size()

        # Fail early with a readable message instead of an out-of-range
        # embedding lookup (which surfaces as a device-side assert on CUDA).
        max_positions = self.position_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_positions}"
            )

        token_embeds = self.token_embedding(input_ids)

        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
        pos_embeds = self.position_embedding(positions)

        embeddings = token_embeds + pos_embeds
        embeddings = self.dropout(embeddings)

        # A (batch, seq) padding mask must become (batch, 1, 1, seq) to broadcast
        # against (batch, heads, seq, seq) attention scores for any batch size.
        layer_mask = attention_mask
        if attention_mask is not None and attention_mask.dim() == 2:
            layer_mask = attention_mask[:, None, None, :]

        hidden_states = embeddings
        for encoder_layer in self.encoder_layers:
            hidden_states = encoder_layer(hidden_states, layer_mask)

        # Pool on the first token of each sequence.
        pooled_output = self.pooler(hidden_states[:, 0])
        pooled_output = self.pooler_activation(pooled_output)

        logits = self.classifier(pooled_output)

        return {
            'token_embeddings': token_embeds,
            'positional_embeddings': pos_embeds,
            'attention_output': hidden_states,
            'pooled_output': pooled_output,
            'logits': logits
        }

print("  BERT Model ")



  BERT Model 


In [None]:
# NOTE(review): BertTokenizerFast is imported here but never used in this cell;
# the tokenizer below is built from BertTokenizer.
from transformers import BertTokenizerFast
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(f" Tokenizer (vocab size: {tokenizer.vocab_size})")

# Build the from-scratch BertModel and run it on one sample sentence.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# The embedding table is sized from the tokenizer's vocabulary.
model = BertModel(
    vocab_size=tokenizer.vocab_size,
    embed_dim=768,
    num_heads=12,
    num_layers=2,
    max_seq_len=512,
    num_classes=2,
    dropout=0.1
).to(device)

test_sentence = "This is a comprehensive test sentence with more than ten words to verify the BERT model implementation"
print(f"\nTest sentence ({len(test_sentence.split())} words): {test_sentence}")

# Tokenize to PyTorch tensors and move them to the same device as the model.
encoded = tokenizer(test_sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
input_ids = encoded['input_ids'].to(device)
attention_mask = encoded['attention_mask'].to(device)

print(f"\nTokenized input shape: {input_ids.shape}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
# NOTE(review): the BertModel defined in this notebook has no segment embedding,
# so this message is informational only.
print("Segmentation info: All tokens assigned to segment 0 (for single sentence)")


 Tokenizer (vocab size: 30522)

Using device: cuda

Test sentence (17 words): This is a comprehensive test sentence with more than ten words to verify the BERT model implementation

Tokenized input shape: torch.Size([1, 19])
Tokens: ['[CLS]', 'this', 'is', 'a', 'comprehensive', 'test', 'sentence', 'with', 'more', 'than', 'ten', 'words', 'to', 'verify', 'the', 'bert', 'model', 'implementation', '[SEP]']
Segmentation info: All tokens assigned to segment 0 (for single sentence)


In [None]:
# Forward pass in eval mode without gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask)




print(f"\n1. Token Embeddings Shape: {outputs['token_embeddings'].shape}")
print(f"   (batch_size, sequence_length, embedding_dim)")

print(f"\n2. Positional Embeddings Shape: {outputs['positional_embeddings'].shape}")
print(f"   (batch_size, sequence_length, embedding_dim)")

# NOTE(review): items 3-5 all report the shape of the same tensor,
# outputs['attention_output'], i.e. the hidden states after the last encoder
# layer; the per-sub-layer outputs are not exposed by the model.
print(f"\n3. Output of Multi-Head Attention Shape: {outputs['attention_output'].shape}")
print(f"   (batch_size, sequence_length, embedding_dim)")

print(f"\n4. Output of Feed-Forward Layer Shape: {outputs['attention_output'].shape}")
print(f"   (same as attention output after residual connection)")

print(f"\n5. Output of Transformer Encoder Layer Shape: {outputs['attention_output'].shape}")
print(f"   (batch_size, sequence_length, embedding_dim)")

# NOTE(review): this prints the shape of the logits; no softmax is applied here.
print(f"\n6. Output Probabilities from Classifier Shape: {outputs['logits'].shape}")
print(f"   (batch_size, num_classes)")


print("SHAPE OF EACH PARAMETER ")

# List every parameter tensor with its shape and element count, and sum the counts.
total_params = 0
for name, param in model.named_parameters():
    print(f"{name:60s}: {str(param.shape):30s} ({param.numel():,} params)")
    total_params += param.numel()

print(f"\nTotal parameters: {total_params:,}")



1. Token Embeddings Shape: torch.Size([1, 19, 768])
   (batch_size, sequence_length, embedding_dim)

2. Positional Embeddings Shape: torch.Size([1, 19, 768])
   (batch_size, sequence_length, embedding_dim)

3. Output of Multi-Head Attention Shape: torch.Size([1, 19, 768])
   (batch_size, sequence_length, embedding_dim)

4. Output of Feed-Forward Layer Shape: torch.Size([1, 19, 768])
   (same as attention output after residual connection)

5. Output of Transformer Encoder Layer Shape: torch.Size([1, 19, 768])
   (batch_size, sequence_length, embedding_dim)

6. Output Probabilities from Classifier Shape: torch.Size([1, 2])
   (batch_size, num_classes)
SHAPE OF EACH PARAMETER 
token_embedding.weight                                      : torch.Size([30522, 768])       (23,440,896 params)
position_embedding.weight                                   : torch.Size([512, 768])         (393,216 params)
encoder_layers.0.attention.query.weight                     : torch.Size([768, 768])         

## Task 2: Dataset preparation (posts, comments, balancing and splits)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import random

In [None]:
# Mount Google Drive so the dataset files stored there are readable from Colab.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd


# Drive folder that holds the train / test / validation post files.
base_path = "/content/drive/MyDrive/nlp/"


train_path = base_path + 'all_train.tsv'
test_path = base_path + 'all_test_public.tsv'
validate_path = base_path + 'all_validate.tsv'

# The files are tab-separated, hence sep='\t'.
df_train = pd.read_csv(train_path, sep='\t')
df_test = pd.read_csv(test_path, sep='\t')
df_validate = pd.read_csv(validate_path, sep='\t')

Mounted at /content/drive


In [None]:
# Quick sanity check of the loaded frames: row/column counts and column names.
print(f" Train shape: {df_train.shape}")
print(f" Test shape: {df_test.shape}")
print(f" Validate shape: {df_validate.shape}")
print(f"\nColumns available: {df_train.columns.tolist()}")

 Train shape: (878218, 20)
 Test shape: (92444, 20)
 Validate shape: (92444, 20)

Columns available: ['Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'author', 'clean_title', 'created_utc', 'domain', 'hasImage', 'id', 'image_url', 'linked_submission_id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', '2_way_label', '3_way_label', '6_way_label']


In [None]:
# Keep only the title, the binary label and the post id in every split.
required_cols = ['clean_title', '2_way_label', 'id']
df_train, df_test, df_validate = (
    frame[required_cols].copy()
    for frame in (df_train, df_test, df_validate)
)



In [None]:
print(f"\nColumns available: {df_train.columns.tolist()}")


Columns available: ['clean_title', '2_way_label', 'id']


In [None]:
# Remove posts whose cleaned title repeats within the same split (first occurrence is kept).
df_train, df_test, df_validate = (
    frame.drop_duplicates(subset=['clean_title'])
    for frame in (df_train, df_test, df_validate)
)

In [None]:
import pandas as pd


# NOTE(review): base_path is rebound here to a different Drive folder than the
# one used for the post TSV files above.
base_path = "/content/drive/MyDrive/all/"

comments_path = base_path + 'all_comments.tsv'
df_comments = pd.read_csv(comments_path, sep='\t')

# Keep only the columns needed to attach comments to posts, and drop any row
# with a missing value in one of them.
required_comment_cols = ['id', 'submission_id', 'body', 'parent_id']
df_comments = df_comments[required_comment_cols].dropna()

print(f"\n Comments loaded: {len(df_comments):,}")
print(f"Columns: {df_comments.columns.tolist()}")
print(f"\nSample comment structure:")
print(df_comments.head(3))

  df_comments = pd.read_csv(comments_path, sep='\t')



 Comments loaded: 10,000,647
Columns: ['id', 'submission_id', 'body', 'parent_id']

Sample comment structure:
        id submission_id                                               body  \
0  f4deplg        dkdml1  Scroll, scroll, scroll.  Pause.  Scroll back u...   
1  f4d79bi        dkdml1  A lot of the people who felt quite strongly ab...   
2  f4ddmlk        dkdml1  T H E   S P H E R E   S H A L L   R I S E   A ...   

   parent_id  
0  t3_dkdml1  
1  t3_dkdml1  
2  t3_dkdml1  


In [None]:
def add_comments_to_posts(df_posts, df_comments, max_comments=3):
    """Return a copy of ``df_posts`` with a ``text`` column of title + comments.

    For each post the text is ``"<clean_title> [SEP] <c1> [SEP] <c2> ..."``,
    built from the first ``max_comments`` top-level comments (those whose
    ``parent_id`` starts with ``t3_``) in the order they appear in
    ``df_comments``. Posts without such comments get ``"<clean_title> [SEP] "``.

    Args:
        df_posts: DataFrame with at least ``clean_title`` and ``id`` columns.
            It is not modified.
        df_comments: DataFrame with ``submission_id``, ``body`` and
            ``parent_id`` columns.
        max_comments: Maximum number of top-level comments appended per post.
            Defaults to 3.

    Returns:
        A new DataFrame equal to ``df_posts`` plus the ``text`` column.
    """
    is_top_level = df_comments['parent_id'].str.startswith('t3_', na=False)
    top_level = df_comments.loc[is_top_level, ['submission_id', 'body']]

    # First `max_comments` rows of every submission, original order preserved.
    first_comments = top_level.groupby('submission_id').head(max_comments)

    # Cast to str so a body parsed as a number does not break the join.
    comments_by_post = (
        first_comments['body'].astype(str)
        .groupby(first_comments['submission_id'])
        .agg(' [SEP] '.join)
    )

    # Vectorised lookup instead of a row-wise apply; posts with no comments map to ''.
    joined_comments = df_posts['id'].map(comments_by_post).fillna('')
    text = df_posts['clean_title'].astype(str) + ' [SEP] ' + joined_comments

    # assign() returns a new frame, so the caller's frame is left untouched and
    # no SettingWithCopyWarning is triggered when it is a slice of another frame.
    return df_posts.assign(text=text)

# Attach the comment text to every split; each frame is rebound to the returned frame.
df_train = add_comments_to_posts(df_train, df_comments)
df_test = add_comments_to_posts(df_test, df_comments)
df_validate = add_comments_to_posts(df_validate, df_comments)

print("\n Comments add done")
print("Format used: [POST_TITLE] [SEP] [COMMENT1] [SEP] [COMMENT2] [SEP] [COMMENT3]")
print("  - Prefix t3_ → Comment directly on post (top-level)")
print("  - Prefix t1_ → Reply to another comment")
# Show the first 200 characters of one enriched example.
print(f"\nSample enriched text:\n{df_train['text'].iloc[0][:200]}...")


 Comments add done
Format used: [POST_TITLE] [SEP] [COMMENT1] [SEP] [COMMENT2] [SEP] [COMMENT3]
  - Prefix t3_ → Comment directly on post (top-level)
  - Prefix t1_ → Reply to another comment

Sample enriched text:
my walgreens offbrand mucinex was engraved with the letters mucinex but in a different order [SEP] Does it help with Dyslexia?...


In [None]:
# Stack the three splits (as they stand after column selection, de-duplication
# and comment enrichment) to report dataset-wide statistics.
df_all_original = pd.concat([df_train, df_test, df_validate])

print(f"\n Total number of posts in original dataset: {len(df_all_original):,}")

print(f"\n Distribution:")
dist = df_all_original['2_way_label'].value_counts()
print(f"   Fake posts (label=1): {dist.get(1, 0):,}")
print(f"   Non-fake posts (label=0): {dist.get(0, 0):,}")

# Number of comment rows per submission (top-level comments and replies alike);
# posts with no entry get 0. `comment_counts` is reused by a later cell.
comment_counts = df_comments.groupby('submission_id').size()
df_all_original['num_comments'] = df_all_original['id'].map(comment_counts).fillna(0)

posts_with_comments = (df_all_original['num_comments'] > 0).sum()
print(f"\n Number of posts with at least one comment: {posts_with_comments:,}")

print(f"\n Mean and Std of comments:")
fake_comments = df_all_original[df_all_original['2_way_label'] == 1]['num_comments']
nonfake_comments = df_all_original[df_all_original['2_way_label'] == 0]['num_comments']

print(f"   Fake posts - Mean: {fake_comments.mean():.2f}, Std: {fake_comments.std():.2f}")
print(f"   Non-fake posts - Mean: {nonfake_comments.mean():.2f}, Std: {nonfake_comments.std():.2f}")



a. Total number of posts in original dataset: 880,437

b. Distribution:
   Fake posts (label=1): 463,755
   Non-fake posts (label=0): 416,682

c. Number of posts with at least one comment: 556,802

d. Mean and Std of comments:
   Fake posts - Mean: 14.65, Std: 55.41
   Non-fake posts - Mean: 5.08, Std: 20.34


In [None]:
def balance_dataset(df, min_samples=2500, random_state=42):
    """Down-sample the majority class so both labels have the same row count.

    Each class is sampled without replacement down to the size of the smaller
    class, and the combined result is shuffled.

    Args:
        df: DataFrame with a ``2_way_label`` column (1 = fake, 0 = non-fake).
        min_samples: Minimum number of rows required per class. Defaults to 2500.
        random_state: Seed used for both the sampling and the final shuffle.
            Defaults to 42.

    Returns:
        A shuffled DataFrame with equally many rows for label 1 and label 0.

    Raises:
        ValueError: If the smaller class has fewer than ``min_samples`` rows.
    """
    fake = df[df['2_way_label'] == 1]
    nonfake = df[df['2_way_label'] == 0]

    # The balanced size is bounded by the smaller class. Taking
    # max(min_samples, ...) here would ask sample() for more rows than exist
    # and fail with an opaque pandas error, so check the requirement explicitly.
    n_samples = min(len(fake), len(nonfake))
    if n_samples < min_samples:
        raise ValueError(
            f"Cannot balance dataset: smallest class has {n_samples} rows, "
            f"but min_samples={min_samples} are required per class"
        )

    fake_sampled = fake.sample(n=n_samples, random_state=random_state)
    nonfake_sampled = nonfake.sample(n=n_samples, random_state=random_state)

    # Shuffle so the two classes are interleaved.
    balanced_df = pd.concat([fake_sampled, nonfake_sampled]).sample(frac=1, random_state=random_state)

    return balanced_df

# Balance the train and test splits independently.
df_train_balanced = balance_dataset(df_train, min_samples=2500)
df_test_balanced = balance_dataset(df_test, min_samples=2500)


# Report the per-class counts of each balanced split.
print(f"Balanced Train: {len(df_train_balanced):,}")
print(f"  Fake: {(df_train_balanced['2_way_label']==1).sum():,}")
print(f"  Non-fake: {(df_train_balanced['2_way_label']==0).sum():,}")

print(f"\nBalanced Test: {len(df_test_balanced):,}")
print(f"  Fake: {(df_test_balanced['2_way_label']==1).sum():,}")
print(f"  Non-fake: {(df_test_balanced['2_way_label']==0).sum():,}")


Balanced Train: 679,240
  Fake: 339,620
  Non-fake: 339,620

Balanced Test: 77,030
  Fake: 38,515
  Non-fake: 38,515


In [None]:
import os

# Hold out 20% of the balanced training data for validation, keeping the
# label proportions identical in both parts.
train_df, val_df = train_test_split(
    df_train_balanced,
    test_size=0.2,
    stratify=df_train_balanced['2_way_label'],
    random_state=42
)


print(f"Training set: {len(train_df):,}")
print(f"  Fake: {(train_df['2_way_label']==1).sum():,}")
print(f"  Non-fake: {(train_df['2_way_label']==0).sum():,}")

print(f"\nValidation set: {len(val_df):,}")
print(f"  Fake: {(val_df['2_way_label']==1).sum():,}")
print(f"  Non-fake: {(val_df['2_way_label']==0).sum():,}")


OUTPUT_DIR = "/content/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Export the frames that are actually used for training and evaluation. The
# unbalanced df_train / df_test must not be written here: they are neither
# balanced nor an 80/20 split of anything.
export_cols = ['text', '2_way_label']
train_df[export_cols].to_csv(os.path.join(OUTPUT_DIR, 'train.csv'), index=False)
val_df[export_cols].to_csv(os.path.join(OUTPUT_DIR, 'val.csv'), index=False)
df_test_balanced[export_cols].to_csv(os.path.join(OUTPUT_DIR, 'test.csv'), index=False)
print(f" Train set saved: {len(train_df)} posts (80%)")
print(f" Validation set saved: {len(val_df)} posts (20%)")
print(f" Test set saved: {len(df_test_balanced)} posts (balanced test split)")

Training set: 543,392
  Fake: 271,696
  Non-fake: 271,696

Validation set: 135,848
  Fake: 67,924
  Non-fake: 67,924
 Train set saved: 719696 posts (80%)
 Test set saved: 80433 posts (20%)


In [None]:
import torch
print(torch.cuda.is_available())


True


In [None]:
# Combined view of the balanced train and test data, used only for the totals below.
df_balanced_all = pd.concat([df_train_balanced, df_test_balanced])

print(f"\n Number of fake posts: {(df_balanced_all['2_way_label']==1).sum():,}")
print(f"   Number of non-fake posts: {(df_balanced_all['2_way_label']==0).sum():,}")

print(f"\n Distribution of posts in train set:")
print(f"   Fake: {(train_df['2_way_label']==1).sum():,}")
print(f"   Non-fake: {(train_df['2_way_label']==0).sum():,}")

print(f"\n Distribution of posts in test set:")
print(f"   Fake: {(df_test_balanced['2_way_label']==1).sum():,}")
print(f"   Non-fake: {(df_test_balanced['2_way_label']==0).sum():,}")

# Attach per-post comment counts using `comment_counts` from the earlier
# statistics cell; posts without comments get 0.
# NOTE(review): these assignments add a column to train_df and df_test_balanced
# in place, so re-running earlier cells changes what later cells see.
train_df['num_comments'] = train_df['id'].map(comment_counts).fillna(0)
df_test_balanced['num_comments'] = df_test_balanced['id'].map(comment_counts).fillna(0)

# NOTE(review): the statistics below are computed on train_df only, not on the
# whole balanced dataset.
print(f"\n Comment statistics in balanced dataset:")
fake_bal = train_df[train_df['2_way_label'] == 1]['num_comments']
nonfake_bal = train_df[train_df['2_way_label'] == 0]['num_comments']

print(f"   Fake posts - Mean: {fake_bal.mean():.2f}, Std: {fake_bal.std():.2f}")
print(f"   Non-fake posts - Mean: {nonfake_bal.mean():.2f}, Std: {nonfake_bal.std():.2f}")


 Number of fake posts: 378,135
   Number of non-fake posts: 378,135

 Distribution of posts in train set:
   Fake: 271,696
   Non-fake: 271,696

 Distribution of posts in test set:
   Fake: 38,515
   Non-fake: 38,515

 Comment statistics in balanced dataset:
   Fake posts - Mean: 14.49, Std: 55.00
   Non-fake posts - Mean: 5.09, Std: 20.23


## Task 3: Fine-tuning pre-trained BERT for fake-post classification


In [None]:
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import time

In [None]:
class FakedditDataset(Dataset):
    """Dataset that tokenizes one text/label pair on each lookup.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable that accepts a string plus tokenizer keyword
            arguments and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length every example is padded/truncated to. Defaults to 128.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` / ``attention_mask`` and a long ``label`` tensor."""
        text, label = str(self.texts[idx]), self.labels[idx]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer output carries a leading batch dimension; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

print(" Dataset class created")

 Dataset class created


In [None]:
# Tokenization length and mini-batch size shared by all three splits.
MAX_LENGTH = 128
BATCH_SIZE = 16

# One dataset per split, all built the same way from the text and label columns.
train_dataset, val_dataset, test_dataset = (
    FakedditDataset(
        frame['text'].values,
        frame['2_way_label'].values,
        tokenizer,
        MAX_LENGTH
    )
    for frame in (train_df, val_df, df_test_balanced)
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=BATCH_SIZE)
    for dataset in (val_dataset, test_dataset)
)

print(f" Data loaders created")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")
print(f"\nUsing max_length={MAX_LENGTH} tokens (subset of input)")

 Data loaders created
  Train batches: 33962
  Val batches: 8491
  Test batches: 4815

Using max_length=128 tokens (subset of input)


In [None]:
# Load pre-trained bert-base-uncased with a 2-label classification head.
# NOTE(review): this rebinds `model`, replacing the from-scratch BertModel
# instance created earlier in the notebook.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

model = model.to(device)
print(f" Pre-trained BERT model loaded on {device}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Pre-trained BERT model loaded on cuda


In [None]:
# Fine-tuning hyperparameters.
EPOCHS = 1
LEARNING_RATE = 2e-5
WARMUP_STEPS = 100

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# The scheduler is stepped once per batch, so it needs the total number of
# optimizer steps over all epochs.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
)


print(f"Epochs: {EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Optimizer: AdamW")
# NOTE(review): no loss object is created in this notebook; the loss is taken
# from the model's own output when labels are passed — presumably cross-entropy,
# confirm against the transformers documentation.
print(f"Loss function: CrossEntropyLoss (on training set)")
print(f"Warmup steps: {WARMUP_STEPS}")
print(f"Total training steps: {total_steps}")

Epochs: 1
Batch size: 16
Learning rate: 2e-05
Optimizer: AdamW
Loss function: CrossEntropyLoss (on training set)
Warmup steps: 100
Total training steps: 33962


In [None]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch: zero the gradients, run the forward pass with labels so
    the model returns its loss, back-propagate, clip the gradient norm to 1.0,
    then step the optimizer and the scheduler.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, predictions, true_labels)``: the mean batch loss
        (``nan`` when the loader yields no batches) and the per-example
        predicted and true labels collected during the epoch.
    """
    model.train()
    total_loss = 0
    predictions, true_labels = [], []

    progress_bar = tqdm(data_loader, desc='Training')

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        # Read the scalar once: each .item() call forces a device-to-host sync.
        loss_value = loss.item()
        total_loss += loss_value

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': loss_value})

    # An empty loader has no mean loss; report nan instead of dividing by zero.
    num_batches = len(data_loader)
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    return avg_loss, predictions, true_labels


In [None]:
def evaluate(model, data_loader, device):
    """Collect argmax predictions and true labels over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(predictions, true_labels)`` of per-example lists.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits

            # Highest-scoring class per example.
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    return all_preds, all_labels

print("Evaluation function defined")

Evaluation function defined


In [None]:
# Per-epoch record of the mean training loss and the validation accuracy.
history = {'train_loss': [], 'val_acc': []}

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')


    # The timer covers both the training pass and the validation pass.
    start_time = time.time()

    train_loss, train_preds, train_labels = train_epoch(
        model, train_loader, optimizer, scheduler, device
    )

    val_preds, val_labels = evaluate(model, val_loader, device)
    val_acc = accuracy_score(val_labels, val_preds)

    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)

    epoch_time = time.time() - start_time

    print(f'\nTrain Loss: {train_loss:.4f}')
    print(f'Val Accuracy: {val_acc:.4f}')
    print(f'Epoch Time: {epoch_time:.2f}s')

print("\n Training done")



Epoch 1/1


Training: 100%|██████████| 33962/33962 [3:24:58<00:00,  2.76it/s, loss=0.112]
Evaluating: 100%|██████████| 8491/8491 [17:15<00:00,  8.20it/s]



Train Loss: 0.2082
Val Accuracy: 0.9368
Epoch Time: 13333.68s

 Training done


In [None]:
# Final evaluation on the balanced test split.
test_preds, test_labels = evaluate(model, test_loader, device)

# average='binary' scores a single positive class — presumably label 1 (fake),
# scikit-learn's default pos_label; confirm if labels ever change.
accuracy = accuracy_score(test_labels, test_preds)
precision = precision_score(test_labels, test_preds, average='binary')
recall = recall_score(test_labels, test_preds, average='binary')
f1 = f1_score(test_labels, test_preds, average='binary')
cm = confusion_matrix(test_labels, test_preds)


print("METRICS  FOR TEST SET")
print(f"{'Metric':<20} {'Value':<10}")
print(f"{'Accuracy':<20} {accuracy:.4f}")
print(f"{'Precision':<20} {precision:.4f}")
print(f"{'Recall':<20} {recall:.4f}")
print(f"{'F1-Score':<20} {f1:.4f}")


# Printed with rows as the actual label and columns as the predicted label.
print("CONFUSION MATRIX")
print(f"                 Predicted")
print(f"                 0      1")
print(f"Actual   0     {cm[0][0]:5d}  {cm[0][1]:5d}")
print(f"         1     {cm[1][0]:5d}  {cm[1][1]:5d}")

Evaluating: 100%|██████████| 4815/4815 [10:52<00:00,  7.38it/s]


METRICS  FOR TEST SET
Metric               Value     
Accuracy             0.9384
Precision            0.9444
Recall               0.9316
F1-Score             0.9380
CONFUSION MATRIX
                 Predicted
                 0      1
Actual   0     36403   2112
         1      2633  35882
