In [1]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.base import BaseEstimator
import torch
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from cleanlab.classification import CleanLearning

In [2]:
class SentimentDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)

        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [3]:
class BertSentimentClassifier(BaseEstimator):
    def __init__(self, model_path='bert-base-uncased', device=None):
        self.model_path = model_path
        self.tokenizer = BertTokenizerFast.from_pretrained(model_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.max_len=128

    def fit(self, X, y, epochs=3):
        self.classes_ = np.unique(y)
        
        train_data = SentimentDataset(X, y, self.tokenizer, max_len=self.max_len)
        train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
        
        optimizer = AdamW(self.model.parameters(), lr=2e-5)
        
        for epoch in range(epochs):
            train_acc, train_loss = train_epoch(self.model, train_loader, optimizer, self.device)
            print(f'Epoch {epoch + 1}/{epochs} - Train loss: {train_loss}, accuracy: {train_acc}')

    def predict(self, X):
        X_list = X.tolist()
        encoding = self.tokenizer.batch_encode_plus(
            X_list,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
        
        return self.classes_[preds.cpu().numpy()]
    
    def predict_proba(self, X):
        X_list = X.tolist()  # Convert to list
        encoding = self.tokenizer.batch_encode_plus(
            X_list,  # Updated this line
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        
        return probs.cpu().numpy()

    def score(self, X, y):
        y_pred = self.predict(X)
        accuracy = (y_pred == y).mean()
        return accuracy


In [5]:
data = pd.read_csv('../data/merged/merged_8.csv', encoding='unicode_escape')[0:1500]

#data.drop(columns=['textID', 'selected_text', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'], inplace=True)

data = data.dropna()

In [6]:
data

Unnamed: 0,text,confidence_scores,predicted_labels
0,"I`d have responded, if I were going",1.0,neutral
1,Sooo SAD I will miss you here in San Diego!!!,1.0,negative
2,my boss is bullying me...,1.0,negative
3,what interview! leave me alone,1.0,negative
4,"Sons of ****, why couldn`t they put them on t...",1.0,negative
...,...,...,...
195,i talk to you,1.0,neutral
196,im soo bored...im deffo missing my music channels,1.0,negative
197,nite nite bday girl have fun at concert,1.0,positive
198,Had nicotine replacement patch on for 4 hours....,1.0,negative


In [7]:
clf = BertSentimentClassifier() 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
raw_texts, raw_labels = data["text"].values, data["predicted_labels"].values
raw_train_texts, raw_test_texts, raw_train_labels, raw_test_labels = train_test_split(raw_texts, raw_labels, test_size=0.2)

In [9]:
data.predicted_labels.value_counts()

predicted_labels
positive    76
negative    75
neutral     49
Name: count, dtype: int64

In [10]:
num_classes = len(set(raw_train_labels))

print(f"This dataset has {num_classes} classes.")
print(f"Classes: {set(raw_train_labels)}")

This dataset has 3 classes.
Classes: {'negative', 'neutral', 'positive'}


In [11]:
#Label encoding the labels
encoder = LabelEncoder()
encoder.fit(raw_train_labels)

train_labels = encoder.transform(raw_train_labels)
test_labels = encoder.transform(raw_test_labels)

In [12]:
clf.fit(raw_train_texts, train_labels, 3) # Initial model prediction , without removing noisy labels
accuracy = clf.score(raw_test_texts, test_labels) # This gives the baseline accuracy without acting on noisy labels
print(f'Accuracy: {accuracy:.4f}')

100%|██████████| 10/10 [00:51<00:00,  5.13s/it]


Epoch 1/3 - Train loss: 1.1396486163139343, accuracy: 0.325


100%|██████████| 10/10 [00:42<00:00,  4.27s/it]


Epoch 2/3 - Train loss: 0.9907938301563263, accuracy: 0.51875


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


Epoch 3/3 - Train loss: 0.8983046293258667, accuracy: 0.61875
Accuracy: 0.5750


In [13]:
cv_n_folds = 3  # values like 5 or 10 will generally work better
cl = CleanLearning(clf, cv_n_folds=cv_n_folds) 

In [14]:
label_issues = cl.find_label_issues(X=raw_train_texts, labels=train_labels) # Finding label issues in dataset

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 7/7 [00:26<00:00,  3.76s/it]


Epoch 1/3 - Train loss: 1.126502479825701, accuracy: 0.2830188679245283


100%|██████████| 7/7 [00:27<00:00,  3.98s/it]


Epoch 2/3 - Train loss: 1.0170882259096419, accuracy: 0.5377358490566038


100%|██████████| 7/7 [00:29<00:00,  4.16s/it]


Epoch 3/3 - Train loss: 0.9128633056368146, accuracy: 0.6792452830188679


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 7/7 [00:29<00:00,  4.28s/it]


Epoch 1/3 - Train loss: 1.1038631541388375, accuracy: 0.3364485981308411


100%|██████████| 7/7 [00:30<00:00,  4.34s/it]


Epoch 2/3 - Train loss: 1.0303674169949122, accuracy: 0.4579439252336448


100%|██████████| 7/7 [00:36<00:00,  5.25s/it]


Epoch 3/3 - Train loss: 0.9966505255017962, accuracy: 0.5700934579439252


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 7/7 [00:31<00:00,  4.48s/it]


Epoch 1/3 - Train loss: 1.0884945392608643, accuracy: 0.34579439252336447


100%|██████████| 7/7 [00:25<00:00,  3.68s/it]


Epoch 2/3 - Train loss: 1.0137472919055395, accuracy: 0.5327102803738317


100%|██████████| 7/7 [00:26<00:00,  3.84s/it]


Epoch 3/3 - Train loss: 0.9488757848739624, accuracy: 0.5700934579439252


In [15]:
identified_issues = label_issues[label_issues["is_label_issue"] == True] 
lowest_quality_labels = label_issues["label_quality"].argsort().to_numpy()

In [16]:
lowest_quality_labels

array([ 68, 142,  18,  76, 139,   7,  13, 154,  90,  89, 140,   3,  66,
        60,  41, 101,  97,  54,  85, 106,  24, 112, 147,  23, 153,  57,
       128, 103,   2,  43,  69,  28,  15,  82, 108, 151,  88,  22,  36,
        63, 155, 137, 123, 124, 114,  16,   0, 111,  61,  75, 121,  17,
        91, 138,  47, 148, 132, 104,  31,  49,  95,  37,  79, 105, 136,
       159,  67,  92,  27,  45,   9,  51, 113,  81, 100,  33,   5, 158,
        21, 127, 116,  72,  99,  83,  71,  25,  39, 115,  78, 120, 134,
       107, 133, 145,  73,   1, 109,  11,  19, 144,   8, 156, 110,  52,
        87, 130,  86, 143,  84,  74, 146, 135,  53, 152,  62,  80,  40,
        14, 126,  50, 119,  55,  77,   4,  12, 122,  26,  64,  59,  65,
        48, 125,   6, 157, 149,  58,  46,  56, 131,  98,  94, 117, 118,
        38, 141, 102,  34,  20,  32,  10,  44,  96, 129,  93,  35,  30,
        42, 150,  29,  70], dtype=int64)

In [22]:
def get_dataframe_by_index(index):
    df = pd.DataFrame(
        {
            "text": raw_train_texts,
            "given_label": raw_train_labels,
            "predicted_label": encoder.inverse_transform(label_issues["predicted_label"]),
            "quality": label_issues["label_quality"]
        }
    )
    
    return df.iloc[index]


In [23]:
top_20_error_rows = get_dataframe_by_index(lowest_quality_labels[:20]) # 20 labels with the least quality 

In [24]:
error_rows

Unnamed: 0,text,given_label,predicted_label,quality
68,graduation is done im a little sad.. anyone w...,neutral,negative,0.141265
142,Hi how are you doing ??? *just joined twitt...,neutral,negative,0.163042
18,if u have a friendster add me!!!!!!!!! ...,neutral,negative,0.163777
76,"I`d have responded, if I were going",neutral,negative,0.16455
139,http://www.dothebouncy.com/smf - some shameles...,neutral,positive,0.172663
7,MAYDAY?!,neutral,positive,0.18551
13,i realy wanted to go out cause its so nice but...,neutral,negative,0.191149
154,"Ahhh, I slept through the game. I`m gonna tr...",neutral,positive,0.191231
90,"also bored at school, its my third freelesson...",neutral,negative,0.191403
89,"Thats it, its the end. Tears for Fears vs Eric...",neutral,positive,0.192056


In [28]:
updated_clf = cl.fit(X=raw_train_texts, labels=train_labels, label_issues=cl.get_label_issues()) #New model fitting by pruning error labels

100%|██████████| 56/56 [00:22<00:00,  2.53it/s]


Epoch 1/3 - Train loss: 0.031053377356978933, accuracy: 0.988826815642458


100%|██████████| 56/56 [00:21<00:00,  2.55it/s]


Epoch 2/3 - Train loss: 0.03172345966491515, accuracy: 0.994413407821229


100%|██████████| 56/56 [00:25<00:00,  2.19it/s]

Epoch 3/3 - Train loss: 0.024068544668677663, accuracy: 0.9955307262569831





In [29]:
accuracy = updated_clf.score(raw_test_texts, test_labels)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.7400


In [1]:
'''def iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels, max_iterations=3, tolerance=10):
    clf = BertSentimentClassifier()
    prev_accuracy = 0
    for iteration in range(max_iterations):
        print(f"Starting Iteration {iteration + 1}")
        
        # 1. Train the model on the current labeled data
        clf.fit(raw_train_texts, train_labels, epochs=3)
        accuracy = clf.score(raw_test_texts, test_labels)
        print(f'Accuracy in Iteration {iteration + 1}: {accuracy:.4f}')
        
        # Stopping condition based on performance change
        #if abs(accuracy - prev_accuracy) < tolerance:
        #    break
        prev_accuracy = accuracy

        # 2. Use CleanLab to detect noisy labels
        cl = CleanLearning(clf, cv_n_folds=cv_n_folds)
        label_issues = cl.find_label_issues(X=raw_train_texts, labels=train_labels)

        # 3. Remove or correct the identified noisy labels (here, I'm just removing them)
        noisy_indices = label_issues[label_issues["is_label_issue"] == True].index
        raw_train_texts = np.delete(raw_train_texts, noisy_indices)
        train_labels = np.delete(train_labels, noisy_indices)
        
    return clf'''

'def iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels, max_iterations=3, tolerance=10):\n    clf = BertSentimentClassifier()\n    prev_accuracy = 0\n    for iteration in range(max_iterations):\n        print(f"Starting Iteration {iteration + 1}")\n        \n        # 1. Train the model on the current labeled data\n        clf.fit(raw_train_texts, train_labels, epochs=3)\n        accuracy = clf.score(raw_test_texts, test_labels)\n        print(f\'Accuracy in Iteration {iteration + 1}: {accuracy:.4f}\')\n        \n        # Stopping condition based on performance change\n        #if abs(accuracy - prev_accuracy) < tolerance:\n        #    break\n        prev_accuracy = accuracy\n\n        # 2. Use CleanLab to detect noisy labels\n        cl = CleanLearning(clf, cv_n_folds=cv_n_folds)\n        label_issues = cl.find_label_issues(X=raw_train_texts, labels=train_labels)\n\n        # 3. Remove or correct the identified noisy labels (here, I\'m just remov

In [2]:
'''# Calling the function
clf_final = iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels)
final_accuracy = clf_final.score(raw_test_texts, test_labels)
print(f'Final Accuracy: {final_accuracy:.4f}')
'''

"# Calling the function\nclf_final = iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels)\nfinal_accuracy = clf_final.score(raw_test_texts, test_labels)\nprint(f'Final Accuracy: {final_accuracy:.4f}')\n"