In [3]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.base import BaseEstimator
import torch
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from cleanlab.classification import CleanLearning

In [4]:
class SentimentDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)

        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [5]:
class BertSentimentClassifier(BaseEstimator):
    def __init__(self, model_path='bert-base-uncased', device=None):
        self.model_path = model_path
        self.tokenizer = BertTokenizerFast.from_pretrained(model_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.max_len=128

    def fit(self, X, y, epochs=3):
        self.classes_ = np.unique(y)
        
        train_data = SentimentDataset(X, y, self.tokenizer, max_len=self.max_len)
        train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
        
        optimizer = AdamW(self.model.parameters(), lr=2e-5)
        
        for epoch in range(epochs):
            train_acc, train_loss = train_epoch(self.model, train_loader, optimizer, self.device)
            print(f'Epoch {epoch + 1}/{epochs} - Train loss: {train_loss}, accuracy: {train_acc}')

    def predict(self, X):
        X_list = X.tolist()
        encoding = self.tokenizer.batch_encode_plus(
            X_list,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
        
        return self.classes_[preds.cpu().numpy()]
    
    def predict_proba(self, X):
        X_list = X.tolist()  # Convert to list
        encoding = self.tokenizer.batch_encode_plus(
            X_list,  # Updated this line
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        
        return probs.cpu().numpy()

    def score(self, X, y):
        y_pred = self.predict(X)
        accuracy = (y_pred == y).mean()
        return accuracy


In [18]:
data = pd.read_csv('../data/merged/merged_6.csv', encoding='unicode_escape')[0:1500]

#data.drop(columns=['textID', 'selected_text', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'], inplace=True)

data = data.dropna()

In [19]:
data

Unnamed: 0,text,confidence_scores,predicted_labels
0,"I`d have responded, if I were going",1.0,neutral
1,Sooo SAD I will miss you here in San Diego!!!,1.0,negative
2,my boss is bullying me...,1.0,negative
3,what interview! leave me alone,1.0,negative
4,"Sons of ****, why couldn`t they put them on t...",1.0,negative
...,...,...,...
194,i talk to you,1.0,neutral
195,im soo bored...im deffo missing my music channels,1.0,negative
196,nite nite bday girl have fun at concert,1.0,positive
197,Had nicotine replacement patch on for 4 hours....,1.0,negative


In [20]:
clf = BertSentimentClassifier() 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
raw_texts, raw_labels = data["text"].values, data["predicted_labels"].values
raw_train_texts, raw_test_texts, raw_train_labels, raw_test_labels = train_test_split(raw_texts, raw_labels, test_size=0.2)

In [21]:
data.predicted_labels.value_counts()

predicted_labels
negative    79
positive    76
neutral     44
Name: count, dtype: int64

In [14]:
num_classes = len(set(raw_train_labels))

print(f"This dataset has {num_classes} classes.")
print(f"Classes: {set(raw_train_labels)}")

This dataset has 4 classes.
Classes: {'positive', 'mixed', 'neutral', 'negative'}


In [16]:
#Label encoding the labels
encoder = LabelEncoder()
encoder.fit(raw_train_labels)

train_labels = encoder.transform(raw_train_labels)
test_labels = encoder.transform(raw_test_labels)

In [17]:
clf.fit(raw_train_texts, train_labels, 3) # Initial model prediction , without removing noisy labels
accuracy = clf.score(raw_test_texts, test_labels) # This gives the baseline accuracy without acting on noisy labels
print(f'Accuracy: {accuracy:.4f}')

  0%|          | 0/10 [00:06<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [23]:
cv_n_folds = 3  # values like 5 or 10 will generally work better
cl = CleanLearning(clf, cv_n_folds=cv_n_folds) 

In [24]:
label_issues = cl.find_label_issues(X=raw_train_texts, labels=train_labels) # Finding label issues in dataset

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [00:13<00:00,  3.66it/s]


Epoch 1/3 - Train loss: 1.0438781011104583, accuracy: 0.4355444305381727


100%|██████████| 50/50 [00:14<00:00,  3.52it/s]


Epoch 2/3 - Train loss: 0.7833910012245178, accuracy: 0.6795994993742178


100%|██████████| 50/50 [00:14<00:00,  3.46it/s]


Epoch 3/3 - Train loss: 0.5369862079620361, accuracy: 0.8085106382978724


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [00:14<00:00,  3.37it/s]


Epoch 1/3 - Train loss: 1.057125790119171, accuracy: 0.41551939924906134


100%|██████████| 50/50 [00:18<00:00,  2.77it/s]


Epoch 2/3 - Train loss: 0.8346887052059173, accuracy: 0.623279098873592


100%|██████████| 50/50 [00:19<00:00,  2.63it/s]


Epoch 3/3 - Train loss: 0.5958329010009765, accuracy: 0.7684605757196495


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [00:17<00:00,  2.89it/s]


Epoch 1/3 - Train loss: 1.0601495099067688, accuracy: 0.45125


100%|██████████| 50/50 [00:17<00:00,  2.81it/s]


Epoch 2/3 - Train loss: 0.7712335515022278, accuracy: 0.69125


100%|██████████| 50/50 [00:19<00:00,  2.59it/s]


Epoch 3/3 - Train loss: 0.48221698552370074, accuracy: 0.8225


In [25]:
identified_issues = label_issues[label_issues["is_label_issue"] == True] 
lowest_quality_labels = label_issues["label_quality"].argsort().to_numpy()

In [26]:
def print_as_df(index):
    return pd.DataFrame(
        {
            "text": raw_train_texts,
            "given_label": raw_train_labels,
            "predicted_label": encoder.inverse_transform(label_issues["predicted_label"]),
            "quality": label_issues["label_quality"]
        },
    ).iloc[index]

In [27]:
print_as_df(lowest_quality_labels[:10]) # Prints 10 labels with the least quality 

Unnamed: 0,text,given_label,predicted_label,quality
178,but my bday is JUNE 19.. this is wack... and ...,negative,positive,0.013122
190,okie gonna tweet more because i am loosing you...,negative,positive,0.014401
95,I am definitely ready... actually ahead of yo...,negative,positive,0.015469
250,Thanks! My mom`s seed is larger and already ...,negative,positive,0.018361
433,ahaha its stuck in my head; thanxx,positive,negative,0.018489
1139,Really wishes he had some spare cash to buy th...,negative,positive,0.018788
230,lmao yeaa iight n u shuld put tha c.b flick u...,positive,neutral,0.019159
640,Flap-a-taco was nice until the plebs came in.,negative,positive,0.021892
368,did you see the 15 sec clip of the New Moon t...,negative,neutral,0.022637
1113,so very irratated,negative,positive,0.022738


In [28]:
updated_clf = cl.fit(X=raw_train_texts, labels=train_labels, label_issues=cl.get_label_issues()) #New model fitting by pruning error labels

100%|██████████| 56/56 [00:22<00:00,  2.53it/s]


Epoch 1/3 - Train loss: 0.031053377356978933, accuracy: 0.988826815642458


100%|██████████| 56/56 [00:21<00:00,  2.55it/s]


Epoch 2/3 - Train loss: 0.03172345966491515, accuracy: 0.994413407821229


100%|██████████| 56/56 [00:25<00:00,  2.19it/s]

Epoch 3/3 - Train loss: 0.024068544668677663, accuracy: 0.9955307262569831





In [29]:
accuracy = updated_clf.score(raw_test_texts, test_labels)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.7400


In [1]:
'''def iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels, max_iterations=3, tolerance=10):
    clf = BertSentimentClassifier()
    prev_accuracy = 0
    for iteration in range(max_iterations):
        print(f"Starting Iteration {iteration + 1}")
        
        # 1. Train the model on the current labeled data
        clf.fit(raw_train_texts, train_labels, epochs=3)
        accuracy = clf.score(raw_test_texts, test_labels)
        print(f'Accuracy in Iteration {iteration + 1}: {accuracy:.4f}')
        
        # Stopping condition based on performance change
        #if abs(accuracy - prev_accuracy) < tolerance:
        #    break
        prev_accuracy = accuracy

        # 2. Use CleanLab to detect noisy labels
        cl = CleanLearning(clf, cv_n_folds=cv_n_folds)
        label_issues = cl.find_label_issues(X=raw_train_texts, labels=train_labels)

        # 3. Remove or correct the identified noisy labels (here, I'm just removing them)
        noisy_indices = label_issues[label_issues["is_label_issue"] == True].index
        raw_train_texts = np.delete(raw_train_texts, noisy_indices)
        train_labels = np.delete(train_labels, noisy_indices)
        
    return clf'''

'def iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels, max_iterations=3, tolerance=10):\n    clf = BertSentimentClassifier()\n    prev_accuracy = 0\n    for iteration in range(max_iterations):\n        print(f"Starting Iteration {iteration + 1}")\n        \n        # 1. Train the model on the current labeled data\n        clf.fit(raw_train_texts, train_labels, epochs=3)\n        accuracy = clf.score(raw_test_texts, test_labels)\n        print(f\'Accuracy in Iteration {iteration + 1}: {accuracy:.4f}\')\n        \n        # Stopping condition based on performance change\n        #if abs(accuracy - prev_accuracy) < tolerance:\n        #    break\n        prev_accuracy = accuracy\n\n        # 2. Use CleanLab to detect noisy labels\n        cl = CleanLearning(clf, cv_n_folds=cv_n_folds)\n        label_issues = cl.find_label_issues(X=raw_train_texts, labels=train_labels)\n\n        # 3. Remove or correct the identified noisy labels (here, I\'m just remov

In [2]:
'''# Calling the function
clf_final = iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels)
final_accuracy = clf_final.score(raw_test_texts, test_labels)
print(f'Final Accuracy: {final_accuracy:.4f}')
'''

"# Calling the function\nclf_final = iterative_cleanlab_training(raw_train_texts, train_labels, raw_test_texts, test_labels)\nfinal_accuracy = clf_final.score(raw_test_texts, test_labels)\nprint(f'Final Accuracy: {final_accuracy:.4f}')\n"