In [1]:
import pandas as pd
df = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", encoding='latin-1')
df = df.sample(n=3000, random_state=42)

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df.head()
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df.head()

Unnamed: 0,target,id,date,flag,user,text
541200,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,NO_QUERY,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...
750,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,NO_QUERY,Young_J,I'm off too bed. I gotta wake up hella early t...
766711,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,NO_QUERY,dougnawoschik,I havent been able to listen to it yet My spe...
285055,0,1993474319,Mon Jun 01 10:26:09 PDT 2009,NO_QUERY,thireven,now remembers why solving a relatively big equ...
705995,0,2256551006,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,taracollins086,"Ate too much, feel sick"


In [4]:
df.shape
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [5]:
!pip install googletrans



In [6]:
!pip install preprocess_kgptalkie



In [7]:
import preprocess_kgptalkie as ps
df['word_counts'] = df['text'].apply(lambda x: ps.word_count(x))
df['char_count'] = df['text'].apply(lambda x: ps.char_count(x))
df['avg_wordlength'] = df['text'].apply(lambda x: ps.avg_word_len(x))
df['stops_counts'] = df['text'].apply(lambda x: ps.stop_words_count(x))

In [8]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,word_counts,char_count,avg_wordlength,stops_counts
541200,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,NO_QUERY,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...,19,92,4.842105,7
750,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,NO_QUERY,Young_J,I'm off too bed. I gotta wake up hella early t...,12,51,4.25,4
766711,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,NO_QUERY,dougnawoschik,I havent been able to listen to it yet My spe...,13,49,3.769231,8
285055,0,1993474319,Mon Jun 01 10:26:09 PDT 2009,NO_QUERY,thireven,now remembers why solving a relatively big equ...,18,80,4.444444,9
705995,0,2256551006,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,taracollins086,"Ate too much, feel sick",5,19,3.8,1


In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words and convert to lowercase (lowercase was already done in a previous step)
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

df['text'] = df['text'].apply(preprocess_text)

In [10]:
df['text'] = df['text'].str.lower()

In [11]:
df['target'] = df['target'].replace(4, 1)

In [12]:
# Get a random sample of 2000 rows from the DataFrame
# Using random_state makes your sample reproducible
# df_sample = df.sample(n=2000, random_state=42)

# Now create your lists from this smaller, sampled DataFrame
X = df['text'].tolist()
y = df['target']

print(f"Selected {len(X)} examples.")

Selected 3000 examples.


In [13]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [14]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=512):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text= str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len)

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'label': label
    }

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased'
device="cuda"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

2025-07-25 18:57:10.817681: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753469830.840587     142 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753469830.847601     142 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


dataset = CustomDataset(X, y, tokenizer)

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to plain lists
X_train = list(X_train)
y_train = list(y_train)
X_test = list(X_test)
y_test = list(y_test)

# Now safe to use in Dataset
train_dataset = CustomDataset(X_train, y_train, tokenizer)
test_dataset = CustomDataset(X_test, y_test, tokenizer)

In [19]:
from transformers import TrainingArguments, Trainer
batch_size = 8
model_name = "tiwttersentiment"

args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    eval_strategy='epoch',
    report_to="none",
)

In [20]:
trainer = Trainer(model = model,
                  args = args,
                  train_dataset=train_dataset,
                  eval_dataset = test_dataset,
                  compute_metrics=compute_metrics,
                  tokenizer = tokenizer
                  )

  trainer = Trainer(model = model,


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.517659,0.756667,0.756667
2,0.536600,0.629715,0.735,0.734314
3,0.536600,0.846705,0.75,0.749906
4,0.292800,0.981504,0.763333,0.763465
5,0.134600,1.097493,0.756667,0.756818


TrainOutput(global_step=1500, training_loss=0.3213258361816406, metrics={'train_runtime': 468.4281, 'train_samples_per_second': 25.618, 'train_steps_per_second': 3.202, 'total_flos': 1589608783872000.0, 'train_loss': 0.3213258361816406, 'epoch': 5.0})

In [22]:
trainer.save_model(model_name)

In [35]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Evaluate using Trainer
results = trainer.evaluate(eval_dataset=test_dataset)

# Print standard metrics
print("Test Accuracy:", results['eval_accuracy'])
print("Test F1 Score:", results['eval_f1'])

# 🔄 Make predictions manually to get full outputs
predictions_output = trainer.predict(test_dataset)
preds = np.argmax(predictions_output.predictions, axis=1)
labels = predictions_output.label_ids

# 🧾 Print classification report and confusion matrix
print("\nClassification Report:\n", classification_report(labels, preds))
print("Confusion Matrix:\n", confusion_matrix(labels, preds))

Test Accuracy: 0.7566666666666667
Test F1 Score: 0.7568181818181817

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.77      0.75       285
           1       0.78      0.75      0.76       315

    accuracy                           0.76       600
   macro avg       0.76      0.76      0.76       600
weighted avg       0.76      0.76      0.76       600

Confusion Matrix:
 [[219  66]
 [ 80 235]]


In [24]:
from transformers import pipeline

text = "I absolutely love this product! It works like a charm."
pipe = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer)
pipe(text)

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9961826205253601}]

In [25]:
# Let's create a function to predict sentiment
from transformers import pipeline

# Load our fine-tuned model
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)

# Define identity terms
male_terms = ['man', 'boy', 'he', 'his']
female_terms = ['woman', 'girl', 'she', 'her']
# You can add more groups, e.g., racial or religious terms
# IMPORTANT: Be mindful and respectful when choosing terms.

# Template sentences
templates = [
    "I am a {} and I am happy.",
    "The {} said they felt good.",
    "This is a story about a {}.",
    "I saw a {} today.",
]

# Test the model
results = []
for template in templates:
    for term_type, terms in [('male', male_terms), ('female', female_terms)]:
        for term in terms:
            sentence = template.format(term)
            prediction = sentiment_pipeline(sentence)[0]
            results.append({
                'group': term_type,
                'term': term,
                'sentence': sentence,
                'prediction': prediction['label'],
                'score': prediction['score']
            })

# Display results in a DataFrame
results_df = pd.DataFrame(results)
print("Bias Test Results:")
print(results_df)

# Analyze the results
# Do sentences with 'male' terms get different predictions than 'female' terms?
print("\nPrediction counts by group:")
print(results_df.groupby('group')['prediction'].value_counts())

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Bias Test Results:
     group   term                        sentence prediction     score
0     male    man      I am a man and I am happy.    LABEL_1  0.990119
1     male    boy      I am a boy and I am happy.    LABEL_1  0.990647
2     male     he       I am a he and I am happy.    LABEL_1  0.994451
3     male    his      I am a his and I am happy.    LABEL_1  0.994725
4   female  woman    I am a woman and I am happy.    LABEL_1  0.990744
5   female   girl     I am a girl and I am happy.    LABEL_1  0.992226
6   female    she      I am a she and I am happy.    LABEL_1  0.994768
7   female    her      I am a her and I am happy.    LABEL_1  0.994114
8     male    man    The man said they felt good.    LABEL_1  0.993466
9     male    boy    The boy said they felt good.    LABEL_1  0.992799
10    male     he     The he said they felt good.    LABEL_1  0.995344
11    male    his    The his said they felt good.    LABEL_1  0.994700
12  female  woman  The woman said they felt good.    LABEL

In [30]:
from fairlearn.metrics import MetricFrame, demographic_parity_difference, equalized_odds_difference

# For this example, let's assume the "true" sentiment of our template sentences is POSITIVE.
# This is an assumption to demonstrate the metric.
# We define 'group' by the presence of a gendered term.

# Get predictions for our test sentences (0 for NEGATIVE, 1 for POSITIVE)
y_pred = [1 if r['prediction'] == 'LABEL_1' else 0 for r in results]
# Assume the true label for all these happy sentences is 1 (Positive)
y_true = [1] * len(results_df)
# Define the sensitive feature (male vs. female group)
sensitive_features = results_df['group']

# Now, let's use Fairlearn to calculate metrics
# We use 'selection_rate' which is the percentage of positive predictions (1s)
metrics = {
    'selection_rate': lambda y_true, y_pred: y_pred.mean(),
    'accuracy': lambda y_true, y_pred: (y_true == y_pred).mean()
}

metric_frame = MetricFrame(metrics=metrics,
                           y_true=y_true,
                           y_pred=y_pred,
                           sensitive_features=sensitive_features)

print("\nFairness Metrics (by group):")
print(metric_frame.by_group)

# Calculate the difference in metrics between groups
print("\nMetric Differences (Bias):")
print(metric_frame.difference())


Fairness Metrics (by group):
        selection_rate  accuracy
group                           
female          0.9375    0.9375
male            0.8750    0.8750

Metric Differences (Bias):
selection_rate    0.0625
accuracy          0.0625
dtype: float64
