In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, fname) for fname in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers datasets evaluate 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification

In [None]:
# Read the train and test splits (in that order); both files are decoded as latin-1.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (
        '/kaggle/input/sentiment-analysis-dataset/train.csv',
        '/kaggle/input/sentiment-analysis-dataset/test.csv',
    )
)

In [None]:
# Preview the first five rows of the raw training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the raw test frame.
test_df.head(n=5)

In [None]:
# Discard the identifier / metadata columns in place; the cells below only
# read `text` and `sentiment` from this frame.
train_df.drop(
    labels=[
        'textID',
        'selected_text',
        'Time of Tweet',
        'Age of User',
        'Country',
        'Population -2020',
        'Land Area (Km²)',
        'Density (P/Km²)',
    ],
    axis='columns',
    inplace=True,
)
train_df.head(n=5)

In [None]:
# Discard the identifier / metadata columns in place (the test file is dropped
# without a `selected_text` entry, unlike the training frame).
test_df.drop(
    labels=[
        'textID',
        'Time of Tweet',
        'Age of User',
        'Country',
        'Population -2020',
        'Land Area (Km²)',
        'Density (P/Km²)',
    ],
    axis='columns',
    inplace=True,
)
test_df.head(n=5)

In [None]:
# Remove rows whose `text` is missing, then report remaining nulls per column.
train_df.dropna(axis=0, subset=['text'], inplace=True)
print(train_df.isnull().sum())

In [None]:
# Same null handling for the test frame: drop rows without `text`, then report.
test_df.dropna(axis=0, subset=['text'], inplace=True)
print(test_df.isnull().sum())

In [None]:
def clean_text(text: str) -> str:
    """Lightly normalise a raw tweet before tokenisation.

    Steps, in order: collapse whitespace runs into one space, replace
    ``@mentions`` with ``@USER``, replace URLs with ``URL``, delete control
    characters (``\\x00-\\x1f`` and ``\\x7f-\\x9f``) and strip both ends.

    Args:
        text: Raw text. A non-string or blank value yields ``""``.

    Returns:
        The cleaned text, possibly empty.
    """
    # Imported locally because no cell imports `re`; without this the first
    # call fails with NameError.
    import re

    if not isinstance(text, str) or text.strip() == "":
        return ""

    # 1. Collapse every whitespace run (including newlines/tabs) to one space
    text = re.sub(r'\s+', ' ', text)

    # 2. Normalise user mentions to a single placeholder token
    text = re.sub(r'@\w+', '@USER', text)

    # 3. Replace URLs with a standard token
    text = re.sub(r'https?://\S+|www\.\S+', 'URL', text)

    # 4. Remove remaining non-printable control characters
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)

    # 5. Strip leading/trailing whitespace
    return text.strip()

In [None]:
# Derive the model input column: cast `text` to str and clean it element-wise.
train_df['cleaned_text'] = train_df['text'].astype(str).map(clean_text)
test_df['cleaned_text'] = test_df['text'].astype(str).map(clean_text)

In [None]:
# Encoding the labels in the train dataset
from sklearn.preprocessing import LabelEncoder
# Learn the sentiment -> integer id mapping from the training labels, then
# store the ids in a new `idx` column.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['sentiment'])
train_df['idx'] = label_encoder.transform(train_df['sentiment'])
train_df.head(n=5)

In [None]:
# Encode the test labels with the encoder already fitted on the training
# labels, so both splits share one sentiment -> id mapping.
test_label_ids = label_encoder.transform(test_df['sentiment'])
test_df['idx'] = test_label_ids
test_df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training rows for validation, stratified on the label id.
# NOTE: this rebinds `train_df`, so re-running the cell splits the already
# reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    stratify=train_df['idx'],
    test_size=0.1,
    random_state=1,
)

# DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast
# Sequence length every example is padded / truncated to.
max_length = 128
# Pretrained tokenizer for the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_texts(texts, tok=None, max_len=None):
    """Tokenize an iterable of strings into fixed-length TensorFlow tensors.

    Args:
        texts: Iterable of strings (e.g. a pandas Series); it is materialised
            with ``list`` before being passed to the tokenizer.
        tok: Tokenizer callable to use. When omitted, the notebook-level
            ``tokenizer`` that is current at call time is used.
        max_len: Length to pad / truncate every sequence to. When omitted,
            the notebook-level ``max_length`` is used.

    Returns:
        The tokenizer's output for the whole batch (requested with
        ``return_tensors='tf'``).
    """
    # The globals are rebound later in the notebook for a second model, so
    # callers can pass explicit values instead of relying on cell order.
    if tok is None:
        tok = tokenizer
    if max_len is None:
        max_len = max_length
    return tok(
        list(texts),
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

# Tokenize the cleaned text of each split, in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(split_df['cleaned_text'])
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Pair the tokenized inputs with their integer labels as batched tf.data pipelines.
# The shuffle buffer covers the whole training split: with a smaller buffer
# tf.data only mixes examples inside a sliding window of that size.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_df['idx'].values
)).shuffle(buffer_size=len(train_df)).batch(32)
# Validation and test pipelines are not shuffled, so their order matches
# val_df / test_df.
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_df['idx'].values
)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_df['idx'].values
)).batch(32)

In [None]:
# Pretrained DistilBERT with a 3-way sequence-classification head.
# NOTE(review): 3 is presumably the number of classes in label_encoder.classes_ - confirm.
model_bert = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)

In [None]:
# Fine-tuning setup: Adam at 2e-5, sparse categorical cross-entropy computed
# on raw logits, accuracy as the tracked metric.
bert_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
bert_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_bert.compile(optimizer=bert_optimizer, loss=bert_loss, metrics=['accuracy'])
model_bert.summary()

In [None]:
# Early stopping on validation loss: stop after 3 epochs without an
# improvement of at least 1e-4 and restore the best weights seen.
early_stopping_config = {
    "monitor": "val_loss",
    "min_delta": 0.0001,
    "patience": 3,
    "verbose": 1,
    "mode": "auto",
    "baseline": None,
    "restore_best_weights": True,
    "start_from_epoch": 0,
}
early_stopping = tf.keras.callbacks.EarlyStopping(**early_stopping_config)

In [None]:
# Fine-tune DistilBERT for at most 5 epochs, validating after each epoch and
# letting the early-stopping callback cut training short.
model_bert_history = model_bert.fit(
    train_dataset,
    epochs=5,
    callbacks=[early_stopping],
    validation_data=val_dataset,
)

In [None]:
# Evaluate on validation, then test. Each result is indexed below at
# position 1, which the prints report as the accuracy.
validation_accuracy, test_accuracy = (
    model_bert.evaluate(split_ds) for split_ds in (val_dataset, test_dataset)
)

print(f"validation accuracy : {validation_accuracy[1]:0.4f}")
print(f"test accuracy : {test_accuracy[1]:0.4f}")

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Predictions: accept a dict, an object exposing `.logits`, or the bare logits.
predictions_out = model_bert.predict(test_dataset)
if isinstance(predictions_out, dict):
    logits = predictions_out["logits"]
elif hasattr(predictions_out, "logits"):
    logits = predictions_out.logits
else:
    logits = predictions_out
y_pred = np.argmax(logits, axis=1)

# True labels, read back from the unshuffled test pipeline so they line up with y_pred
y_true = np.concatenate([y.numpy() for _, y in test_dataset], axis=0)

# Reports
class_names = label_encoder.classes_
# Pin the label set to every encoded class id. Otherwise scikit-learn infers it
# from the data, and a class missing from both y_true and y_pred would no
# longer match the length of target_names / the heatmap tick labels.
label_ids = np.arange(len(class_names))
print("\n--- Classification Report ---")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('DistilBERT')
plt.ylabel('Actual Labels')
plt.xlabel('Predicted Labels')
plt.show()

# RoBERTa

In [None]:
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification

In [None]:
# Sequence length every example is padded / truncated to.
max_length = 128
# Rebind the notebook-level tokenizer to the RoBERTa one for the cells below.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [None]:
def tokenize_texts(texts, tok=None, max_len=None):
    """Tokenize an iterable of strings into fixed-length TensorFlow tensors.

    Args:
        texts: Iterable of strings (e.g. a pandas Series); it is materialised
            with ``list`` before being passed to the tokenizer.
        tok: Tokenizer callable to use. When omitted, the notebook-level
            ``tokenizer`` that is current at call time is used.
        max_len: Length to pad / truncate every sequence to. When omitted,
            the notebook-level ``max_length`` is used.

    Returns:
        The tokenizer's output for the whole batch (requested with
        ``return_tensors='tf'``).
    """
    # The result depends on whichever tokenizer the globals point to when the
    # call runs; explicit arguments remove that dependence on cell order.
    if tok is None:
        tok = tokenizer
    if max_len is None:
        max_len = max_length
    return tok(
        list(texts),
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

# Re-tokenize the cleaned text of each split (train / validation / test order)
# with the tokenizer that is current at this point in the notebook.
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(split_df['cleaned_text'])
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Pair the tokenized inputs with int32 labels as batched tf.data pipelines.
# The shuffle buffer covers the whole training split: with a smaller buffer
# tf.data only mixes examples inside a sliding window of that size.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    tf.cast(train_df['idx'].values, tf.int32)
)).shuffle(buffer_size=len(train_df)).batch(32)

# Validation and test pipelines are not shuffled, so their order matches
# val_df / test_df.
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    tf.cast(val_df['idx'].values, tf.int32)
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    tf.cast(test_df['idx'].values, tf.int32)
)).batch(32)

In [None]:
# Pretrained RoBERTa-base with a 3-way sequence-classification head.
# NOTE(review): 3 is presumably the number of classes in label_encoder.classes_ - confirm.
model_roberta = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

In [None]:
# Fine-tuning setup: Adam at 2e-5, sparse categorical cross-entropy computed
# on raw logits, accuracy as the tracked metric.
roberta_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
roberta_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_roberta.compile(optimizer=roberta_optimizer, loss=roberta_loss, metrics=['accuracy'])
model_roberta.summary()

In [None]:
# Fine-tune RoBERTa for at most 5 epochs, validating after each epoch and
# passing the notebook's `early_stopping` callback.
model_roberta_history = model_roberta.fit(
    train_dataset,
    epochs=5,
    callbacks=[early_stopping],
    validation_data=val_dataset,
)

In [None]:
# Evaluate on validation, then test. Each result is indexed below at
# position 1, which the prints report as the accuracy.
validation_accuracy, test_accuracy = (
    model_roberta.evaluate(split_ds) for split_ds in (val_dataset, test_dataset)
)

print(f"validation accuracy : {validation_accuracy[1]:0.4f}")
print(f"test accuracy : {test_accuracy[1]:0.4f}")

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Predictions: accept a dict, an object exposing `.logits`, or the bare logits.
predictions_out = model_roberta.predict(test_dataset)
if isinstance(predictions_out, dict):
    logits = predictions_out["logits"]
elif hasattr(predictions_out, "logits"):
    logits = predictions_out.logits
else:
    logits = predictions_out
y_pred = np.argmax(logits, axis=1)

# True labels, read back from the unshuffled test pipeline so they line up with y_pred
y_true = np.concatenate([y.numpy() for _, y in test_dataset], axis=0)

# Reports
class_names = label_encoder.classes_
# Pin the label set to every encoded class id. Otherwise scikit-learn infers it
# from the data, and a class missing from both y_true and y_pred would no
# longer match the length of target_names / the heatmap tick labels.
label_ids = np.arange(len(class_names))
print("\n--- Classification Report ---")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
plt.figure(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False)
plt.title('RoBERTa')
plt.ylabel('Actual Labels')
plt.xlabel('Predicted Labels')
plt.show()