In [1]:
# Install required packages
# %pip (rather than !pip) targets the environment of the running kernel.
# NOTE(review): no versions are pinned, so the notebook runs against whatever
# releases are current at install time — consider pinning for reproducibility.
# NOTE(review): `evaluate` is installed here but is not imported by any cell
# visible in this notebook — confirm whether it is still needed.
%pip install transformers datasets accelerate evaluate scikit-learn imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import required libraries
import inspect
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from datasets import Dataset
from imblearn.over_sampling import ADASYN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
# NOTE(review): this is an absolute, machine-specific path (note the leading
# space in " Prj2_CodeFile"). It is kept verbatim so the cell still finds the
# file; consider moving it to a configurable data directory.
csv_path = r'/Users/sk/Library/Mobile Documents/com~apple~CloudDocs/3. Techno (Professional)/21. Prj2_Transformer based Comment analyzer Chrome Plugin/ Prj2_CodeFile/data/dataset.csv'
raw_df = pd.read_csv(csv_path)

# Keep all 3 classes: -1 (negative), 0 (neutral), 1 (positive)
# Map categories to 0, 1, 2 for model training
# -1 -> 0 (negative), 0 -> 1 (neutral), 1 -> 2 (positive)
category_to_label = {-1: 0, 0: 1, 1: 2}

# Clean the data: drop rows without a comment or a category. The explicit
# .copy() gives an independent frame, so the column assignments below are not
# chained assignments on a slice of raw_df.
df = raw_df.dropna(subset=['clean_comment', 'category']).copy()
df['clean_comment'] = df['clean_comment'].astype(str)
df['label'] = df['category'].map(category_to_label)

# .map() yields NaN for any category missing from the mapping; fail loudly
# instead of letting NaN labels flow into resampling and training.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    unexpected = df.loc[unmapped_rows, 'category'].unique().tolist()
    raise ValueError(f"Unexpected values in 'category' column: {unexpected}")
df['label'] = df['label'].astype('int64')

print("Original class distribution:")
print(Counter(df['label']))
print(f"\nTotal samples: {len(df)}")
df.head()

Original class distribution:
Counter({2: 15830, 1: 13042, 0: 8277})

Total samples: 37149


Unnamed: 0.1,Unnamed: 0,clean_comment,category,label
0,0,family mormon have never tried explain them t...,1,2
1,1,buddhism has very much lot compatible with chr...,1,2
2,2,seriously don say thing first all they won get...,-1,0
3,3,what you have learned yours and only yours wha...,0,1
4,4,for your own benefit you may want read living ...,1,2


In [4]:
# Balance the classes with ADASYN.
# ADASYN needs numeric features, so each comment is first turned into a
# TF-IDF vector (unigrams and bigrams, at most 500 features, min_df=2).

print("Converting text to TF-IDF embeddings for ADASYN...")
vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2), min_df=2)
tfidf_sparse = vectorizer.fit_transform(df['clean_comment'])
X_tfidf = tfidf_sparse.toarray()  # dense array, as passed to ADASYN below
y = df['label'].to_numpy()

print(f"TF-IDF shape: {X_tfidf.shape}")
print(f"Original class distribution: {Counter(y)}")

# Oversample with a fixed seed so the synthetic rows are reproducible
print("\nApplying ADASYN...")
adasyn = ADASYN(random_state=42, n_neighbors=5)
resampled_pair = adasyn.fit_resample(X_tfidf, y)
X_resampled, y_resampled = resampled_pair

print(f"Resampled shape: {X_resampled.shape}")
print(f"Resampled class distribution: {Counter(y_resampled)}")

Converting text to TF-IDF embeddings for ADASYN...
TF-IDF shape: (37149, 500)
Original class distribution: Counter({np.int64(2): 15830, np.int64(1): 13042, np.int64(0): 8277})

Applying ADASYN...
Resampled shape: (44838, 500)
Resampled class distribution: Counter({np.int64(2): 15830, np.int64(0): 15580, np.int64(1): 13428})


In [5]:
# ADASYN generates synthetic feature vectors, but BERT needs actual text.
# Each synthetic vector is therefore replaced by the text of its nearest
# original comment (cosine distance in TF-IDF space).
#
# Two safeguards compared with a single global nearest-neighbour lookup:
#   * Original rows map to themselves. A nearest-neighbour search is not a
#     reliable identity mapping: rows whose TF-IDF vector is all zeros, or
#     identical to another row's, have no unique nearest neighbour and would
#     be paired with some other comment while keeping their own label.
#   * Synthetic rows are matched only against originals of the SAME class, so
#     the borrowed text always carries the label assigned to the synthetic row.

print("Mapping synthetic samples to original text samples...")

n_original = X_tfidf.shape[0]

# The mapping below assumes the resampled arrays start with the original
# samples in their original order, followed by the synthetic ones. Verify the
# assumption on the labels instead of trusting it silently.
if not np.array_equal(y_resampled[:n_original], y):
    raise RuntimeError(
        "Expected the resampled data to begin with the original samples in their original order"
    )

synthetic_X = X_resampled[n_original:]
synthetic_y = y_resampled[n_original:]
synthetic_source = np.empty(len(synthetic_y), dtype=np.intp)

# Find nearest neighbors for synthetic samples, one class at a time
for class_label in np.unique(synthetic_y):
    class_rows = np.flatnonzero(y == class_label)      # positions of this class in df
    synthetic_mask = synthetic_y == class_label
    nbrs = NearestNeighbors(n_neighbors=1, metric='cosine').fit(X_tfidf[class_rows])
    _, indices = nbrs.kneighbors(synthetic_X[synthetic_mask])
    # Translate within-class neighbour positions back to positions in df
    synthetic_source[synthetic_mask] = class_rows[indices.ravel()]

# Positional index into df for every resampled row (originals first)
resampled_indices = np.concatenate([np.arange(n_original), synthetic_source])
resampled_texts = df['clean_comment'].iloc[resampled_indices].values
resampled_labels = y_resampled

# Create balanced dataframe
balanced_df = pd.DataFrame({
    'clean_comment': resampled_texts,
    'label': resampled_labels
})

print(f"\nBalanced dataset shape: {balanced_df.shape}")
print("Balanced class distribution:")
print(Counter(balanced_df['label']))

Mapping synthetic samples to original text samples...

Balanced dataset shape: (44838, 2)
Balanced class distribution:
Counter({2: 15830, 0: 15580, 1: 13428})


In [None]:
# Wrap the balanced frame in a HuggingFace Dataset (index reset to 0..n-1 first)
balanced_reset = balanced_df.reset_index(drop=True)
dataset = Dataset.from_pandas(balanced_reset)

# Load the tokenizer matching the BERT checkpoint that is fine-tuned below
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Tokenizer loaded successfully!")

In [None]:
# Tokenization callback used with Dataset.map
def preprocess_function(examples):
    """Tokenize the "clean_comment" entries of a batch.

    Truncation is enabled and padding is disabled in this call; the module-level
    `tokenizer` does the work and its result is returned unchanged.
    """
    comments = examples["clean_comment"]
    return tokenizer(comments, truncation=True, padding=False)

# Collator that pads each batch when it is assembled (tokenization itself
# applies no padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run the tokenization callback over the whole dataset in batches
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

print("Dataset tokenized successfully!")

In [None]:
# Split dataset into train, validation, and test sets
# Train: 80%, Validation: 10%, Test: 10% (measured in DISTINCT comment texts)
#
# The balanced dataset contains repeated comment texts (oversampled rows reuse
# the text of existing comments). A plain row-level random split would place
# copies of the same comment in both the training set and the validation/test
# sets, leaking evaluation data into training. Splitting by distinct text
# keeps every copy of a comment inside a single split.
# Consequence: row counts are only approximately 80/10/10.
# NOTE(review): validation and test rows still come from the oversampled data;
# for unbiased metrics, oversample only the training portion.
comment_texts = np.asarray(list(tokenized_dataset["clean_comment"]), dtype=object)
unique_texts, group_of_row = np.unique(comment_texts, return_inverse=True)
group_of_row = np.asarray(group_of_row).ravel()

n_groups = len(unique_texts)
n_train_groups = int(0.8 * n_groups)
n_eval_groups = (n_groups - n_train_groups) // 2

# Seeded shuffle of the distinct texts, then cut into three contiguous parts
split_rng = np.random.default_rng(42)
shuffled_groups = split_rng.permutation(n_groups)
train_groups, eval_groups, test_groups = np.split(
    shuffled_groups, [n_train_groups, n_train_groups + n_eval_groups]
)

# 0 = train, 1 = validation, 2 = test, first per distinct text and then per row
split_of_group = np.empty(n_groups, dtype=np.int8)
split_of_group[train_groups] = 0
split_of_group[eval_groups] = 1
split_of_group[test_groups] = 2
split_of_row = split_of_group[group_of_row]

train_dataset = tokenized_dataset.select(np.flatnonzero(split_of_row == 0))
eval_dataset = tokenized_dataset.select(np.flatnonzero(split_of_row == 1))
test_dataset = tokenized_dataset.select(np.flatnonzero(split_of_row == 2))

# Every row must land in exactly one split
assert len(train_dataset) + len(eval_dataset) + len(test_dataset) == len(tokenized_dataset)

print(f"Train Size: {len(train_dataset)}")
print(f"Validation Size: {len(eval_dataset)}")
print(f"Test Size: {len(test_dataset)}")

In [None]:
# Load BERT with a 3-way classification head.
# id2label is the single source of truth; label2id is its inverse.
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

classifier_kwargs = dict(
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

print("Model loaded successfully!")
print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")

In [None]:
# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn a (predictions, labels) pair into rounded summary metrics.

    The predicted class of each row is the argmax over axis 1 of the
    predictions. Returns a dict with "accuracy", "f1_macro" and "f1_weighted",
    each rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)

    raw_scores = {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1_macro": f1_score(labels, predicted_classes, average='macro'),
        "f1_weighted": f1_score(labels, predicted_classes, average='weighted'),
    }
    return {metric: round(score, 4) for metric, score in raw_scores.items()}

In [None]:
# Define training arguments
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Because the install cell
# does not pin a version, pick whichever name the installed TrainingArguments
# accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="bert-sentiment-3class-adasyn",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    # Evaluation and checkpointing both run per epoch; load_best_model_at_end
    # needs the two schedules to line up.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key produced by compute_metrics
    save_total_limit=2,
    seed=42,
    fp16=False,  # Set to True if using GPU with CUDA
    **{_eval_strategy_kwarg: "epoch"},
)

print("Training arguments configured!")

In [None]:
# Create Trainer
# Newer transformers releases take the tokenizer through `processing_class`;
# older ones take it through `tokenizer`. Because the install cell does not pin
# a version, choose the keyword the installed Trainer actually accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

print("Trainer created successfully!")

In [None]:
# Fine-tune the model; separator rules frame the training output
separator = "="*60
print("Starting training...")
print(separator)
trainer.train()
print(separator)
print("Training completed!")

In [None]:
# Score the held-out test split
rule = "="*60
print("Evaluating on test set...")
print(rule)
test_results = trainer.predict(test_dataset)
test_labels = test_results.label_ids
test_predictions = np.argmax(test_results.predictions, axis=1)  # class with highest score per row

# Per-class precision/recall/F1; names are listed in label-id order 0, 1, 2
target_names = ['Negative (-1)', 'Neutral (0)', 'Positive (1)']
report = classification_report(test_labels, test_predictions, target_names=target_names, digits=4)
print("\nClassification Report on Test Set:")
print(rule)
print(report)
print(rule)

# Headline numbers
print(f"\nTest Accuracy: {accuracy_score(test_labels, test_predictions):.4f}")
print(f"Test F1-Macro: {f1_score(test_labels, test_predictions, average='macro'):.4f}")
print(f"Test F1-Weighted: {f1_score(test_labels, test_predictions, average='weighted'):.4f}")

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
model_save_path = "./bert-sentiment-3class-final"
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_save_path)
print(f"Model saved to {model_save_path}")