In [None]:
!pip install imblearn

In [None]:
!pip install lightgbm

In [None]:
!pip install wordcloud

#### The notebook draws reference to https://www.kaggle.com/code/manankakkar/fastbert-mutlilabel-text-classification/notebook 

In [1]:
import json
import re 
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#Modelling
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [43]:
pd.set_option('display.max_rows', None)

In [2]:
nltk.download('stopwords')
stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruhwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
df_train = pd.read_csv("/Users/ruhwang/Desktop/AI/spring2025_courses/aipi540-dl/ig_post_generator/data/raw/captions_dataset/data/train.tsv", sep='\t', 
                       header=None, names=['Text', 'Class', 'ID'])
df_dev = pd.read_csv("/Users/ruhwang/Desktop/AI/spring2025_courses/aipi540-dl/ig_post_generator/data/raw/captions_dataset/data/dev.tsv", sep='\t', 
                     header=None, names=['Text', 'Class', 'ID'])

In [103]:
df_train.columns

Index(['Text', 'Class', 'ID'], dtype='object')

In [104]:
df_train['list_classes'] = df_train['Class'].apply(lambda x: x.split(','))
df_train['len_classes'] = df_train['list_classes'].apply(lambda x: len(x))
df_dev['list_classes'] = df_dev['Class'].apply(lambda x: x.split(','))
df_dev['len_classes'] = df_dev['list_classes'].apply(lambda x: len(x))

In [105]:
del df_train['Class']
del df_dev['Class']

In [106]:
emotion_file = open("/Users/ruhwang/Desktop/AI/spring2025_courses/aipi540-dl/ig_post_generator/data/raw/captions_dataset/data/emotions.txt", "r")
emotion_list = emotion_file.read()
emotion_list = emotion_list.split("\n")
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [107]:
def idx2class(idx_list):
    arr = []
    for i in idx_list:
        arr.append(emotion_list[int(i)])
    return arr

In [108]:
df_train['emotions'] = df_train['list_classes'].apply(idx2class)
df_dev['emotions'] = df_dev['list_classes'].apply(idx2class)

In [109]:
df_train['emotions'] = df_train['emotions'].apply(list)
df_dev['emotions'] = df_dev['emotions'].apply(list)

In [29]:
with open('/Users/ruhwang/Desktop/AI/spring2025_courses/aipi540-dl/ig_post_generator/data/raw/captions_dataset/data/ekman_mapping.json') as file:
    ekman_mapping = json.load(file)

In [95]:
ekman_mapping

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity']}

In [110]:
df_train['emotions'].value_counts()

emotions
[neutral]                                                      12823
[admiration]                                                    2710
[approval]                                                      1873
[gratitude]                                                     1857
[amusement]                                                     1652
[annoyance]                                                     1451
[love]                                                          1427
[disapproval]                                                   1402
[curiosity]                                                     1389
[anger]                                                         1025
[optimism]                                                       861
[confusion]                                                      858
[joy]                                                            853
[sadness]                                                        817
[surprise]               

In [111]:
# Create inverse mapping (sub-mood → main emotion)
inverse_mapping = {}
for main_emotion, sub_moods in ekman_mapping.items():
    for sub_mood in sub_moods:
        inverse_mapping[sub_mood] = main_emotion

# Function to check if any sub-mood in a list belongs to a main emotion category
def contains_emotion(mood_list, main_emotion):
    if not isinstance(mood_list, list):
        return 0
    return int(any(sub_mood in ekman_mapping[main_emotion] for sub_mood in mood_list))

# Create new columns for each main emotion
for main_emotion in ekman_mapping.keys():
    df_train[f'is_{main_emotion}'] = df_train['emotions'].apply(
        lambda x: contains_emotion(x, main_emotion)
    )

In [112]:
# Create new columns for each main emotion
for main_emotion in ekman_mapping.keys():
    df_dev[f'is_{main_emotion}'] = df_dev['emotions'].apply(
        lambda x: contains_emotion(x, main_emotion)
    )

In [114]:
train = df_train.copy(deep=True)
dev = df_dev.copy(deep=True)

In [115]:
# Split the list into separate columns
emotion_cols = df_train['emotions'].apply(pd.Series)

# Rename the columns
emotion_cols = emotion_cols.rename(columns=lambda x: f'emotion{x+1}' if x != 0 else 'emotion')

# Join the new columns back to the original DataFrame
df_train = pd.concat([df_train, emotion_cols], axis=1)
df_dev = pd.concat([df_dev, emotion_cols], axis=1)

In [116]:
df_train.head(5)

Unnamed: 0,Text,ID,list_classes,len_classes,emotions,is_anger,is_disgust,is_fear,is_joy,is_sadness,is_surprise,emotion,emotion2,emotion3,emotion4,emotion5
0,My favourite food is anything I didn't have to...,eebbqej,[27],1,[neutral],0,0,0,0,0,0,neutral,,,,
1,"Now if he does off himself, everyone will thin...",ed00q6i,[27],1,[neutral],0,0,0,0,0,0,neutral,,,,
2,WHY THE FUCK IS BAYLESS ISOING,eezlygj,[2],1,[anger],1,0,0,0,0,0,anger,,,,
3,To make her feel threatened,ed7ypvh,[14],1,[fear],0,0,1,0,0,0,fear,,,,
4,Dirty Southern Wankers,ed0bdzj,[3],1,[annoyance],1,0,0,0,0,0,annoyance,,,,


In [None]:
import re

def clean_text(text):
    """
    Cleans text by:
    - Converting to string (handles numbers, NaN, etc.)
    - Removing email addresses
    - Keeping only alphanumeric + basic punctuation
    - Trimming whitespace
    """
    if pd.isna(text):  # Handle NaN/None
        return ""
    
    # Force convert to string (e.g., numbers like 123 → "123")
    text = str(text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+\.\S+', '', text)
    
    # Keep only letters, numbers, and basic punctuation (.!?)
    # Adjust regex as needed for your use case
    text = re.sub(r'[^a-zA-Z0-9\s.!?,]', '', text)
    
    # Collapse multiple spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [None]:
# Apply to DataFrame
df_train["Text"] = df_train["Text"].apply(clean_text)
df_dev["Text"] = df_dev["Text"].apply(clean_text)

In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model (e.g., 'all-MiniLM-L6-v2' for fast & decent performance)
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
sentences = df_train["Text"].tolist()  # Convert text column to a list of sentences
embeddings = model.encode(sentences, convert_to_tensor=False)  # Returns numpy array

# Convert embeddings to DataFrame
embedding_df = pd.DataFrame(embeddings, columns=[f"embedding_{i}" for i in range(embeddings.shape[1])])

# Combine with original DataFrame
df_train = pd.concat([df_train.reset_index(drop=True), embedding_df], axis=1)

In [127]:
df_dev["Text"].unique()

array(['Is this in New Orleans?? I really feel like this is New Orleans.',
       'You know the answer man, you are programmed to capture those codes they send you, don’t avoid them!',
       "I've never been this sad in my life!", ...,
       'I can’t stand this arrogant prick he’s no better thenFord in Ontario and that guy is a buffoon',
       '::but I like baby bangs:: /tiny voice', nan], dtype=object)

In [130]:
dev_embeddings = model.encode(df_dev["Text"].tolist(), convert_to_tensor=False)

# Convert embeddings to DataFrame
dev_embeddings = pd.DataFrame(dev_embeddings, columns=[f"embedding_{i}" for i in range(dev_embeddings .shape[1])])

# Combine with original DataFrame
df_dev = pd.concat([df_dev.reset_index(drop=True), dev_embeddings], axis=1)

In [125]:
df_train.head(5)

Unnamed: 0,Text,ID,list_classes,len_classes,emotions,is_anger,is_disgust,is_fear,is_joy,is_sadness,...,embedding_374,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383
0,My favourite food is anything I didn't have to...,eebbqej,[27],1,[neutral],0,0,0,0,0,...,0.052991,-0.005411,-0.008929,0.033815,0.088385,0.03236,0.071984,0.125724,-0.092886,-0.093087
1,"Now if he does off himself, everyone will thin...",ed00q6i,[27],1,[neutral],0,0,0,0,0,...,0.156789,-0.04431,0.001823,0.040813,-0.045302,0.013656,0.008513,-0.017755,-0.019391,-0.056808
2,WHY THE FUCK IS BAYLESS ISOING,eezlygj,[2],1,[anger],1,0,0,0,0,...,0.056932,0.046582,0.066673,0.010786,0.001259,0.135211,0.034313,-0.071771,0.050022,0.021015
3,To make her feel threatened,ed7ypvh,[14],1,[fear],0,0,1,0,0,...,0.000347,-0.019538,0.042923,-0.080768,-0.020088,0.080839,-0.061527,0.007211,-0.015224,-0.026027
4,Dirty Southern Wankers,ed0bdzj,[3],1,[annoyance],1,0,0,0,0,...,0.033567,-0.063048,0.051423,0.020454,0.024418,-0.005523,-0.018856,0.025774,-0.024104,-0.016645


In [None]:
from sklearn.decomposition import PCA

# Reduce to 50 dimensions
pca = PCA(n_components=100)
reduced_embeddings_train = pca.fit_transform(embeddings)
reduced_embeddings_dev = pca.fit_transform(dev_embeddings)

# Add to DataFrame
for i in range(100):
    df_train[f"pca_embedding_{i}"] = reduced_embeddings_train[:, i]
    df_dev[f"pca_embedding_{i}"] = reduced_embeddings_dev[:, i]

In [135]:
df_dev.head(5)

Unnamed: 0,Text,ID,list_classes,len_classes,emotions,is_anger,is_disgust,is_fear,is_joy,is_sadness,...,pca_embedding_90,pca_embedding_91,pca_embedding_92,pca_embedding_93,pca_embedding_94,pca_embedding_95,pca_embedding_96,pca_embedding_97,pca_embedding_98,pca_embedding_99
0,Is this in New Orleans?? I really feel like th...,edgurhb,[27],1.0,[neutral],0.0,0.0,0.0,0.0,0.0,...,-0.006211,0.020302,-0.065543,-0.055371,0.011569,-0.008038,-0.053903,-0.083935,0.001317,0.062415
1,"You know the answer man, you are programmed to...",ee84bjg,"[4, 27]",2.0,"[approval, neutral]",0.0,0.0,0.0,1.0,0.0,...,-0.090453,-0.002191,-0.005424,0.086313,0.041064,0.001798,-0.03163,0.066323,0.086109,0.103127
2,Ive never been this sad in my life!,edcu99z,[25],1.0,[sadness],0.0,0.0,0.0,0.0,1.0,...,-0.018886,0.01031,0.088331,0.0141,0.021493,-0.072843,0.003108,-0.002447,-0.130587,0.032121
3,The economy is heavily controlled and subsidiz...,edc32e2,"[4, 27]",2.0,"[approval, neutral]",0.0,0.0,0.0,1.0,0.0,...,-0.052336,0.022836,-0.17931,0.037226,-0.08158,-0.011912,-0.008295,-0.049886,-0.011118,-0.015673
4,He could have easily taken a real camera from ...,eepig6r,[20],1.0,[optimism],0.0,0.0,0.0,1.0,0.0,...,-0.124876,-0.082038,-0.031849,-0.051353,0.02002,-0.001773,-0.029538,-0.017921,0.001923,-0.049744


In [None]:
emotion_columns = ['emotion2', 'emotion3', 'emotion4', 'emotion5']
df_train[emotion_columns].fillna(0, inplace=True)
df_dev[emotion_columns].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[emotion_columns].fillna(0, inplace=True)


In [160]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

In [142]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_train["emotions"])  # Converts to binary matrix
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.metrics import classification_report


# 1. Get emotion distribution statistics
emotion_counts = df_train[['is_anger', 'is_disgust', 'is_fear', 'is_joy', 'is_sadness', 'is_surprise']].sum()
emotion_distribution = {col.replace('is_', ''): count for col, count in emotion_counts.items()}
print("Emotion Distribution:\n", emotion_distribution)

# 2. Prepare features and target
to_keep = [c for c in df_train.columns if c.startswith('pca')]
X = df_train[to_keep]
y = df_train[['is_anger', 'is_disgust', 'is_fear', 'is_joy', 'is_sadness', 'is_surprise']]

# 3. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train LightGBM with automatic class balancing
lgbm = MultiOutputClassifier(
    LGBMClassifier(
        n_jobs=-1,
        class_weight='balanced',  # Let LightGBM handle class balancing automatically
        random_state=42,
        verbose=-1
    )
)

# Fit the model
lgbm.fit(X_train, y_train)

# 5. Predict
y_pred = lgbm.predict(X_test)

# 6. Convert predictions to emotion names
emotion_columns = y.columns.tolist()

def get_emotion_names(binary_row):
    return [emotion.replace('is_', '') for emotion, val in zip(emotion_columns, binary_row) if val == 1]

predicted_emotions = [get_emotion_names(row) for row in y_pred]
print("\nSample Predictions:")
for i, emotions in enumerate(predicted_emotions[:5]):  # Show first 5 predictions
    print(f"Sample {i+1}: {emotions if emotions else ['neutral']}")

# 7. Evaluate performance
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=emotion_columns))

Emotion Distribution:
 {'anger': 5579, 'disgust': 793, 'fear': 726, 'joy': 17410, 'sadness': 3263, 'surprise': 5367}

Sample Predictions:
Sample 1: ['neutral']
Sample 2: ['neutral']
Sample 3: ['joy']
Sample 4: ['joy']
Sample 5: ['joy']

Classification Report:
              precision    recall  f1-score   support

    is_anger       0.31      0.65      0.42      1096
  is_disgust       0.24      0.41      0.30       156
     is_fear       0.39      0.48      0.43       143
      is_joy       0.75      0.72      0.74      3531
  is_sadness       0.31      0.59      0.41       651
 is_surprise       0.27      0.55      0.36      1061

   micro avg       0.46      0.66      0.54      6638
   macro avg       0.38      0.57      0.44      6638
weighted avg       0.54      0.66      0.57      6638
 samples avg       0.42      0.47      0.43      6638



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Clear 

In [139]:
to_keep = [c for c in df_train.columns if c.startswith('pca')]

In [None]:
train = df_train[to_keep]
val = df_dev[to_keep]
# df_train.drop(columns=to_del, inplace=True) # but to_del is not easily defined here

In [155]:
x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=0)

In [156]:
y_train

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [157]:
# Sum columns to get counts per emotion
emotion_counts = np.sum(y_train, axis=0)

# Map counts to emotion labels (using mlb.classes_)
emotion_distribution = dict(zip(mlb.classes_, emotion_counts))

In [None]:
df_train['emotions'].value_counts()

In [158]:
emotion_distribution 

{'admiration': 3304,
 'amusement': 1887,
 'anger': 1254,
 'annoyance': 2019,
 'approval': 2356,
 'caring': 873,
 'confusion': 1121,
 'curiosity': 1756,
 'desire': 526,
 'disappointment': 1009,
 'disapproval': 1634,
 'disgust': 635,
 'embarrassment': 251,
 'excitement': 664,
 'fear': 488,
 'gratitude': 2137,
 'grief': 58,
 'joy': 1168,
 'love': 1705,
 'nervousness': 129,
 'neutral': 11279,
 'optimism': 1253,
 'pride': 92,
 'realization': 896,
 'relief': 123,
 'remorse': 446,
 'sadness': 1053,
 'surprise': 835}

In [None]:
# from sklearn.multioutput import MultiOutputClassifier
# lgbm = MultiOutputClassifier(LGBMClassifier(n_jobs=-1)) # Shortname the LGBMClassifier()
lgbm.fit(x_train, y_train) # Train the lgbm on train sets

In [176]:
param_grid = {
    'estimator': [100,200,300,400],
    'learning_rate': [0.01,0.1,0.2],
    'min_child_samples': [20,50,100,200,500],
    'subsample': [0.5,0.7,0.8,1],
    'colsample_bytree': [0.5,0.7,0.8,1],
    "max_depth": [5,10,20,30]
}

In [170]:
from sklearn.model_selection import GridSearchCV

In [180]:
# 1. Create the base estimator with fixed parameters
base_lgbm = LGBMClassifier(
    n_jobs=-1,
    random_state=42,
    verbose=-1
)

# 2. Create the MultiOutputClassifier wrapper
multi_lgbm = MultiOutputClassifier(base_lgbm)

# 3. Define the parameter grid with the estimator__ prefix
param_grid = {
    'estimator__num_leaves': [30, 60],
    'estimator__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [5, 10, 20],
    'estimator__min_child_samples': [50, 100, 200]
}

# 4. Create GridSearchCV
grid_search = GridSearchCV(
    estimator=multi_lgbm,
    param_grid=param_grid,
    cv=2,
    scoring='f1_macro',  # Use appropriate multi-label metric
    verbose=2,
    n_jobs=1  # Avoid nested parallelism issues
)

# 5. Fit the grid search
grid_search.fit(x_train, y_train)

# 6. Get results
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 2 folds for each of 270 candidates, totalling 540 fits
[CV] END estimator__learning_rate=0.001, estimator__max_depth=5, estimator__min_child_samples=50, estimator__n_estimators=100, estimator__num_leaves=30; total time=   2.4s
[CV] END estimator__learning_rate=0.001, estimator__max_depth=5, estimator__min_child_samples=50, estimator__n_estimators=100, estimator__num_leaves=30; total time=   2.4s
[CV] END estimator__learning_rate=0.001, estimator__max_depth=5, estimator__min_child_samples=50, estimator__n_estimators=100, estimator__num_leaves=60; total time=   2.3s
[CV] END estimator__learning_rate=0.001, estimator__max_depth=5, estimator__min_child_samples=50, estimator__n_estimators=100, estimator__num_leaves=60; total time=   2.4s
[CV] END estimator__learning_rate=0.001, estimator__max_depth=5, estimator__min_child_samples=50, estimator__n_estimators=200, estimator__num_leaves=30; total time=   4.4s
[CV] END estimator__learning_rate=0.001, estimator__max_depth=5, estimator__m

In [None]:
# Get probabilities for each emotion class (shape: [n_samples, n_emotions])
y_proba = np.array([estimator.predict_proba(X_new)[:, 1] for estimator in lgbm.estimators_]).T

threshold = 0.3  # Adjust based on your needs
y_pred_binary = (y_proba >= threshold).astype(int)

emotion_columns = ['is_anger', 'is_disgust', 'is_fear', 'is_joy', 'is_sadness', 'is_surprise']

# Create DataFrame of probabilities
prob_df = pd.DataFrame(y_proba, columns=[f"prob_{col.replace('is_', '')}" for col in emotion_columns])

# Add predicted emotions (using threshold)
prob_df['predicted_emotions'] = [
    [emotion.replace('is_', '') for emotion, prob in zip(emotion_columns, row) if prob >= threshold]
    for row in y_proba
]

In [None]:
predicted_emotions = mlb.inverse_transform(y_pred)