In [82]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer


import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report


In [83]:
df = pd.read_csv("https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv")
# df.to_csv("data.csv", index = False)

In [84]:
df.shape

(40000, 3)

In [85]:
df.drop(columns = ["tweet_id"], inplace = True)

final_df = df[df['sentiment'].isin(["happiness", "sadness"])]
final_df['sentiment'].replace({"happiness": 1, "sadness": 0}, inplace = True)

final_df.shape

train_data, test_data = train_test_split(final_df, test_size = 0.2, random_state = 42)
train_data.shape, test_data.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['sentiment'].replace({"happiness": 1, "sadness": 0}, inplace = True)
  final_df['sentiment'].replace({"happiness": 1, "sadness": 0}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment'].replace({"happiness": 1, "sadness": 0}, inplace = True)


((8299, 2), (2075, 2))

In [86]:
# transform the data
nltk.download("wordnet")
nltk.download("stopwords")

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()

    text = text.split()
    text = [lemmatizer.lemmatize(y) for y in text]

    return " ".join(text)

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))

    text = [i for i in str(text).split() if i not in stop_words]

    return " ".join(text)

def remove_numbers(text):
    text = re.sub(r"\d+", "", text)
    return text

def lower_case(text):
    text = text.split()

    text = [y.lower() for y in text]
    return " ".join(text)

def remove_punctuation(text):
    text = "".join([i for i in text if i not in string.punctuation])

    text = re.sub("\s+", " ", text)
    text = " ".join(text.split())   
    return text.strip()


def remove_urls(text):
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub(r"", text)

def remove_small_sentences(df):
    df = df[df['content'].str.len() >= 3].reset_index(drop=True)
    return df

def normalize_text(df):
    df.content = df.content.apply(lambda content: lower_case(content))
    df.content = df.content.apply(lambda content: remove_stopwords(content))
    df.content = df.content.apply(lambda content: remove_numbers(content))
    df.content = df.content.apply(lambda content: remove_punctuation(content))
    df.content = df.content.apply(lambda content: remove_urls(content))
    df.content = df.content.apply(lambda content: lemmatization(content))
    return df

# def normalized_sentence(sentence):
#     sentence = lower_case(sentence)
#     sentence = remove_stopwords(sentence)
#     sentence = remove_numbers(sentence)
#     sentence = remove_punctuation(sentence)
#     sentence = remove_urls(sentence)
#     sentence = lemmatization(sentence)
#     return sentence

train_processed_data = normalize_text(train_data)
test_processed_data = normalize_text(test_data)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\senor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\senor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
train_data.fillna("", inplace=True)
test_data.fillna("", inplace = True)

X_train = train_data["content"].values
y_train = train_data['sentiment'].values

X_test = test_data["content"].values
y_test = test_data['sentiment'].values

In [88]:
y_train

array([0, 0, 0, ..., 1, 0, 0], shape=(8299,))

In [90]:
# applybag of words
vectorizer = CountVectorizer()
vectorizer = CountVectorizer(max_features = 500) # limit the number of features to 5000
# print(X_train_bow.shape, X_test_bow.shape)

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)


In [91]:
X_train_bow.toarray()

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], shape=(8299, 500))

In [92]:
train_df = pd.DataFrame(X_train_bow.toarray())
test_df = pd.DataFrame(X_test_bow.toarray())  

In [93]:
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
8295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8296,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
train_df['label'] = y_train
test_df['label'] = y_test

In [95]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:

X_train = train_df.iloc[:, :-1].values
y_train = train_df.iloc[:, -1].values
X_test = test_df.iloc[:, :-1].values
y_test = test_df.iloc[:, -1].values


In [97]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8299, 500), (8299,), (2075, 500), (2075,))

In [98]:
# fit the model
model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,50
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [99]:

y_pred = model.predict(X_test)
y_pred



array([0, 0, 0, ..., 0, 0, 1], shape=(2075,))

In [101]:
y_pred_proba = model.predict_proba(X_test)
y_pred_proba

array([[0.51051625, 0.48948375],
       [0.74471521, 0.25528479],
       [0.82236993, 0.17763007],
       ...,
       [0.51051625, 0.48948375],
       [0.74471521, 0.25528479],
       [0.19677635, 0.80322365]], shape=(2075, 2))

In [102]:
y_pred_proba[:, 1]

array([0.48948375, 0.25528479, 0.17763007, ..., 0.48948375, 0.25528479,
       0.80322365], shape=(2075,))

In [None]:
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
# evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

In [67]:
accuracy, precision, recall, roc_auc

(0.7214457831325302,
 0.8134863701578192,
 0.5586206896551724,
 0.8177288781485269)

In [70]:
# Define and train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [71]:
# Make predictions
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

In [72]:
# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [77]:
accuracy, precision, recall, auc

(0.7628915662650603,
 0.7180984153461217,
 0.8482758620689655,
 0.8564216005204945)

In [75]:
classification_rep

'              precision    recall  f1-score   support\n\n           0       0.82      0.68      0.75      1060\n           1       0.72      0.85      0.78      1015\n\n    accuracy                           0.76      2075\n   macro avg       0.77      0.76      0.76      2075\nweighted avg       0.77      0.76      0.76      2075\n'

In [37]:
models_dict = {"model_building":
    {"models": 
               {"GradientBoosting": {
                  "n_estimators": 200,
                  "learning_rate": 0.1 
               }, 
               "RandomForest": {
                   "n_estimators": 100,
                   "max_depth": 3
               }}
               }
}

In [38]:
models_config = models_dict.get("model_building").get("models")
models_config

{'GradientBoosting': {'n_estimators': 200, 'learning_rate': 0.1},
 'RandomForest': {'n_estimators': 100, 'max_depth': 3}}

In [39]:
list(models_config.keys())

['GradientBoosting', 'RandomForest']

In [40]:
def get_model(model_name: str, config: dict):
    """Factory function to create models dynamically based on params.yaml."""
    if model_name == "GradientBoosting":
        return GradientBoostingClassifier(**config, random_state=42)
    elif model_name == "RandomForest":
        return RandomForestClassifier(**config, random_state=42)
    else:
        raise ValueError(f"Unsupported model: {model_name}")

In [44]:
for model_name, config in models_config.items():
    # model = get_model(model_name, config)
    print(model_name, config)

GradientBoosting {'n_estimators': 200, 'learning_rate': 0.1}
RandomForest {'n_estimators': 100, 'max_depth': 3}


In [46]:
config = {'n_estimators': 200, 'learning_rate': 0.1}

In [3]:
models_dict.items()

dict_items([('models', {'GradientBoosting': {'n_estimators': 200, 'learning_rate': 0.1}, 'RandomForest': {'n_estimators': 100, 'max_depth': 3}})])

In [6]:
list(models_dict.keys())

['models']

In [15]:
list(models_dict.values())

[{'GradientBoosting': {'n_estimators': 200, 'learning_rate': 0.1},
  'RandomForest': {'n_estimators': 100, 'max_depth': 3}}]

In [9]:
model_config = models_dict.get("models")
model_config

{'GradientBoosting': {'n_estimators': 200, 'learning_rate': 0.1},
 'RandomForest': {'n_estimators': 100, 'max_depth': 3}}

In [11]:
list(model_config.keys())

['GradientBoosting', 'RandomForest']

In [12]:
model_config.items()

dict_items([('GradientBoosting', {'n_estimators': 200, 'learning_rate': 0.1}), ('RandomForest', {'n_estimators': 100, 'max_depth': 3})])

In [52]:
import os

In [79]:
# for model in os.path.join("data", "models"):
for file_name in os.listdir(os.getcwd()):
    if file_name.endswith(".py"):
        print(file_name) 

data_ingestion.py
data_preprocessing.py
feature_engineering.py
model_building.py
model_evaluation.py


In [63]:
print(os.getcwd())

c:\Users\senor\OneDrive\Desktop\ds\campusx\mlops\ml_pipeline_dvc\src


In [81]:
for file in os.listdir("./data/raw"):
    print(file)

test.csv
train.csv


In [2]:
import os
os.chdir(r"C:\Users\senor\OneDrive\Desktop\ds\campusx\mlops\ml_pipeline_dvc\data\metrics")

In [3]:
import json
import pandas as pd

with open(r"metrics_summary.json") as f:
    metrics = json.load(f)

In [4]:
metrics

{'gradientboosting_latest': {'accuracy': 0.7383132530120482,
  'precision': 0.8010204081632653,
  'recall': 0.6187192118226601,
  'roc_auc': 0.8415317408681104,
  'evaluated_at': '2025-09-11T16:02:25.103540'},
 'logisticregression_latest': {'accuracy': 0.7932530120481928,
  'precision': 0.7795801526717557,
  'recall': 0.8049261083743843,
  'roc_auc': 0.8719681197137281,
  'evaluated_at': '2025-09-11T16:02:25.260685'},
 'randomforest_latest': {'accuracy': 0.6973493975903614,
  'precision': 0.6257309941520468,
  'recall': 0.948768472906404,
  'roc_auc': 0.8327762803234502,
  'evaluated_at': '2025-09-11T16:02:25.313307'},
 'xgboost_latest': {'accuracy': 0.7354216867469879,
  'precision': 0.7979539641943734,
  'recall': 0.6147783251231527,
  'roc_auc': 0.8352444465098987,
  'evaluated_at': '2025-09-11T16:02:26.824969'}}

In [5]:
pd.DataFrame(metrics).T

Unnamed: 0,accuracy,precision,recall,roc_auc,evaluated_at
gradientboosting_latest,0.738313,0.80102,0.618719,0.841532,2025-09-11T16:02:25.103540
logisticregression_latest,0.793253,0.77958,0.804926,0.871968,2025-09-11T16:02:25.260685
randomforest_latest,0.697349,0.625731,0.948768,0.832776,2025-09-11T16:02:25.313307
xgboost_latest,0.735422,0.797954,0.614778,0.835244,2025-09-11T16:02:26.824969


In [1]:
import os
os.getcwd()

'c:\\Users\\senor\\OneDrive\\Desktop\\ds\\campusx\\mlops\\ml_pipeline_dvc_cookiecutter\\emotion_detection\\notebooks'

In [None]:
import os
os.chdir(r"C:\Users\senor\OneDrive\Desktop\ds\campusx\mlops\ml_pipeline_dvc\data\metrics")

In [5]:
os.chdir("..")

In [6]:
os.getcwd()

'c:\\Users\\senor\\OneDrive\\Desktop\\ds\\campusx\\mlops\\ml_pipeline_dvc_cookiecutter\\emotion_detection'

In [7]:
import os
os.chdir(os.path.join(os.getcwd(), "reports", "metrics"))

In [8]:
import json
import pandas as pd

with open(r"metrics_summary.json") as f:
    metrics = json.load(f)

In [9]:
metrics

{'gradientboosting_latest': {'accuracy': 0.7383132530120482,
  'precision': 0.8010204081632653,
  'recall': 0.6187192118226601,
  'roc_auc': 0.8415317408681104,
  'evaluated_at': '2025-09-12T02:03:39.366541'},
 'logisticregression_latest': {'accuracy': 0.7932530120481928,
  'precision': 0.7795801526717557,
  'recall': 0.8049261083743843,
  'roc_auc': 0.8719681197137281,
  'evaluated_at': '2025-09-12T02:03:39.539637'},
 'randomforest_latest': {'accuracy': 0.6973493975903614,
  'precision': 0.6257309941520468,
  'recall': 0.948768472906404,
  'roc_auc': 0.8327762803234502,
  'evaluated_at': '2025-09-12T02:03:39.616180'},
 'xgboost_latest': {'accuracy': 0.7354216867469879,
  'precision': 0.7979539641943734,
  'recall': 0.6147783251231527,
  'roc_auc': 0.8352444465098987,
  'evaluated_at': '2025-09-12T02:03:40.514218'}}

In [10]:
pd.DataFrame(metrics).T

Unnamed: 0,accuracy,precision,recall,roc_auc,evaluated_at
gradientboosting_latest,0.738313,0.80102,0.618719,0.841532,2025-09-12T02:03:39.366541
logisticregression_latest,0.793253,0.77958,0.804926,0.871968,2025-09-12T02:03:39.539637
randomforest_latest,0.697349,0.625731,0.948768,0.832776,2025-09-12T02:03:39.616180
xgboost_latest,0.735422,0.797954,0.614778,0.835244,2025-09-12T02:03:40.514218
