# <center>Moodify Pipelining</center>

In [1]:
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
pd.set_option('display.max_column', None)




In [2]:
# spliting the dataset into test and split                                   done
# separate the categorical and numerical features                            done
# Data cleaning the dataset, some columns are not in the proper format       done
# Data preprocessing for caegorical and numerical data                        done
# feature engineering                                                         done
# combining the Data preprocessing process for both                           done

In [3]:
# splittying the dataset into test and split

df = pd.read_csv('278k_song_labelled.csv', index_col='Unnamed: 0')
X = df.drop('labels', axis=1)
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [4]:

# Identify numerical and categorical columns
#num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
#cat_cols = X_train.select_dtypes(include=['object']).columns

In [5]:
df.columns

Index(['duration (ms)', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'spec_rate', 'labels'],
      dtype='object')

In [6]:
numerical_cols = ['duration (ms)', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'spec_rate']

In [7]:
# feaure engineering class

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.feature_eng(X)

    @staticmethod
    def feature_eng(df3):
        #scale loudness column
        df3['loudness'] = df3['loudness'] / df3['loudness'].max()

        #scale spec_rate column
        df3['spec_rate'] = df3['spec_rate'] * 1000000

        #Engineering the feature that affect the emotion label to caption interaction
        df3['all_six'] = df3['danceability'] * df3['energy'] * df3['loudness'] * df3['acousticness'] * df3['instrumentalness'] * df3['valence']
        df3['DE'] = df3['danceability'] * df3['energy']
        df3['DL'] = df3['danceability'] * df3['loudness']
        df3['DV'] = df3['danceability'] * df3['valence']
        df3['EL'] = df3['energy'] + df3['loudness']
        df3['EA'] = df3['energy'] * df3['acousticness']
        df3['EV'] = df3['energy'] * df3['valence']
        df3['LA'] = df3['loudness'] * df3['acousticness'] 
        df3['LI'] = df3['loudness'] * df3['instrumentalness']
        df3['LV'] = df3['loudness'] * df3['valence']
        df3['AI'] = df3['acousticness'] * df3['instrumentalness']
        df3['IV'] = df3['instrumentalness'] * df3['valence']
        df3['TL'] = df3['tempo'] * df3['loudness']
        df3['TA'] = df3['tempo'] * df3['acousticness']
        return df3

In [8]:
# Define preprocessing for feature engineering
numerical_transformer_feature_engineering = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering())
])

numerical_transformer_feature_engineering

In [9]:
# Define preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('scaler', RobustScaler())
])

numerical_transformer

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('feat_enginering', numerical_transformer_feature_engineering, X_train.columns),
        ('num', numerical_transformer, numerical_cols)
    ])

In [11]:
preprocessor

In [12]:
# Define the model
model = XGBClassifier({'n_estimators': 532, 'max_depth': 4, 'learning_rate': 0.24326009625329156, 'subsample': 0.736214653492401, 'colsample_bytree': 0.908850163356979, 'gamma': 0.00010778044237254376, 'reg_alpha': 0.005929616007862493, 'reg_lambda': 0.4347499942773969, 'min_child_weight': 5})

# Create and evaluate the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

In [None]:
#Best parameters: {'iterations': 995, 'depth': 10, 'learning_rate': 0.05517552052238403, 'l2_leaf_reg': 8.105159370607822, 'border_count': 147, 'random_strength': 4.1228002436767275, 'bagging_temperature': 0.18025994923144195}

In [13]:
from catboost import CatBoostClassifier

catboost_params = {
    'iterations': 995, 
    'border_count': 147,
    'bagging_temperature': 0.1802599492,
    'random_strength': 4.122800244,
    'depth': 10,
    'l2_leaf_reg': 8.105159371,
    'learning_rate': 0.05517552052,
    'verbose':0
}

# Create the classifier with valid params
cat_model = CatBoostClassifier(**catboost_params)

In [14]:
# Create and evaluate the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', cat_model)
])

In [15]:
pipeline

In [16]:
# Train the model
pipeline.fit(X_train, y_train)

In [17]:
y_pred_test = pipeline.predict(X_test)

In [19]:
y_pred_test[:10]

array([[1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [0],
       [2],
       [1]], dtype=int64)

In [22]:
y_test[:10]

100080    1
106132    1
81729     1
130669    0
123203    2
169280    2
76234     2
265419    0
59451     2
233463    1
Name: labels, dtype: int64

In [23]:
#Evaluate the model
print("Training Data Evaluation:")
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))
print("Accuracy:", accuracy_score(y_test, y_pred_test))


Training Data Evaluation:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     16412
           1       0.96      0.96      0.96     21286
           2       0.95      0.94      0.94      9413
           3       0.98      0.98      0.98      8477

    accuracy                           0.96     55588
   macro avg       0.96      0.96      0.96     55588
weighted avg       0.96      0.96      0.96     55588

[[15814   403    58   137]
 [  373 20472   403    38]
 [   68   506  8830     9]
 [  138    45     5  8289]]
Accuracy: 0.9607289343023674


### Saving our model as a file

In [24]:
import joblib
# Save the pipeline
joblib.dump(pipeline, 'model_pipeline.pkl', compress=3)

# Load the pipeline
pipeline = joblib.load('model_pipeline.pkl')

In [25]:
# Use the model
predictions = pipeline.predict(X_test)
print(predictions)

[[1]
 [1]
 [1]
 ...
 [0]
 [2]
 [0]]
