In [43]:
# All third-party dependencies used by the notebook, imported once up front.
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# NOTE(review): no trailing comma after the last name -- a dangling comma in an
# unparenthesised `from ... import` is a SyntaxError. If a further preprocessing
# class was meant to be imported here, add it explicitly.
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import joblib

In [51]:
# Read the feature table from disk and preview its first ten rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index written out by an earlier export -- confirm before treating it as a feature.
df = pd.read_csv(filepath_or_buffer='DATASET-balanced.csv')
df.head(n=10)

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,LABEL
0,0.338055,0.027948,2842.948867,4322.916759,6570.586186,0.04105,-462.169586,90.311272,19.073769,24.046888,...,-6.686564,0.902086,-7.251551,-1.198342,4.747403,-4.986279,0.953935,-5.013138,-6.77906,FAKE
1,0.443766,0.037838,2336.129597,3445.777044,3764.949874,0.04773,-409.413422,120.348808,-7.161531,5.114784,...,-2.131157,-6.876417,-1.359395,0.326401,-5.420016,-2.109968,-1.757634,-9.537907,-8.494421,FAKE
2,0.302528,0.056578,2692.988386,2861.13318,4716.610271,0.080342,-318.996033,120.490273,-24.625771,23.891073,...,-5.853725,-3.724773,-6.627182,-5.117002,-6.072106,-0.994653,-1.61712,-3.922354,-7.033001,FAKE
3,0.319933,0.031504,2241.665382,3503.766175,3798.641521,0.04718,-404.636749,136.320908,2.308172,-3.907071,...,-1.898315,-2.046493,-7.176277,-3.293508,4.209121,0.121835,-5.407063,-3.654926,-3.274857,FAKE
4,0.420055,0.016158,2526.069123,3102.659519,5025.077899,0.051905,-410.497925,152.7314,-18.266771,51.993462,...,-1.95234,0.810868,6.238493,6.555839,7.535542,2.849219,2.616843,-1.793357,-5.060998,FAKE
5,0.44288,0.012317,3952.880304,3702.717829,7104.089991,0.115387,-498.179657,97.245255,-21.382017,45.624386,...,-6.466366,2.856888,-0.157036,-2.824058,0.383832,-1.089466,2.998828,-5.218136,-1.423959,FAKE
6,0.453897,0.021782,4178.07215,3698.644769,7508.242075,0.131647,-410.383087,97.026733,-22.508402,49.689598,...,-5.320498,-4.209944,-2.051953,-0.714617,-7.889741,-0.377954,-8.421499,-5.800948,-12.060839,FAKE
7,0.474154,0.011107,3993.039753,3948.154333,7872.563956,0.114879,-440.81897,103.029533,-21.213911,49.066498,...,2.81023,-2.150585,-1.529685,3.354003,-6.589828,5.678379,-2.742477,-5.159016,-10.969421,FAKE
8,0.60269,0.00097,3815.431438,3992.517515,6887.564689,0.121769,-539.965088,94.81163,-51.38866,33.196476,...,-1.619109,10.417936,-9.675606,-0.937404,0.909395,4.827445,5.055848,-3.618021,5.053717,FAKE
9,0.453962,0.017612,2894.560788,3435.434131,5663.232422,0.065904,-403.053009,138.07962,-20.038134,39.301231,...,0.496345,-0.003924,0.801856,3.294856,-5.852333,-1.849402,-7.108891,-0.818681,-7.824217,FAKE


In [45]:
# Target: the string labels in LABEL, encoded to integers. `le` is kept so the
# integer codes can be mapped back to the original label strings later.
y = df['LABEL']
le = LabelEncoder()
y = le.fit_transform(y)

# Features: everything except the target and the 'Unnamed: 0' column. That column
# appears to be a saved row index rather than an audio feature; leaving it in lets
# the model key on row position, which leaks the label if the file is ordered by
# class. errors='ignore' keeps the cell working when the index column is absent
# (LABEL is already guaranteed to exist by the lookup above).
X = df.drop(columns=['LABEL', 'Unnamed: 0'], errors='ignore')

# Column groups consumed by the preprocessing ColumnTransformer.
numeric_columns = X.select_dtypes(include='number').columns
categorical_columns = X.select_dtypes(include='object').columns

# Hold out 20% of the rows for the final evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; columns in neither group are passed
# through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',
)

In [47]:
# End-to-end model: preprocessing followed by an XGBoost classifier with default
# settings. The step names are referenced by the 'classifier__*' grid-search keys.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

In [48]:
"""
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 6, 9],
    'classifier__min_child_weight': [1, 5, 10],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.7, 0.85, 1.0],
    'classifier__gamma': [0, 0.1, 0.5],
    'classifier__reg_alpha': [0, 0.01, 0.1, 1],
    'classifier__reg_lambda': [1, 0.1, 0.01, 0]
"""

# Define hyperparameters for grid search
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Number of boosting rounds
    'classifier__max_depth': [3, 6, 9],           # Maximum depth of the tree
    'classifier__learning_rate': [0.2, 0.1, 0.05, 0.001],  # Learning rate
    'classifier__min_child_weight': [1, 5, 10],
}
# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    cv = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=42
    ),
    n_jobs=5
)  # 5-fold cross-validation

In [49]:
# Run the cross-validated grid search on the training split, then score the
# selected model on the held-out test split.
grid_search.fit(X_train, y_train)
predictions = grid_search.predict(X_test)

# Label the report rows with the original class names rather than the encoded
# integers, so each row can be read without looking up the encoding. `labels`
# is given explicitly so the names stay aligned with the integer codes.
cr = classification_report(
    y_test,
    predictions,
    labels=list(range(len(le.classes_))),
    target_names=[str(name) for name in le.classes_],
)
print(cr)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1193
           1       0.99      0.99      0.99      1163

    accuracy                           0.99      2356
   macro avg       0.99      0.99      0.99      2356
weighted avg       0.99      0.99      0.99      2356



In [50]:
# Persist the best pipeline found by the grid search (preprocessing + classifier).
joblib.dump(value=grid_search.best_estimator_, filename='model.pkl')

['model.pkl']