In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# from xgboost import XGBClassifier

import warnings
# Suppress all Python warnings from this point on, so later cells print no warning text.
warnings.filterwarnings('ignore')

In [None]:
# Locations of the competition CSVs on the Kaggle input mount.
train_csv_path = '/kaggle/input/music-genre-classification-2024/train (1).csv'
test_csv_path = '/kaggle/input/music-genre-classification-2024/test (2).csv'

# Read both files into DataFrames.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed seed for a repeatable split).
# NOTE: train_df is both the input and an output here, so re-running this cell without
# reloading the CSV splits the already-reduced training frame again.
train_df , val_df = train_test_split(train_df ,  test_size=0.2, random_state=42)

In [None]:
# First rows of the training split; left as the cell's last expression so the notebook
# renders it as a table instead of wrapped plain text.
train_df.head()

In [None]:
# First rows of the test set; left as the cell's last expression so the notebook
# renders it as a table instead of wrapped plain text.
test_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training split.
train_df.info()

In [None]:
# Summary statistics for the numeric columns of the training split.
train_df.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-dtype columns only.
train_df.describe(include = 'object')

In [None]:
# Distinct values present in the time_signature column.
train_df.time_signature.unique()

In [None]:
# Number of distinct values in each column.
train_df.nunique()

In [None]:
# Missing-value count per column; left as the cell's last expression so the notebook
# displays the Series directly, like the other inspection cells.
train_df.isnull().sum()

In [None]:
# Bar chart of how many training rows carry each Class label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train_df, x='Class', ax=ax)
ax.set_title('Distribution of Music Genres')
plt.show()

In [None]:
# Keep only the float64/int64 columns, then draw their pairwise correlation matrix.
numeric_df = train_df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Per-class row counts, with bars split by time_signature.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train_df, x='Class', hue='time_signature', ax=ax)
ax.set_title('Distribution of time signature within Music Genres')
plt.show()

In [None]:
# Pairwise scatter plots (KDE on the diagonal) for a few audio features, coloured by Class.
selected_features = ['danceability', 'energy', 'loudness', 'valence', 'Class']
pairplot_data = train_df[selected_features]
sns.pairplot(pairplot_data, hue='Class', diag_kind='kde')
plt.show()

In [None]:
# One boxplot per feature, grouped by Class, laid out on a 2x3 grid (five slots used).
features = ['Popularity', 'danceability', 'energy', 'loudness', 'tempo']
fig = plt.figure(figsize=(16, 10))
for slot, column in enumerate(features, start=1):
    ax = fig.add_subplot(2, 3, slot)
    sns.boxplot(data=train_df, x='Class', y=column, ax=ax)
    ax.set_title(f'Boxplot of {column} by Genre')
fig.tight_layout()
plt.show()

In [None]:
# Histograms with a KDE overlay for three features, side by side.
histogram_columns = ['danceability', 'energy', 'tempo']
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
for ax, column in zip(axes, histogram_columns):
    sns.histplot(train_df[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

In [None]:
# Screen each numeric feature for association with Class using a chi-square test of independence.
numerical_features = ['Popularity', 'danceability','energy', 'loudness', 'speechiness', 'acousticness','instrumentalness', 'liveness', 'valence', 'tempo','duration_in min/ms']

from scipy.stats import chi2_contingency

# The chi-square test expects categorical inputs. Cross-tabulating the raw continuous values
# gives a table with close to one row per observation and near-zero expected counts, which
# makes the p-value meaningless. Each feature is therefore cut into quantile buckets first.
QUANTILE_BINS = 10

for feature in numerical_features:
    # duplicates='drop' merges buckets whose quantile edges coincide (heavily tied values).
    binned = pd.qcut(train_df[feature], q=QUANTILE_BINS, duplicates='drop')
    # Rows with a missing feature value fall in no bucket and are left out of the table.
    contingency_table = pd.crosstab(binned, train_df['Class'])

    chi2, p, _dof, _expected = chi2_contingency(contingency_table)

    # Report only the features whose association is significant at the 5% level.
    if p < 0.05:
        print("chi : " , chi2)
        print("P : " , p)
        print(feature)
        print()



In [None]:
# Columns fed to the model, grouped by how they are preprocessed.
numerical_features = [
    'Popularity', 'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'valence', 'tempo', 'duration_in min/ms',
]
categorical_features = ['time_signature', 'mode', 'key']

# Numeric columns: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression())
# ])

In [None]:
# Learn the label -> integer mapping from the training labels, then overwrite the
# training Class column with the integer codes.
le = LabelEncoder()
le.fit(train_df['Class'])
train_df['Class'] = le.transform(train_df['Class'])

In [None]:
# Feature matrices and targets for the training and validation splits.
feature_columns = numerical_features + categorical_features

X_train = train_df[feature_columns]
y_train = train_df['Class']  # already integer-coded by the label encoder

X_val = val_df[feature_columns]
# Only train_df['Class'] was encoded in place; val_df still holds the original labels.
# Map them through the same fitted encoder so validation targets live in the same
# label space as the model's predictions. val_df itself is not modified, so this
# cell can be re-run safely.
y_val = pd.Series(le.transform(val_df['Class']), index=val_df.index, name='Class')

In [None]:
# models = {
#     'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
#     'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
#     'SVC': SVC(kernel='rbf', random_state=42),
#     'XGBoost': XGBClassifier(random_state=42),
#     'KNN': KNeighborsClassifier(n_neighbors=5),
#     'DecisionTree': DecisionTreeClassifier(random_state=42),
#     'LogisticRegression' :  LogisticRegression(multi_class='multinomial', solver='lbfgs')
# }

In [None]:
# for name, model in models.items():
#     full_model = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier', model)])
#     full_model.fit(X_train, y_train)
#     train_y = full_model.predict(X_train)
#     y_pred = full_model.predict(X_val)
#     print(f"Model: {name}")
#     print(f"train f1_score: {f1_score(y_train, train_y, average='micro'):.4f}")
#     print(f"val f1_score: {f1_score(y_val, y_pred, average='micro'):.4f}")
#     print(classification_report(y_val, y_pred))

In [None]:
# Hyperparameter tuning for RandomForest
# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [None, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10]
# }

# full_model = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier', RandomForestClassifier(random_state=42))])

# grid_search = GridSearchCV(full_model, param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# best_rf = grid_search.best_estimator_

In [None]:
# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [3, 5, 10],
#     'classifier__learning_rate': [0.01, 0.1, 0.2],
#     'classifier__subsample': [0.5, 0.75, 1.0] 
# }

# full_model = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier',XGBClassifier(random_state=42))])

# grid_search = GridSearchCV(full_model, param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# best_Xgb = grid_search.best_estimator_

In [None]:
#  best_Xgb.get_params()
# grid_search.best_params_

In [None]:
import keras.backend as K
# Reset the Keras backend session before the network is built in a later cell.
K.clear_session()

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from sklearn.base import BaseEstimator, ClassifierMixin

# class CustomKerasClassifier(BaseEstimator, ClassifierMixin):
#     def __init__(self, epochs=100, batch_size=5):
#         self.epochs = epochs
#         self.batch_size = batch_size
#         self.model = None

#     def build_model(self, input_shape, num_classes):
#         model = Sequential()
#         model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
#         model.add(Dense(32, activation='relu'))
#         model.add(Dense(num_classes, activation='softmax'))
#         model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#         return model

#     def fit(self, X, y):
#         self.model = self.build_model(X.shape[1], y.shape[1])
#         self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)

#     def predict(self, X):
#         return self.model.predict(X)



In [None]:
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import set_random_seed, to_categorical

# Seed Python, NumPy and the backend so weight initialisation and batch shuffling
# are repeatable from one run to the next.
set_random_seed(42)

# Take the class count from the fitted label encoder instead of hard-coding it, so the
# one-hot width and the softmax layer always agree with the labels.
num_classes = len(le.classes_)

# Integer labels are kept for scoring; the one-hot copy is only what the network trains on.
y_train = train_df['Class'].to_numpy()
y_train_onehot = to_categorical(y_train, num_classes=num_classes)

# Fit the preprocessing on the training features only.
X = preprocessor.fit_transform(X_train)

# Fully connected network: 128 -> 64 -> 32 hidden units with a softmax output.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['f1_score'])

model.fit(X, y_train_onehot, epochs=100, batch_size=5, verbose=1)

# Score on the training data: turn predicted probabilities into class indices.
train_y = np.argmax(model.predict(X), axis=1)
print(f"train f1_score: {f1_score(y_train, train_y, average='micro'):.4f}")

In [None]:
# Apply the preprocessing that was fitted on the training features. Calling fit_transform
# here would re-estimate the imputation values, scaling statistics and one-hot categories
# from the validation rows, so the network would see inputs on a different scale (and
# possibly with a different column layout) than it was trained on.
X_val_prepared = preprocessor.transform(X_val)

# Turn predicted class probabilities into class indices.
y_pred = np.argmax(model.predict(X_val_prepared), axis=1)

print(f"val f1_score: {f1_score(y_val, y_pred, average='micro'):.4f}")
print(classification_report(y_val, y_pred))

In [None]:
# best_Xgb.get_params()

In [None]:
# stacking_clf = StackingClassifier(
#     estimators=[
#         ('rf', RandomForestClassifier(random_state=42 ,max_depth = 3,n_estimators = 200,  )),
#         ('gb', models['GradientBoosting']),
#         ('svc', models['SVC']),
#         ('xgb', XGBClassifier(random_state=42 , learning_rate = 0.1, max_depth = 3, n_estimators= 200,subsample = 0.5 ))
#     ],
#     final_estimator=LogisticRegression()
# )

In [None]:
# full_model = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier',stacking_clf)])
# full_model.fit(X_train, y_train)

In [None]:
# Evaluation
# y_pred_stacking = full_model.predict(X_val)
# print("Stacking Classifier")
# print(f"Accuracy: {f1_score(y_val, y_pred_stacking , average='micro'):.4f}")
# print(classification_report(y_val, y_pred_stacking))

In [None]:
# Select the same feature columns from the test set, in the same order as for training.
X_test = test_df[numerical_features + categorical_features]

In [None]:
# The preprocessing must use statistics learned from the training features, never from the
# test rows. It is refitted on X_train here (a deterministic fit) so this cell gives the same
# result whatever data an earlier cell last fitted the preprocessor on.
X_test_prepared = preprocessor.fit(X_train).transform(X_test)

y_test_pred = model.predict(X_test_prepared)
# Turn class probabilities into class indices, then back into the original labels.
y_test_pred_labels = le.inverse_transform(np.argmax(y_test_pred, axis=1))

In [None]:
# Pair each test Id with its predicted label and save the result as the submission CSV.
submission_columns = {
    'Id': test_df['Id'],
    'Class': y_test_pred_labels,
}
submission_df = pd.DataFrame(submission_columns)

submission_df.to_csv('finalsubmission2.csv', index=False)

In [None]:
# First rows of the submission; left as the cell's last expression so the notebook
# renders it as a table instead of plain text.
submission_df.head()

In [None]:
from IPython.display import FileLink

# Path of the submission CSV written by the earlier to_csv call.
file_path = 'finalsubmission2.csv'

# As the cell's last expression, this displays a link to the file in the notebook output.
FileLink(file_path)