In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/github/SIGNATE/Signate/MusicCmp

# Get Ready

In [None]:
!pip install catboost
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import KernelPCA, PCA

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoost
from catboost import Pool

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score, f1_score
from sklearn import metrics

In [None]:
# All competition files are read from ./Input, relative to the working directory.
INPUT = Path("./Input")
df_train = pd.read_csv(INPUT / "train.csv")
df_test = pd.read_csv(INPUT / "test.csv")
# sample_submit.csv is read without a header row, so its two columns are named here.
df_sample_sub = pd.read_csv(INPUT / "sample_submit.csv", header=None)
df_sample_sub.columns = ["index", "genre"]
df_genre_labels = pd.read_csv(INPUT / "genre_labels.csv")

In [None]:
def merge_train_test(df_train, df_test):
    """Stack the train and test frames into one frame with a fresh 0..n-1 index.

    If ``df_test`` has no ``genre`` column, its rows receive the sentinel value
    ``-100`` in the merged result, which is the marker ``split_train_test``
    uses to separate the two parts again.

    The caller's ``df_test`` is not modified: the sentinel column is added to
    a copy, so calling this function repeatedly on the same inputs is safe.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labelled rows.
    df_test : pandas.DataFrame
        Rows to predict; may or may not already carry a ``genre`` column.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` rows followed by ``df_test`` rows.
    """
    if "genre" not in df_test.columns:
        # assign() returns a new frame, leaving the caller's object untouched.
        df_test = df_test.assign(genre=-100)
    return pd.concat([df_train, df_test], ignore_index=True)

def split_train_test(df):
    """Split a merged frame back into its train and test parts.

    Rows whose ``genre`` equals the sentinel ``-100`` form the test part; all
    other rows form the train part. Both parts are returned with a fresh
    0..n-1 index, as the tuple ``(train, test)``.
    """
    is_test = df["genre"] == -100
    train_part = df[~is_test].reset_index(drop=True)
    test_part = df[is_test].reset_index(drop=True)
    return train_part, test_part

# Work on one merged frame so feature engineering is applied to train and test rows alike.
df = merge_train_test(df_train, df_test)

In [None]:
# Look up a genre name for every label id (test rows carry no real label and map to NaN).
label_to_name = dict(df_genre_labels[["labels", "genre"]].values)
df["genre_name"] = df["genre"].map(label_to_name)

# tempo is a "-"-separated string (presumably "low-high"); replace it with
# half the sum of its integer parts, i.e. the midpoint of a two-part range.
df["tempo"] = df["tempo"].map(lambda bounds: sum(int(part) for part in bounds.split("-")) / 2)

# One-hot encode region; the "unknown" category becomes the column "region_unknown".
region_dummies = pd.get_dummies(df["region"]).rename(columns={"unknown": "region_unknown"})
df = pd.concat([df, region_dummies], axis=1)

# Per-row count of missing values across these audio-feature columns.
nan_counted_cols = [
    "acousticness",
    "positiveness",
    "danceability",
    "energy",
    "liveness",
    "speechiness",
    "instrumentalness",
]
df["num_nans"] = df[nan_counted_cols].isna().sum(axis=1)

In [None]:
# Log-transform tempo, then z-score each listed column into a new
# "standardscaled_<col>" column. Each scaler is fitted on the merged
# train+test frame `df`.
df["log_tempo"] = np.log(df["tempo"])
for col in [
    'popularity', 'duration_ms', 'acousticness',
    'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
    'speechiness', 'instrumentalness', 'log_tempo', 'num_nans',
]:
    # fit_transform returns an (n, 1) array; keep its single column.
    df["standardscaled_" + col] = StandardScaler().fit_transform(df[[col]])[:, 0]

In [None]:
# Undo the merge: rows carrying the -100 genre sentinel become the test frame again.
df_train, df_test = split_train_test(df)
target = df_train["genre"]

In [None]:
# One-hot region columns: region_A ... region_T plus the renamed "unknown" bucket.
region_features = [f"region_{letter}" for letter in "ABCDEFGHIJKLMNOPQRST"]
region_features.append("region_unknown")

# Standard-scaled columns that all share the neutral weight 1.0.
unit_weight_features = [
    "standardscaled_duration_ms",
    "standardscaled_acousticness",
    "standardscaled_positiveness",
    "standardscaled_danceability",
    "standardscaled_loudness",
    "standardscaled_energy",
    "standardscaled_liveness",
    "standardscaled_speechiness",
    "standardscaled_instrumentalness",
]

# Column order of the model input matrix.
features = (
    region_features
    + ["standardscaled_popularity"]
    + unit_weight_features
    + ["standardscaled_log_tempo", "standardscaled_num_nans"]
)

# Multiplicative weight per feature; each column is multiplied by its weight
# before being handed to the models.
dict_feature_weights = {}
dict_feature_weights.update(dict.fromkeys(region_features, 100.0))
dict_feature_weights.update(dict.fromkeys(unit_weight_features, 1.0))
dict_feature_weights["standardscaled_popularity"] = 8.0
dict_feature_weights["standardscaled_log_tempo"] = 0.001
dict_feature_weights["standardscaled_num_nans"] = 100.0

# Weights aligned with `features`, ready to broadcast over the feature matrix.
feature_weights = np.array([dict_feature_weights[name] for name in features])

In [None]:
# parameters
# Number of genre classes — presumably the row count of genre_labels.csv; TODO confirm.
N_CLASSES = 11
# NOTE(review): n_neighbors is not referenced by any other cell visible in this notebook.
n_neighbors = 1

In [None]:
# Build the weighted model inputs: missing values become 0.0, then every column
# is multiplied by its feature weight.
# The explicit float cast matters: the one-hot region columns are bool under
# pandas >= 2.0 (the new default dtype of pd.get_dummies), and taking `.values`
# of a mixed bool/float frame yields an object array instead of a float one.
train = pd.DataFrame(df_train[features].fillna(0.0).astype(float).values * feature_weights)
test = pd.DataFrame(df_test[features].fillna(0.0).astype(float).values * feature_weights)

In [None]:
# Hold out 20% of the training rows for validation, stratified by genre.
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=1234,stratify=target)

# Oversample with SMOTE on the training part only; the validation part stays as split.
smote = SMOTE(random_state=1234)
X_train_res , y_train_res = smote.fit_resample(X_train,y_train)

In [None]:
# Aliases used by the modeling cells: the SMOTE-resampled training features and
# the untouched validation features.
X_train_final = X_train_res
X_val_final = X_val

# Modeling

## Functions

In [None]:
def plot_scores(_range,accuracy,f1):
    """Draw accuracy and F1 curves side by side in one 12x5 figure.

    ``_range`` supplies the x values (the swept hyper-parameter); ``accuracy``
    and ``f1`` are the matching score sequences for the left and right panel.
    """
    plt.figure(figsize=(12,5))
    panels = (
        (accuracy, 'Testing Accuracy'),
        (f1, 'Testing F1 score'),
    )
    for position, (scores, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(_range, scores)
        plt.title(heading)

    plt.tight_layout(pad=1)

In [None]:
def grid_search_optimizer(model, X, y, params):
    """Run a 5-fold, macro-F1 grid search for ``model`` over ``params``.

    Fits the search on ``X``/``y`` using all CPU cores, prints the best
    estimator and its cross-validated score, and returns the fitted
    ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='f1_macro',
        cv=5,
        n_jobs=-1,
        verbose=3,
    )
    search.fit(X, y)
    print(search.best_estimator_)
    print('best F1 score:', search.best_score_)
    return search

In [None]:
def bagging_grid_search_optimizer(model, X, y, model_params, bag_params):
    """Tune ``model`` first, then tune a ``BaggingClassifier`` wrapped around it.

    Step 1 grid-searches ``model`` over ``model_params`` via
    ``grid_search_optimizer``. Step 2 wraps the best estimator found in a
    ``BaggingClassifier`` and grid-searches that over ``bag_params`` with
    macro-F1 scoring. The fitted second-stage ``GridSearchCV`` is returned.
    """
    # Stage 1: best base estimator.
    tuned_base = grid_search_optimizer(model=model, X=X, y=y, params=model_params).best_estimator_

    # Stage 2: search the bagging hyper-parameters around that estimator.
    bag_search = GridSearchCV(
        BaggingClassifier(tuned_base),
        bag_params,
        scoring='f1_macro',
        n_jobs=-1,
        verbose=2,
    )
    bag_search.fit(X, y)
    print(bag_search.best_estimator_)
    print('best F1 score:', bag_search.best_score_)
    return bag_search

In [None]:
def model_predicts_to_feature(models, X, y, n_classes=11):
    """Turn the predictions of several fitted models into weighted class votes.

    Every model predicts a class for each row of ``X``. That prediction adds
    the model's macro-F1 score to the matching class slot of the row. The
    macro-F1 is measured on this same ``X``/``y``.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict``; predictions must be integer
        class ids in ``range(n_classes)``.
    X : array-like
        Samples to predict.
    y : array-like
        True labels for ``X``, used only to compute each model's weight.
    n_classes : int, default 11
        Length of every per-sample vote vector.

    Returns
    -------
    list[list[float]]
        ``len(y)`` rows of ``n_classes`` accumulated vote weights.
    """
    feature_preds = [[0] * n_classes for _ in range(len(y))]
    for model in models:
        y_pred = model.predict(X)
        # scikit-learn's documented argument order is (y_true, y_pred).
        weight = f1_score(y, y_pred, average="macro")
        for row, pred in zip(feature_preds, y_pred):
            row[pred] += weight
    return feature_preds

In [None]:
# Example: grid-search a single SVC and score the winner on the hold-out split.
# `degree` is left out of the grid: SVC() uses its default kernel here, and
# degree only affects the polynomial kernel, so searching it just multiplies
# the number of fits without changing any result.
params = {
    'C':[10,30],
    'gamma':[0.01],
    "decision_function_shape": ["ovo"]
}
example_svc = grid_search_optimizer(SVC(), X_train, y_train, params)
# f1_score expects (y_true, y_pred).
example_svc_f1 = f1_score(y_val, example_svc.predict(X_val), average="macro")
print(example_svc_f1)

In [None]:
# Display the estimator refit with the best parameters found by the search.
example_svc.best_estimator_

In [None]:
# Example: tune an SVC, wrap the best one in a BaggingClassifier, tune the bag
# size, and score the bagged model on the hold-out split.
params = {
    'C':[1,5,10],
    'gamma':[0.01],
    'degree':[1],
    "decision_function_shape": ["ovo"]
}
bag_params = {
    'n_estimators':[30,40]
}
example_bagging_svc = bagging_grid_search_optimizer(model=SVC(), X=X_train, y=y_train, model_params=params, bag_params=bag_params)
# Score the bagged search result itself (not the plain example_svc), with
# f1_score's arguments in (y_true, y_pred) order.
example_bagging_svc_f1 = f1_score(y_val, example_bagging_svc.predict(X_val), average="macro")
print(example_bagging_svc_f1)

In [None]:
# predict() needs the samples to classify; show the first few hold-out predictions.
example_svc.predict(X_val)[:5]

In [None]:
# Example: build vote features for the hold-out split and compare the first
# row with the model's prediction for that same sample (both use X_val).
new_feature = model_predicts_to_feature([example_svc],X_val,y_val)
y_pred = example_svc.predict(X_val)
print(y_pred[0])
new_feature[0]

## Kernel SVC

In [None]:
# Sweep C and gamma for an SVC trained on the SMOTE-resampled data and report
# hold-out accuracy / macro-F1 for each combination.
for C in [10,50,100]:
    for gamma in [0.01,0.015]:
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train_final, y_train_res)
        # Predict once and reuse the result for both metrics.
        y_pred_svc = svc.predict(X_val_final)
        svc_ac = accuracy_score(y_val, y_pred_svc)
        svc_f1 = f1_score(y_val, y_pred_svc, average='macro')
        print(f"accuracy:{svc_ac:.4f} f1:{svc_f1:.4f} C={C} gamma={gamma}")

## Logistic Regression

In [None]:
# logistic regression
# Sweep the solver iteration cap and record hold-out accuracy / macro-F1.
iter_range = range(1,100, 3)

lr_accuracy = []
lr_f1 = []
for _iter in iter_range:
    lr = LogisticRegression(max_iter=_iter)
    lr.fit(X_train_final, y_train_res)
    # Predict once and reuse the result for both metrics.
    y_pred_lr = lr.predict(X_val_final)
    lr_accuracy.append(metrics.accuracy_score(y_val, y_pred_lr))
    lr_f1.append(metrics.f1_score(y_val, y_pred_lr, average='macro'))

In [None]:
# Hold-out accuracy and macro-F1 as a function of max_iter.
plot_scores(iter_range,lr_accuracy,lr_f1)

## K-Nearest Neighbors

In [None]:
# k-nearest neighbours (KNeighborsClassifier — a supervised classifier, not k-means)
# Sweep the neighbour count and record hold-out accuracy / macro-F1.
k_range = range(1, 20)

knn_accuracy = []
knn_f1 = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_final, y_train_res)
    # Predict once and reuse the result for both metrics.
    y_pred_knn = knn.predict(X_val_final)
    knn_accuracy.append(metrics.accuracy_score(y_val, y_pred_knn))
    knn_f1.append(metrics.f1_score(y_val, y_pred_knn, average='macro'))

In [None]:
# Hold-out accuracy and macro-F1 as a function of the neighbour count k.
plot_scores(k_range,knn_accuracy,knn_f1)

## Bagging

### Decision Tree

In [None]:
# Decision Tree
# Bag of 500 decision trees fitted on the SMOTE-resampled training data;
# the cell displays the out-of-bag score.
bag_dt = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, bootstrap=True, n_jobs=6, oob_score=True)
bag_dt.fit(X_train_final, y_train_res)
bag_dt.oob_score_

In [None]:
# Hold-out accuracy and macro-F1 of the bagged decision trees.
y_pred_bag_dt = bag_dt.predict(X_val_final)
bag_dt_ac = accuracy_score(y_val, y_pred_bag_dt)
bag_dt_f1 = f1_score(y_val, y_pred_bag_dt,average="macro")
print(f"Ac:{bag_dt_ac} F1:{bag_dt_f1}")

### Random Forest

In [None]:
# Random Forest
# Bag of 10 random forests with 500 trees each.
# NOTE(review): fitted on X_train / y_train, not on the SMOTE-resampled data
# used for the decision-tree bag — confirm this is intended.
bag_rf = BaggingClassifier(RandomForestClassifier(n_estimators=500), n_estimators=10, bootstrap=True, n_jobs=6)
bag_rf.fit(X_train, y_train)

In [None]:
# Hold-out accuracy and macro-F1 of the bagged random forests.
y_pred_bag_rf = bag_rf.predict(X_val_final)
bag_rf_ac = accuracy_score(y_val, y_pred_bag_rf)
bag_rf_f1 = f1_score(y_val, y_pred_bag_rf,average="macro")
print(f"AC:{bag_rf_ac} F1:{bag_rf_f1}")

### K-Nearest Neighbors

In [None]:
# Bagged k-nearest-neighbours classifiers (KNeighborsClassifier, not k-means).
# For each bag size, sweep k, print hold-out accuracy / macro-F1 per setting,
# and draw one pair of score curves per bag size.
k_range = range(1, 15)
for n_estimators in [5,10,15]:
  bag_knn_acs = []
  bag_knn_f1s = []
  for k in k_range:
      bag_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=k),n_estimators=n_estimators,random_state=0, n_jobs=-1)
      bag_knn.fit(X_train, y_train)
      y_pred_bag_knn = bag_knn.predict(X_val)
      bag_knn_ac = accuracy_score(y_val, y_pred_bag_knn)
      bag_knn_f1 = f1_score(y_val, y_pred_bag_knn, average="macro")
      bag_knn_acs.append(bag_knn_ac)
      bag_knn_f1s.append(bag_knn_f1)
      print(f"AC:{bag_knn_ac} F1:{bag_knn_f1} K={k} n_estimators={n_estimators}")
  plot_scores(k_range,bag_knn_acs,bag_knn_f1s)

### SVC

In [None]:
# SVC
# Bagged SVC (default kernel): sweep C, gamma and the bag size, printing
# hold-out accuracy / macro-F1 for every combination.
for C in [30,40,50]:
    for gamma in [0.01]:
      for n_estimators in [15,20]:
          bag_svc = BaggingClassifier(SVC(C=C, gamma=gamma), n_estimators=n_estimators, random_state=0, n_jobs=6)
          bag_svc.fit(X_train, y_train)
          y_pred_bag_svc = bag_svc.predict(X_val_final)
          bag_svc_ac = accuracy_score(y_val, y_pred_bag_svc)
          bag_svc_f1 = f1_score(y_val, y_pred_bag_svc ,average="macro")
          print(f"AC:{bag_svc_ac:.4f} F1:{bag_svc_f1:.4f} C={C} gamma={gamma} n_estimators:{n_estimators}")

In [None]:
# Polynomial Kernel SVC
# Bagged degree-2 polynomial-kernel SVC (coef0=1): sweep C, gamma and the bag
# size, printing hold-out accuracy / macro-F1 for every combination.
for C in [0.1,1]:
    for gamma in [0.001,0.01]:
      for n_estimators in [1,5,10]:
          bag_poly_kernel_svc = BaggingClassifier(SVC(kernel="poly",degree=2, C=C, gamma=gamma, coef0=1), n_estimators=n_estimators, random_state=0, n_jobs=-1)
          bag_poly_kernel_svc.fit(X_train, y_train)
          y_pred_bag_poly_kernel_svc = bag_poly_kernel_svc.predict(X_val_final)
          bag_poly_kernel_svc_ac = accuracy_score(y_val, y_pred_bag_poly_kernel_svc)
          bag_poly_kernel_svc_f1 = f1_score(y_val, y_pred_bag_poly_kernel_svc ,average="macro")
          print(f"AC:{bag_poly_kernel_svc_ac:.4f} F1:{bag_poly_kernel_svc_f1:.4f} C={C} gamma={gamma} n_estimators:{n_estimators}")

### Gaussian (RBF) Kernel SVC

In [None]:
# Gaussian (RBF) Kernel SVC
# Bagged RBF-kernel SVC: sweep C, gamma and the bag size, printing hold-out
# accuracy / macro-F1 for every combination.
for C in [30,40,50,60]:
    for gamma in [0.01]:
      for n_estimators in [15,20,25]:
          bag_rbf_kernel_svc = BaggingClassifier(SVC(kernel="rbf",C=C, gamma=gamma), n_estimators=n_estimators, bootstrap=True, random_state=0, n_jobs=-1)
          bag_rbf_kernel_svc.fit(X_train, y_train)
          y_pred_bag_rbf_kernel_svc = bag_rbf_kernel_svc.predict(X_val_final)
          bag_rbf_kernel_svc_ac = accuracy_score(y_val, y_pred_bag_rbf_kernel_svc)
          bag_rbf_kernel_svc_f1 = f1_score(y_val, y_pred_bag_rbf_kernel_svc ,average="macro")
          print(f"AC:{bag_rbf_kernel_svc_ac:.4f} F1:{bag_rbf_kernel_svc_f1:.4f} C={C} gamma={gamma} n_estimators:{n_estimators}")

In [None]:
# Collects the best hyper-parameters per model name as the grid searches below finish.
best_params = {}

## Grid Search

### Logistic Regression

In [None]:
# Two sub-grids: liblinear and saga are searched with both l1 and l2 penalties,
# the remaining solvers with l2 only.
lr_params = [
    {'solver': ['liblinear', 'saga'], 'penalty':['l1', 'l2'], 'C': [0.1, 1, 10, 100]},
    {'solver': ['newton-cg', 'sag', 'lbfgs' ], 'penalty':['l2'], 'C': [0.1, 1, 10, 100]},
]
gs_lr = grid_search_optimizer(LogisticRegression(), X_train, y_train, lr_params)

### K-Nearest Neighbors

In [None]:
# Candidate neighbour counts 1..20 for a k-nearest-neighbours grid search.
knn_params = {
    "n_neighbors": list(range(1, 21))
}

### SVC

In [None]:
# rbf
# Grid over C, gamma and decision_function_shape for SVC() with its default kernel.
# NOTE(review): decision_function_shape presumably only changes the shape of
# decision_function, not the predicted labels — if so, searching both values
# doubles the number of fits for no gain; confirm against the SVC docs.
svc_params = {
    'C':[0.1,10,100,1000],
    "decision_function_shape": ["ovo","ovr"],
    'gamma':[0.01,1,5,10]
}
gs_svc = grid_search_optimizer(SVC(),X_train, y_train, svc_params)

In [None]:
# Record the winning SVC parameters and display everything collected so far.
best_params['svc'] = gs_svc.best_params_
best_params

# Submission

In [None]:
# Final model: a bag of 20 RBF-kernel SVCs fitted on all weighted training rows
# (no hold-out split, no SMOTE), then used to predict the test rows.
clf = BaggingClassifier(SVC(kernel="rbf",C=30, gamma=0.01), n_estimators=20, random_state=0, n_jobs=-1)
clf.fit(train, target)
y_pred = clf.predict(test)

In [None]:
# Build the two-column submission frame: row id, then predicted genre.
# Ids are consecutive integers starting at FIRST_TEST_INDEX (presumably the id
# of the first test row — verify against sample_submit.csv).
# The id column is created in one vectorised step instead of a per-row .loc
# loop, and the former `set_index` call is dropped because its result was
# discarded.
FIRST_TEST_INDEX = 4046
submission_df = pd.DataFrame({
    "index": np.arange(len(y_pred)) + FIRST_TEST_INDEX,
    0: y_pred,
}).astype("int64")
submission_df.dtypes

In [None]:
# Write the submission without header or index column. to_csv does not create
# missing directories, so make sure ./output exists first.
output_path = Path("./output/prediction_baggingKernelSVC.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(output_path,index=False,header=False)