# Get Ready

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

# Input data files are available in the read-only "./input/" directory
# For example, running this cell (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk ./input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the training data, using the CSV's "index" column as the row index.
train = pd.read_csv("./input/train.csv",index_col="index")
# Load the test data; its "index" column is dropped instead of being used as the row index.
test = pd.read_csv("./input/test.csv")
test = test.drop(["index"],axis=1)
# Keep an independent copy of the target column.
y = train["genre"].copy()
# NOTE: the result below is not assigned, so `train` still contains "genre" after this cell.
# As the cell's last expression it only displays the frame without the target column.
train.drop(["genre"],axis=1)

In [None]:
# Display the training frame (when cells run in order it still includes the "genre" column here).
train

In [None]:
#make tempo to num
def tempo_to_int(tempo):
    """Convert "low-high" tempo strings into numeric range and center features.

    Parameters
    ----------
    tempo : iterable of str
        Strings of the form "<low>-<high>" where both parts parse as integers.
        Only the first two "-"-separated parts are read.

    Returns
    -------
    pandas.DataFrame
        One row per input element with two integer columns:
        ``tempo_range`` (high - low) and ``tempo_center`` ((low + high) // 2).
        When ``tempo`` is a pandas Series the result carries the same index,
        so it lines up row-for-row in a later ``pd.concat(..., axis=1)``;
        otherwise a default RangeIndex is used.
    """
    tempos = []
    # Use a distinct loop name so the parameter is not shadowed while iterating.
    for bounds in tempo:
        parts = bounds.split("-")
        low = int(parts[0])
        high = int(parts[1])
        tempos.append([high - low, (low + high) // 2])
    # Preserve the caller's row labels; a fresh 0..n-1 index would misalign on concat
    # whenever the source frame is not indexed 0..n-1.
    index = tempo.index if isinstance(tempo, pd.Series) else None
    tempos_df = pd.DataFrame(tempos, columns=["tempo_range", "tempo_center"], index=index)
    return tempos_df

In [None]:
# tempo -> tempo_range, tempo_center
# Replace the string "tempo" column with the two numeric columns built by tempo_to_int.
train_tempo_features = tempo_to_int(train["tempo"])
train = pd.concat([train.drop(columns=["tempo"]), train_tempo_features], axis=1)

test_tempo_features = tempo_to_int(test["tempo"])
test = pd.concat([test.drop(columns=["tempo"]), test_tempo_features], axis=1)

In [None]:
# Fill missing values with the training-set median of each column.
missing_cols = ["positiveness","danceability","speechiness","liveness","instrumentalness","acousticness","energy"]
for col in missing_cols:
    # Compute the median once, from train only, and reuse it for test.
    col_median = train[col].median()
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place call acts on a temporary under pandas Copy-on-Write
    # and emits a FutureWarning in pandas 2.x.
    train[col] = train[col].fillna(col_median)
    test[col] = test[col].fillna(col_median)

In [None]:
# Remove the target column from the feature frame (the labels are already held in `y`).
# Rebinding with errors="ignore" keeps this cell safe to re-run; the in-place drop
# raised KeyError on a second execution because "genre" was already gone.
train = train.drop(columns=["genre"], errors="ignore")

In [None]:
# Snapshot both frames before one-hot encoding so the un-encoded versions stay available.
un_dummied_train, un_dummied_test = train.copy(), test.copy()

In [None]:
# region to dummy
# It is unclear how "unknown" regions should be imputed, so their indicator column is simply dropped.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
train = train.drop(["region_unknown"],axis=1)
test = test.drop(["region_unknown"],axis=1)
# Align test with train's exact column set and order. Any dummy column that only
# occurs in train (e.g. "region_region_M") is added as all zeros, replacing the
# previous hard-coded `insert(24, ...)`, which depended on a fixed column position
# and failed when the cell was run twice. Columns present only in test are dropped,
# since a model fitted on train cannot use them.
test = test.reindex(columns=train.columns, fill_value=0)

# Modeling

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import KernelPCA, PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from catboost import Pool, CatBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn import metrics

In [None]:
# Two-stage stratified split: hold out 20% as X_val, then 20% of the remainder as X_eval.
X_train, X_val, y_train, y_val = train_test_split(
    train, y, test_size=0.2, random_state=1234, stratify=y
)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train
)

# Oversample only the training portion with SMOTE.
smote = SMOTE(random_state=1234)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data, then apply it to every split.
stdScaler = StandardScaler().fit(X_train_res)
X_train_std, X_val_std, X_eval_std = (
    pd.DataFrame(stdScaler.transform(split)) for split in (X_train_res, X_val, X_eval)
)

In [None]:
# 20-component PCA fitted on the standardized training data, then applied to both hold-out splits.
pca = PCA(n_components=20)
X_train_pca = pca.fit_transform(X_train_std)
X_val_pca, X_eval_pca = (pca.transform(split) for split in (X_val_std, X_eval_std))

In [None]:
# 2-component RBF kernel PCA fitted on the standardized training data, then applied to both hold-out splits.
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=15)
X_train_kpca = kpca.fit_transform(X_train_std)
X_val_kpca, X_eval_kpca = (kpca.transform(split) for split in (X_val_std, X_eval_std))

In [None]:
# Supervised projection with LDA, fitted on the standardized, resampled training data.
lda = LinearDiscriminantAnalysis(n_components=None)
X_train_lda = lda.fit_transform(X_train_std,y_train_res)
X_val_lda = lda.transform(X_val_std)
# Use the LDA transformer here as well; this line previously called pca.transform,
# which put X_eval_lda in a different feature space from the train/val LDA features.
X_eval_lda = lda.transform(X_eval_std)

In [None]:
# Choose which feature representation the models below consume (here: the standardized features).
X_train_final, X_val_final, X_eval_final = X_train_std, X_val_std, X_eval_std

In [None]:
# Logistic regression: sweep the solver's iteration cap and score each fit on the validation split.
iter_range = range(1,100, 3)

lr_accuracy = []
lr_f1 = []
for _iter in iter_range:
    lr = LogisticRegression(max_iter=_iter)
    lr.fit(X_train_final, y_train_res)
    # Predict once and reuse the result for both metrics.
    y_pred_lr = lr.predict(X_val_final)
    lr_accuracy.append(metrics.accuracy_score(y_val, y_pred_lr))
    lr_f1.append(metrics.f1_score(y_val, y_pred_lr, average='macro'))

In [None]:
def plot_scores(_range,accuracy,f1):
    """Draw accuracy and F1 curves side by side for a swept hyper-parameter.

    Parameters
    ----------
    _range : sequence
        Hyper-parameter values used as the x axis of both panels.
    accuracy : sequence
        Accuracy score for each value in ``_range`` (left panel).
    f1 : sequence
        F1 score for each value in ``_range`` (right panel).
    """
    fig, (ax_accuracy, ax_f1) = plt.subplots(1, 2, figsize=(12, 5))

    ax_accuracy.plot(_range, accuracy)
    ax_accuracy.set_title('Testing Accuracy')

    ax_f1.plot(_range, f1)
    ax_f1.set_title('Testing F1 score')

    fig.tight_layout(pad=1)

In [None]:
# Plot logistic-regression accuracy and macro-F1 against the max_iter values swept above.
plot_scores(iter_range,lr_accuracy,lr_f1)

In [None]:
# k-nearest neighbours (not k-means): sweep the neighbour count and score each fit on the validation split.
k_range = range(1, 20)

knn_accuracy = []
knn_f1 = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_final, y_train_res)
    # Predict once and reuse the result for both metrics.
    y_pred_knn = knn.predict(X_val_final)
    knn_accuracy.append(metrics.accuracy_score(y_val, y_pred_knn))
    knn_f1.append(metrics.f1_score(y_val, y_pred_knn, average='macro'))

In [None]:
# Plot KNN accuracy and macro-F1 against the neighbour counts swept above.
plot_scores(k_range,knn_accuracy,knn_f1)

In [None]:
# Kernel SVC: coarse grid over C and gamma, scored on the validation split.
for C in [0.01, 0.1, 1, 10, 100]:
    for gamma in [0.01, 0.1, 1]:
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train_final, y_train_res)
        # Predict once per grid point and reuse the result for both metrics.
        y_pred_svc = svc.predict(X_val_final)
        ac_score = metrics.accuracy_score(y_val, y_pred_svc)
        f1_score = metrics.f1_score(y_val, y_pred_svc, average='macro')
        print(f"accuracy:{ac_score:.4f} f1:{f1_score:.4f} C={C} gamma={gamma}")

In [None]:
# Kernel SVC: second grid with a different set of C and gamma values.
for C in [5,10,20,50,100,200]:
    for gamma in [0.01,0.1]:
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train_final, y_train_res)
        # Predict once per grid point and reuse the result for both metrics.
        y_pred_svc = svc.predict(X_val_final)
        ac_score = metrics.accuracy_score(y_val, y_pred_svc)
        f1_score = metrics.f1_score(y_val, y_pred_svc, average='macro')
        print(f"accuracy:{ac_score:.4f} f1:{f1_score:.4f} C={C} gamma={gamma}")

In [None]:
# XGBoost
# dtrain = xgb.DMatrix(X_train_final, label=y_train_res)
# dvalid = xgb.DMatrix(X_val_final, label=y_val)

# # param = {'max_depth': 6, 'eta': 0.1, 'objective': 'multi:softmax', 'num_class': 11, 'eval_metric': 'mlogloss'}
# params = {"learning_rate":[0.1,0.3,0.5],
#         "max_depth": [2,3,5,8,10],
#          "subsample":[0.5,0.8,0.9,1],
#          "colsample_bytree": [0.5,1.0],
#          'objective': ['multi:softmax'],
#          'num_class': [11],
#          }

# mod = xgb.XGBRegressor()
# cv = GridSearchCV(mod, params, cv = 10, n_jobs =6)

# cv.fit(X_train_final,y_train_res)

# evallist = [(dvalid, 'eval'), (dtrain, 'train')]
# num_round = 100
# bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=5)

# dtest = xgb.DMatrix(X_eval_final)
# pred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

In [None]:
# y_train_pred = cv.predict(X_train_std)
# y_val_pred = cv.predict(X_val_std)

# ac_score = metrics.accuracy_score(y_val, y_val_pred)
# f1_score = metrics.f1_score(y_val,y_val_pred,average='macro')
# print('ac score:{0:.4f}'.format(ac_score))
# print('f1 score:{0:.4f}'.format(f1_score))

In [None]:
# ac_score = metrics.accuracy_score(y_eval, pred)
# f1_score = metrics.f1_score(y_eval,pred,average='macro')
# print('ac score:{0:.4f}'.format(ac_score))
# print('f1 score:{0:.4f}'.format(f1_score))

In [None]:
# xgb.plot_importance(bst)

In [None]:
# Two-stage stratified split of the frame saved before one-hot encoding:
# 20% held out as validation, then 20% of the remainder held out as eval.
un_dummied_X_train, un_dummied_X_val, un_dummied_y_train, un_dummied_y_val = train_test_split(un_dummied_train, y, test_size=0.2, random_state=1234,stratify=y)
un_dummied_X_train, un_dummied_X_eval, un_dummied_y_train, un_dummied_y_eval = train_test_split(un_dummied_X_train, un_dummied_y_train,test_size=0.2,random_state=1,stratify=un_dummied_y_train)

# NOTE: this SMOTE instance is not used in this cell; the resampling call below is commented out.
smote = SMOTE(random_state=1234)
# un_dummied_X_train_res , un_dummied_y_train_res = smote.fit_resample(un_dummied_X_train,un_dummied_y_train)

In [None]:
# List the column names of the frame saved before one-hot encoding.
list(un_dummied_train)

In [None]:
# CatBoost
# Training pool: the selected training features with their resampled labels.
train_pool = Pool(
    X_train_final, 
    y_train_res,
    feature_names=list(X_train_final)
)

# Validation pool (named test_pool), built from the validation split.
# NOTE(review): test_pool is never passed to fit() below, so no eval set is used
# during training. Confirm whether eval_set=test_pool was intended.
test_pool = Pool(
    X_val_final, 
    y_val,
    feature_names=list(X_val_final)
)

# Training settings passed straight to CatBoostClassifier.
catboost_default_params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'eval_metric': 'Accuracy',
    'random_seed': 0, 
    'verbose': 100

}

# Multi-class classification
clf = CatBoostClassifier(**catboost_default_params)
clf.fit(train_pool)

In [None]:
# Score the CatBoost model on the validation split.
# Flatten the predictions so the metrics receive a 1-D label array whatever shape predict() returns.
y_pred = np.ravel(clf.predict(X_val_final))
# Score the CatBoost predictions; this cell previously scored svc.predict(...),
# i.e. the last SVC left over from the grid-search cell, so the printed numbers
# did not belong to CatBoost.
ac_score = metrics.accuracy_score(y_val, y_pred)
f1_score = metrics.f1_score(y_val, y_pred, average='macro')
print(f"accuracy:{ac_score:.4f} f1:{f1_score:.4f}")

# Submission

In [None]:
# Re-fit the preprocessing on the full training set for the final model.
smote = SMOTE(random_state=1234)
train_res, y_res = smote.fit_resample(train, y)

# Fit the scaler on the resampled training data, then apply it to train and test.
submission_stdScaler = StandardScaler().fit(train_res)
train_res_std, test_std = (
    pd.DataFrame(submission_stdScaler.transform(frame)) for frame in (train_res, test)
)

In [None]:
# Final model: RBF-kernel SVC with C=10, gamma=0.1, trained on the full resampled training set.
clf = SVC(C=10, gamma=0.1).fit(train_res_std, y_res)
y_pred = clf.predict(test_std)

In [None]:
# Build the submission frame: a running row id followed by the predicted label.
submission_df = pd.DataFrame(y_pred)
# Row ids start at 4046. Presumably this is the number of training rows, so test ids
# continue where train ends; confirm against the competition files.
# Assign all ids in one vectorized step instead of a per-row .loc loop.
submission_df.insert(0, 'index', np.arange(len(submission_df)) + 4046)
# "index" stays an ordinary column; the discarded set_index("index") call was a no-op
# and has been removed.
submission_df = submission_df.astype("int64")
submission_df.dtypes

In [None]:
# Make sure the target directory exists; to_csv does not create missing directories.
os.makedirs("./output", exist_ok=True)
# Write "<id>,<prediction>" rows with no header and no pandas index column.
submission_df.to_csv("./output/prediction_kernelSVC.csv",index=False,header=False)