# Get Ready

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found anywhere under ./input.
for root, _subdirs, names in os.walk('./input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Read both CSV files. The training frame uses its "index" column as the
# DataFrame index; the test frame discards that column entirely.
train = pd.read_csv("./input/train.csv", index_col="index")
test = pd.read_csv("./input/test.csv").drop(columns=["index"])

test.head()

In [None]:
# Stack the training features (label removed) on top of the test rows;
# the label is kept separately as a one-column DataFrame.
X = pd.concat([train.drop(columns="genre"), test], axis="index")
y = train.loc[:, ["genre"]]

In [None]:
# tempo analysis
# Each tempo value is split on "-" and its first two parts are read as
# integer bounds; the band width and the integer midpoint are derived from them.
X_tempo = X["tempo"]
print(X_tempo)

tempos = [
    [low, high, high - low, (low + high) // 2]
    for low, high in (map(int, tempo.split("-")[:2]) for tempo in X_tempo)
]

tempo_df = pd.DataFrame(tempos, columns=["down", "up", "range", "center"])
tempo_df.head()

In [None]:
# Histogram (no KDE curve) of each derived tempo column on a 2x2 grid.
fig = plt.figure(figsize=(10,10))
for slot, col in enumerate(tempo_df.columns, start=1):
    ax = fig.add_subplot(2, 2, slot)
    sns.distplot(tempo_df[col].dropna(), kde=False, ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Box plot of each derived tempo column, drawn into a 5x2 subplot grid.
fig = plt.figure(figsize=(10,20))
for slot, col in enumerate(tempo_df.columns, start=1):
    ax = fig.add_subplot(5, 2, slot)
    sns.boxplot(y=col, data=tempo_df.dropna(), ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Column dtypes and non-null counts of the combined train + test features.
X.info()

In [None]:
# Feature frame without the tempo and region columns.
X_nums = X.drop(columns=["tempo", "region"])
X_nums.info()

In [None]:
# Independent copy holding only the object-dtype columns.
X_cats = X.select_dtypes(include="object").copy()
X_cats.info()

# Visualization and Processing

In [None]:
# Histogram of every column of X_nums (missing values dropped) on a 3x4 grid.
fig = plt.figure(figsize=(15,10))
for slot, col in enumerate(X_nums.columns, start=1):
    ax = fig.add_subplot(3, 4, slot)
    sns.distplot(X_nums[col].dropna(), kde=False, ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Box plot of every column of X_nums on a 3x4 grid, using only the rows
# that have no missing value at all.
fig = plt.figure(figsize=(10,10))
for slot, col in enumerate(X_nums.columns, start=1):
    ax = fig.add_subplot(3, 4, slot)
    sns.boxplot(y=col, data=X_nums.dropna(), ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Correlation heat map of X_nums; cells with a correlation below 0.4 are masked.
plt.figure(figsize=(10,10))
correlation = X_nums.corr()

sns.heatmap(correlation, mask=correlation.lt(0.4), linewidths=0.5, cmap="magma")

In [None]:
# Histogram of the genre labels.
fig = plt.figure(figsize=(6,5))
sns.distplot(y["genre"].dropna(), kde=False)
fig.tight_layout(pad=1.0)

In [None]:
#percentage of training rows per genre
# value_counts reports every label that actually occurs, instead of assuming
# the labels are exactly 1..10, and yields one plain number per label so each
# printed line is "label:percent".
genre_share = y["genre"].value_counts(normalize=True).sort_index() * 100
for genre, pct in genre_share.items():
    print("{}:{}".format(genre, pct))

In [None]:
#make tempo in train data
train_num = train.select_dtypes(exclude=["object"])
train_tempo = train["tempo"]

# Split each tempo string on "-" and read its first two parts as integers:
# [low, high, width, integer midpoint].
train_tempos = [
    [low, high, high - low, (low + high) // 2]
    for low, high in (map(int, tempo.split("-")[:2]) for tempo in train_tempo)
]

# Build the frame on train's own index. concat(axis=1) aligns rows by index
# label, and train is indexed by its "index" column, so a default 0..n-1
# index would only line up if those labels happened to be 0..n-1.
train_tempo_df = pd.DataFrame(
    train_tempos, columns=["down", "up", "range", "center"], index=train.index
)
train_tempo_df = train_tempo_df.drop(["down", "up"], axis=1)
train_num = pd.concat([train_num, train_tempo_df], axis=1)

train_num.head()

In [None]:
#make tempo in test data
test_num = test.select_dtypes(exclude=["object"])

test_tempo = test["tempo"]
# Per row: [low, high, width, integer midpoint] parsed from the tempo string.
test_tempos = [
    [low, high, high - low, (low + high) // 2]
    for low, high in (map(int, tempo.split("-")[:2]) for tempo in test_tempo)
]

# Only the width ("range") and midpoint ("center") are kept as features.
test_tempo_df = pd.DataFrame(test_tempos, columns=["down", "up", "range", "center"])
test_tempo_df = test_tempo_df.drop(columns=["down", "up"])
test_num = pd.concat([test_num, test_tempo_df], axis="columns")

test_num.head()

In [None]:
# Bar chart of the number of missing values per feature, largest first.
plt.figure(figsize=(25,8))
plt.title("Number of missing rows")
missing_count = (
    X.isnull()
    .sum()
    .to_frame("sum")
    .sort_values(by=["sum"], ascending=False)
    .reset_index()
)
missing_count.columns = ["feature", "sum"]
sns.barplot(x="feature", y="sum", data=missing_count)

In [None]:
# Work on copies so later changes do not touch train_num / test_num.
processed_train, processed_test = train_num.copy(), test_num.copy()
train_num.isnull().sum()

In [None]:
# Distinct region values across train and test.
X_region = X["region"]
X_region.unique()

In [None]:
# Number of rows per region.
fig = plt.figure(figsize=(8,5))
# The Series is passed directly as x. It is not also passed as `data`,
# because `data` is meant for a DataFrame/mapping that named columns are
# looked up in, and a bare Series is not such a source.
sns.countplot(x=X_region)
plt.xticks(rotation=90)
fig.tight_layout(pad=1.0)

In [None]:
# One-hot encode the training regions, leaving out the "unknown" indicator.
train_region = pd.get_dummies(train[["region"]]).drop(columns=["region_unknown"])
train_region.head()

In [None]:
# One-hot encode the test regions, leaving out the "unknown" indicator.
test_region = pd.get_dummies(test[["region"]]).drop(columns=["region_unknown"])
test_region.head()

In [None]:
missing_cols = ["positiveness","danceability","speechiness","liveness","instrumentalness","acousticness","energy"]

# Fill the gaps in both frames with the TRAINING medians, computed once.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection acts on an intermediate object and is not guaranteed to
# update the parent frame on current pandas versions.
train_medians = processed_train[missing_cols].median()
processed_train[missing_cols] = processed_train[missing_cols].fillna(train_medians)
processed_test[missing_cols] = processed_test[missing_cols].fillna(train_medians)

processed_train.isnull().sum()

In [None]:
# Full (unmasked) correlation heat map of the processed training frame.
plt.figure(figsize=(10,10))
pro_correlation = processed_train.corr()
sns.heatmap(pro_correlation, cmap="magma", linewidths=0.5)

In [None]:
# Scatter each of the first 12 columns of processed_train against genre.
fig = plt.figure(figsize=(20,10))
for slot in range(1, 13):
    ax = fig.add_subplot(3, 4, slot)
    sns.scatterplot(x=processed_train.iloc[:, slot - 1], y='genre', data=processed_train, ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Correlation of every column with genre, strongest positive first.
pro_correlation.loc[:, ['genre']].sort_values(by='genre', ascending=False)

In [None]:
# Replace duration_ms by its natural logarithm in both frames.
# One vectorised assignment per frame works for any index labels (a
# per-row .loc[i, ...] loop over range(len(...)) only addresses the right
# rows when the labels are exactly 0..n-1) and converts the whole column
# to float in a single step.
processed_train["duration_ms"] = np.log(processed_train["duration_ms"])
processed_test["duration_ms"] = np.log(processed_test["duration_ms"])

In [None]:
# Histogram of every column of processed_train in a 9x4 subplot grid.
fig = plt.figure(figsize=(15,15))
for slot, col in enumerate(processed_train.columns, start=1):
    ax = fig.add_subplot(9, 4, slot)
    sns.distplot(processed_train[col], kde=False, ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Remove the label column so processed_train holds features only.
processed_train = processed_train.drop(columns=["genre"])
processed_train.isnull().sum()

In [None]:
# Append the one-hot region columns to the training features.
processed_train = pd.concat([processed_train, train_region], axis="columns")
processed_train.head()

In [None]:
# Append the one-hot region columns to the test features.
processed_test = pd.concat([processed_test, test_region], axis="columns")
processed_test.head()

In [None]:
# #drop
# print(processed_train.shape)
# processed_train = processed_train.drop(processed_train[processed_train['1'] < 13.5].index)
# processed_train = processed_train.drop(processed_train[processed_train['loudness'] < -25].index)
# processed_train = processed_train.drop(processed_train[processed_train['speechiness'] > 0.6].index)
# processed_train = processed_train.drop(processed_train[processed_train['instrumentalness'] > 0.8].index)
# processed_train.reset_index(drop=True,inplace=True)
# processed_train.drop(["genre"],axis=1,inplace=True)
# print(processed_train.shape)

In [None]:
# Give the test frame exactly the training feature columns, in the same order.
# Any column that test lacks (such as the region_region_M indicator) is added
# filled with 0, and re-running the cell is harmless.
processed_test = processed_test.reindex(columns=processed_train.columns, fill_value=0)

In [None]:
# Column dtypes of the processed test features.
processed_test.dtypes

In [None]:
# Column dtypes of the processed training features.
processed_train.dtypes

In [None]:
# Histogram of every column of the final training feature frame (9x4 grid).
fig = plt.figure(figsize=(15,15))
for slot, col in enumerate(processed_train.columns, start=1):
    ax = fig.add_subplot(9, 4, slot)
    sns.distplot(processed_train[col], kde=False, ax=ax)
fig.tight_layout(pad=1.0)

# Modeling and Predicting

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import KernelPCA, PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics

In [None]:
smote = SMOTE(random_state=1234)

# Two stratified 80/20 splits: X_val is held out first, then X_eval is
# carved out of what remains.
X_train, X_val, y_train, y_val = train_test_split(
    processed_train, y.iloc[:, 0], test_size=0.2, random_state=1234, stratify=y
)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train
)

# Only the remaining training part is oversampled.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# The scaler is fitted on the resampled training data and reused as-is for
# the two held-out parts.
stdScaler = StandardScaler()
X_train_std = pd.DataFrame(stdScaler.fit_transform(X_train_res))
X_val_std, X_eval_std = (
    pd.DataFrame(stdScaler.transform(part)) for part in (X_val, X_eval)
)

In [None]:
# Number of labels after resampling. count must be called; printing the bare
# attribute only shows the bound method object.
print(y_train_res.count())

In [None]:
# 20-component PCA fitted on the scaled training data, then applied to the
# two held-out parts.
pca = PCA(n_components=20)
X_train_pca = pca.fit_transform(X_train_std)
X_val_pca, X_eval_pca = (pca.transform(part) for part in (X_val_std, X_eval_std))

In [None]:
# Two-component RBF kernel PCA (gamma=15) fitted on the scaled training
# data, then applied to the two held-out parts.
kpca = KernelPCA(kernel="rbf", gamma=15, n_components=2)
X_train_kpca = kpca.fit_transform(X_train_std)
X_val_kpca, X_eval_kpca = (kpca.transform(part) for part in (X_val_std, X_eval_std))

In [None]:
# Shape of the kernel-PCA projection of the validation part.
X_val_kpca.shape

In [None]:
# Supervised projection: LDA is fitted on the scaled, resampled training data
# and its labels, then applied to both held-out parts.
lda = LinearDiscriminantAnalysis(n_components=None)
X_train_lda = lda.fit_transform(X_train_std, y_train_res)
X_val_lda = lda.transform(X_val_std)
# The evaluation part must go through the same LDA projection as the other
# two; projecting it with the PCA model would put it in a different feature space.
X_eval_lda = lda.transform(X_eval_std)

In [None]:
# Choose the LDA projections as the model inputs.
X_train_final, X_val_final, X_eval_final = X_train_lda, X_val_lda, X_eval_lda

In [None]:
# Explained-variance ratio of each fitted LDA component.
lda.explained_variance_ratio_

In [None]:
# Step plot of the cumulative explained-variance ratio per LDA component count.
cum_var_exp = lda.explained_variance_ratio_.cumsum()
plt.step(range(1, len(cum_var_exp) + 1), cum_var_exp)

In [None]:
# logistic regression
# Sweep the iteration cap and record accuracy and macro F1 on the validation part.
iter_range = range(1, 100, 3)

lr_accuracy = []
lr_f1 = []
for _iter in iter_range:
    lr = LogisticRegression(max_iter=_iter).fit(X_train_final, y_train_res)
    y_pred_lr = lr.predict(X_val_final)
    lr_accuracy.append(metrics.accuracy_score(y_val, y_pred_lr))
    lr_f1.append(metrics.f1_score(y_val, y_pred_lr, average='macro'))

In [None]:
# Accuracy as a function of the logistic-regression iteration cap.
ax = plt.gca()
ax.plot(iter_range, lr_accuracy)
ax.set_xlabel('iter for lr')
ax.set_ylabel('Testing Accuracy')

In [None]:
# Macro F1 as a function of the logistic-regression iteration cap.
plt.plot(iter_range, lr_f1)
# The x axis is the iteration cap (iter_range), not a K value.
plt.xlabel('iter for lr')
plt.ylabel('Testing F1 score')

In [None]:
# k-nearest neighbours
# Sweep the neighbour count and record accuracy and macro F1 on the validation part.
k_range = range(1, 20)

knn_accuracy = []
knn_f1 = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_final, y_train_res)
    y_pred_knn = knn.predict(X_val_final)
    knn_accuracy.append(metrics.accuracy_score(y_val, y_pred_knn))
    knn_f1.append(metrics.f1_score(y_val, y_pred_knn, average='macro'))

In [None]:
# Accuracy as a function of the neighbour count.
ax = plt.gca()
ax.plot(k_range, knn_accuracy)
ax.set_xlabel('K for kNN')
ax.set_ylabel('Testing Accuracy')

In [None]:
# Macro F1 as a function of the neighbour count.
ax = plt.gca()
ax.plot(k_range, knn_f1)
ax.set_xlabel('K for kNN')
ax.set_ylabel('Testing F1 score')

In [None]:
#kernel SVC
# for C in [0.01, 0.1, 1, 10, 100]:
#     for gamma in [0.1, 1, 10]:
#         clf = SVC(C=C, gamma=gamma)
#         clf.fit(X_train_final, y_train_res)
#         ac_score = metrics.accuracy_score(y_val, clf.predict(X_val_final))
#         f1_score = metrics.f1_score(y_val,clf.predict(X_val_final),average='macro')
#         # print(clf.score(X_train_final,y_train))
#         print(ac_score, "C = %s, gamma = %s" % (C, gamma))
#         print(f1_score,"f1_score")

In [None]:
from sklearn.datasets import load_iris

# Load the iris toy dataset into a feature frame and a target series.
# NOTE(review): neither object is referenced by the other cells visible here.
iris = load_iris()

iris_data = pd.DataFrame(iris["data"], columns=iris["feature_names"])
iris_target = pd.Series(iris["target"])

In [None]:
# Shape of the training matrix that is fed to the models.
X_train_final.shape

In [None]:
# XGBoost
dtrain = xgb.DMatrix(X_train_final, label=y_train_res)
dvalid = xgb.DMatrix(X_val_final, label=y_val)

param = {'max_depth': 5, 'eta': 0.5, 'objective': 'multi:softmax', 'num_class': 11, 'eval_metric': 'mlogloss'}
# Early stopping monitors the LAST entry of the evaluation list, so the
# validation set has to come last; otherwise training stops on the training
# loss, which almost never stops improving.
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
num_round = 50
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=5)

In [None]:
# Report the early-stopping result. The tree limit is derived from
# best_iteration (0-based, hence +1) because the best_ntree_limit attribute
# no longer exists in XGBoost 2.x.
print('Best Score:{0:.4f}, Iteration:{1:d}, Ntree_Limit:{2:d}'.format(bst.best_score, bst.best_iteration, bst.best_iteration + 1))

In [None]:
# Predict the evaluation part using only the boosting rounds up to and
# including the best one. iteration_range replaces the ntree_limit argument,
# which was deprecated and later removed from Booster.predict.
dtest = xgb.DMatrix(X_eval_final)
pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

In [None]:
# Accuracy and macro F1 of the predictions on the evaluation part.
ac_score = metrics.accuracy_score(y_eval, pred)
f1_score = metrics.f1_score(y_eval, pred, average='macro')
for template, value in (('ac score:{0:.4f}', ac_score), ('f1 score:{0:.4f}', f1_score)):
    print(template.format(value))

In [None]:
# Feature-importance chart of the trained booster.
xgb.plot_importance(bst)

# Make Submission

In [None]:
# Shapes of the full training features and of the label frame.
for frame in (processed_train, y):
    print(frame.shape)

In [None]:
# Oversample the complete training set and standardise it for the final models.
smote = SMOTE(random_state=1234)
processed_train_res, y_res = smote.fit_resample(processed_train, y.iloc[:, 0])
submission_stdScaler = StandardScaler()
processed_train_res_std = pd.DataFrame(submission_stdScaler.fit_transform(processed_train_res))
# The test set is only transformed with the statistics learned from the
# training data. Refitting the scaler on the test set would standardise the
# two sets differently from each other.
processed_test_std = pd.DataFrame(submission_stdScaler.transform(processed_test))

In [None]:
# SVC for the submission: fitted on the resampled, standardised full training
# set. The labels must be the resampled ones (y_res) so they have one entry
# per resampled row.
submission_clf = SVC(C=1, gamma=0.1)
submission_clf.fit(processed_train_res_std, y_res)
y_pred = submission_clf.predict(processed_test_std)

# Same SVC settings on the hold-out split to estimate its quality.
# X_train_std is the scaled, resampled training part, so it is paired with
# y_train_res both for fitting and for the training score.
svc = SVC(C=1, gamma=0.1)
svc.fit(X_train_std, y_train_res)
y_eval_pred = svc.predict(X_eval_std)
ac_score = metrics.accuracy_score(y_eval, y_eval_pred)
f1_score = metrics.f1_score(y_eval, y_eval_pred, average='macro')
print(svc.score(X_train_std, y_train_res))
print(ac_score)
print(f1_score)

In [None]:
# Final XGBoost model trained on the resampled, standardised full training set.
dtrain = xgb.DMatrix(processed_train_res_std, label=y_res)

# The monitored set must live in the same feature space as dtrain, so the raw
# validation rows are standardised with statistics of the resampled training
# frame (not the LDA projection used earlier).
# NOTE: X_val was split from processed_train, so these rows are also part of
# the training data here and the monitored loss is optimistic.
val_scaler = StandardScaler().fit(processed_train_res)
dvalid = xgb.DMatrix(pd.DataFrame(val_scaler.transform(X_val)), label=y_val)

param = {'max_depth': 5, 'eta': 0.5, 'objective': 'multi:softmax', 'num_class': 11, 'eval_metric': 'mlogloss'}
# Early stopping monitors the last entry of the evaluation list.
evallist = [(dtrain, 'train'), (dvalid, 'eval')]
num_round = 50
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=5)

# Predict with the rounds up to and including the best one; iteration_range
# replaces the removed ntree_limit argument.
dtest = xgb.DMatrix(processed_test_std)
pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

In [None]:
# Two-column submission frame: a running row id followed by the prediction.
# The ids start at 4046 -- presumably the first id of the test set; confirm
# against the competition files.
first_test_index = 4046
submission_df = pd.DataFrame(pred)
# Fill the id column in one vectorised step. It stays an ordinary column
# (not the frame index) because the file is written with index=False.
submission_df.insert(0, 'index', np.arange(len(submission_df)) + first_test_index)
submission_df = submission_df.astype("int64")
submission_df.dtypes

In [None]:
# to_csv does not create missing directories, so make sure ./output exists.
os.makedirs("./output", exist_ok=True)
submission_df.to_csv("./output/prediction_xgb.csv", index=False, header=False)