# AI Machine Learning Practice 

# Informations

- Dataset: Music dataset

- Objectives: Classification

- Time Limits: 1 min

- Score: Classification Accuracy (Test Data)

- Please read all markdowns carefully 

- About Dataset: Music Style Data
    - 348 float type music features (frequency, tone, tempo, timbre...)
    - Label: Music Style
        - 1: Melancholy
        - 2: Romantic
        - 3: Rhythmical
    

## [Step 0] Importing Packages

You must specify all the packages you use in this practice in the cell below.



In [54]:
from __future__ import print_function
import os
data_path = ['data']

from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score, f1_score
import numpy as np

## [Step 1] Read Data

Train dataset is in the 'data' directory


In [2]:
import pandas as pd

# Import the data using the file path
filepath = os.sep.join(data_path + ['music_train_data.csv'])
data = pd.read_csv(filepath)

In [3]:
data.head(1)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f340,f341,f342,f343,f344,f345,f346,f347,f348,answer
0,-0.166614,0.284691,-0.011022,-1.028812,0.101653,0.498247,-0.314566,1.208697,-1.503008,-1.457764,...,2.136721,-1.193955,0.040614,1.127366,0.741521,-0.70773,0.077748,0.832992,-1.291423,2


In [4]:
print(data.shape)
print(data.dtypes)

(650, 349)
f1        float64
f2        float64
f3        float64
f4        float64
f5        float64
           ...   
f345      float64
f346      float64
f347      float64
f348      float64
answer      int64
Length: 349, dtype: object


In [5]:
features = data.columns[:-1]
X_data = data[features]
y_data = data['answer']

In [44]:
# Label proportion => imbalanced!
print(y_data.value_counts(normalize=True).sort_index())

1    0.516923
2    0.369231
3    0.113846
Name: answer, dtype: float64


## [Step 2] Data Preprocessing

* Preprocessing Code below 
* You must explain your method in this markdown
* (Important) You must define transfrom function for test data

In [6]:
# Sample Code - Min Max Scaling
import warnings
warnings.filterwarnings('ignore', module='sklearn')


msc = MinMaxScaler()
X_data_msc = pd.DataFrame(msc.fit_transform(X_data),
                    columns=X_data.columns)
X_data_msc.head(5)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,0.630211,0.538339,0.46376,0.040328,0.736837,0.780866,0.28018,0.264939,0.210008,0.169543,...,0.487044,0.518064,0.095777,0.52259,0.586644,0.320691,0.475852,0.228383,0.78003,0.362149
1,0.552919,0.606669,0.201281,0.116883,0.52788,0.914367,0.110937,0.053678,0.436478,0.483335,...,0.518388,0.008166,0.231839,0.684377,0.242407,0.143329,0.379794,0.364343,0.34701,0.594168
2,0.621325,0.551232,0.578177,0.174603,0.573474,0.715899,0.222025,0.05408,0.437189,0.647545,...,0.317538,0.221009,0.251789,0.472166,0.226927,0.179582,0.605205,0.027091,0.341074,0.42147
3,0.652887,0.513503,0.618537,0.072859,0.618504,0.617193,0.20625,0.047912,0.642507,0.46235,...,0.548682,0.251897,0.268742,0.298482,0.3063,0.130865,0.0,0.407966,0.829491,0.609766
4,0.805443,0.343634,0.486271,0.549784,0.850448,0.701505,0.384685,0.245385,0.422189,0.215617,...,0.549331,0.030774,0.267452,0.413211,0.655781,0.263345,0.46422,0.14344,0.873379,0.567479


In [7]:
ssc = StandardScaler()
X_data_ssc = pd.DataFrame(ssc.fit_transform(X_data), columns=X_data.columns)
X_data_ssc.head(5)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,-0.153211,0.273649,-0.019149,-1.051726,0.096108,0.521656,-0.344175,1.144901,-1.505849,-1.468402,...,-0.191912,2.067354,-1.185761,0.030032,1.119665,0.735418,-0.705101,0.047618,0.827437,-1.296636
1,-0.564825,0.63752,-1.757884,-0.528857,-1.076342,1.450394,-1.428461,-0.731226,0.087906,0.160739,...,0.017692,-0.802687,-0.324777,0.990219,-1.114428,-0.72669,-1.395914,0.80117,-1.599077,0.36789
2,-0.200534,0.342309,0.738784,-0.134629,-0.820515,0.069688,-0.716755,-0.727654,0.092904,1.013282,...,-1.325449,0.395334,-0.198536,-0.269227,-1.214895,-0.427836,0.225151,-1.06803,-1.632341,-0.87106
3,-0.032453,0.141397,1.006144,-0.829538,-0.567853,-0.61699,-0.817818,-0.782429,1.537804,0.051791,...,0.220275,0.569192,-0.091264,-1.300015,-0.699762,-0.829442,-4.12724,1.042951,1.104601,0.479793
4,0.779968,-0.763179,0.129974,2.427849,0.733573,-0.030443,0.325361,0.971249,-0.012652,-1.229192,...,0.224616,-0.675434,-0.099424,-0.619116,1.568358,0.262675,-0.788757,-0.423169,1.350533,0.176424


In [104]:
# Check skewing only for MinMaxScaled data, since original data has non-positive values
skew_limit = 1.0
skew_vals = X_data_msc[X_data_msc.columns].skew()

skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0:'Skew'})
             .query('abs(Skew) > {0}'.format(skew_limit))
            )

print(skew_cols)

pd.options.mode.chained_assignment = None
X_data_msc_skew = X_data_msc.copy()
for col in skew_cols.index.tolist():
    X_data_msc_skew[col] = np.log1p(X_data_msc[col])
#     test[col]  = test[col].apply(np.log1p)  # same thing
X_data_msc_skew

           Skew
f65   25.353958
f66   25.154355
f13    9.475031
f116   5.060596
f14    5.037542
...         ...
f70   -2.332200
f140  -2.593141
f179  -3.371065
f16   -6.985449
f328 -21.557009

[111 rows x 1 columns]


Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,0.630211,0.538339,0.463760,0.039536,0.552066,0.577100,0.280180,0.235024,0.210008,0.169543,...,0.487044,0.417436,0.095777,0.522590,0.586644,0.278155,0.475852,0.205698,0.780030,0.362149
1,0.552919,0.606669,0.201281,0.110542,0.423881,0.649387,0.110937,0.052287,0.436478,0.483335,...,0.518388,0.008133,0.231839,0.684377,0.242407,0.133945,0.379794,0.310673,0.347010,0.594168
2,0.621325,0.551232,0.578177,0.160930,0.453286,0.539937,0.222025,0.052668,0.437189,0.647545,...,0.317538,0.199678,0.251789,0.472166,0.226927,0.165160,0.605205,0.026731,0.341074,0.421470
3,0.652887,0.513503,0.618537,0.070327,0.481502,0.480692,0.206250,0.046800,0.642507,0.462350,...,0.548682,0.224660,0.268742,0.298482,0.306300,0.122983,0.000000,0.342146,0.829491,0.609766
4,0.805443,0.343634,0.486271,0.438115,0.615428,0.531513,0.384685,0.219445,0.422189,0.215617,...,0.549331,0.030310,0.267452,0.413211,0.655781,0.233763,0.464220,0.134042,0.873379,0.567479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,0.643986,0.530870,0.792612,0.180697,0.560149,0.266530,0.268045,0.189143,0.320156,0.632500,...,0.485984,0.009257,0.296869,0.348801,0.445555,0.226926,0.551983,0.403310,0.745392,0.753700
646,0.830682,0.310314,0.357356,0.281167,0.620441,0.608594,0.516820,0.040042,0.445707,0.522872,...,0.493612,0.017819,0.188927,0.260164,0.282514,0.169770,0.616977,0.040321,0.456574,0.646787
647,0.834328,0.304755,0.406872,0.104853,0.576620,0.570367,0.330605,0.030042,0.396838,0.571843,...,0.497486,0.014047,0.066153,0.540204,0.302202,0.085055,0.748845,0.284448,0.664148,0.601947
648,0.716458,0.450655,0.322461,0.021944,0.536020,0.578582,0.470240,0.111791,0.314869,0.409862,...,0.436012,0.153133,0.480615,0.534249,0.184382,0.093657,0.538613,0.013027,0.522285,0.612838


In [8]:
# transform function
# Do not change the function name
def transform_test(X_test_data):
#     X_test_data = msc.transform(X_test_data)
    X_test_data = ssc.transform(X_test_data)
    return X_test_data

### Skew log must be applied!!!

## [Step 3] Model Training

* Training Code Below
* You must explain your method in this markdown
* (Important) Your model variable should be named 'model' !!! 

## Check Accuracy 

* Check your Train data accuracy

In [9]:
# Function to calculate the % of values that were correctly predicted

def accuracy(real, predict):
    return sum(real == predict) / float(real.shape[0])

In [None]:
# k-NN

In [107]:
# no-scaled data
knn = KNeighborsClassifier()
GS_knn_nsc = GridSearchCV(knn, 
                          param_grid={
                              'n_neighbors': [k for k in range(2, 21)],
                              'p': [1, 2],
                              'weights': ['uniform', 'distance']
                          },
                          scoring='accuracy',
                          refit=True,
                          cv=StratifiedKFold(n_splits=5)
                         )
GS_knn_nsc.fit(X_data, y_data)
print(GS_knn_nsc.best_params_, GS_knn_nsc.best_score_)

# MinMaxScaler
knn = KNeighborsClassifier()
GS_knn_msc = GridSearchCV(knn, 
                          param_grid={
                              'n_neighbors': [k for k in range(2, 21)],
                              'p': [1, 2],
                              'weights': ['uniform', 'distance']
                          },
                          scoring='accuracy',
                          refit=True,
                          cv=StratifiedKFold(n_splits=5)
                         )
GS_knn_msc.fit(X_data_msc, y_data)
print(GS_knn_msc.best_params_, GS_knn_msc.best_score_)

# StandardScaler
knn = KNeighborsClassifier()
GS_knn_ssc = GridSearchCV(knn, 
                          param_grid={
                              'n_neighbors': [k for k in range(2, 21)],
                              'p': [1, 2],
                              'weights': ['uniform', 'distance']
                          },
                          scoring='accuracy',
                          refit=True,
                          cv=StratifiedKFold(n_splits=5)
                         )
GS_knn_ssc.fit(X_data_ssc, y_data)
print(GS_knn_ssc.best_params_, GS_knn_ssc.best_score_)

# MinMaxScaler with log skew
knn = KNeighborsClassifier()
GS_knn_msc_skew = GridSearchCV(knn, 
                              param_grid={
                                  'n_neighbors': [k for k in range(2, 21)],
                                  'p': [1, 2],
                                  'weights': ['uniform', 'distance']
                              },
                              scoring='accuracy',
                              refit=True,
                              cv=StratifiedKFold(n_splits=5)
                             )
GS_knn_msc_skew.fit(X_data_msc_skew, y_data)
print(GS_knn_msc_skew.best_params_, GS_knn_msc_skew.best_score_)

# model = GS_knn.best_estimator_
# model = model.fit(X_data_msc, y_data)
# accuracy(model.predict(X_data_msc), y_data)

{'n_neighbors': 12, 'p': 1, 'weights': 'distance'} 0.8861538461538462
{'n_neighbors': 16, 'p': 1, 'weights': 'distance'} 0.8892307692307693
{'n_neighbors': 9, 'p': 1, 'weights': 'uniform'} 0.8876923076923078
{'n_neighbors': 12, 'p': 1, 'weights': 'distance'} 0.8953846153846154


## Analysis 

* Analyze your model's result
* You may use additional metrics (F1 Score, Confusion matrix) or visualize your results using plots
* Hint : PCA plot will help you understand the dataset (Which class is the most challenging class to classify?)
* Hint : You may also compare different models to choose the best model among classifiers what we learned in this semester

In [11]:
# Your code here

# Test data

* TA will check your model's test data accuracy
* (Important) Do not change the code below

In [12]:
filepath = os.sep.join( ['data', 'music_test_data.csv'])
t_data = pd.read_csv(filepath)
features = t_data.columns
X_t_data = t_data[features]
X_t_data = transform_test(X_t_data)

y_pred = model.predict(X_t_data)
np.savetxt('out.txt', y_pred, fmt='%d', delimiter='\n')