# AI Machine Learning Practice 

# Information

- Dataset: Music dataset

- Objectives: Classification

- Time Limits: 1 min

- Score: Classification Accuracy (Test Data)

- Please read all markdowns carefully 

- About Dataset: Music Style Data
    - 348 float type music features (frequency, tone, tempo, timbre...)
    - Label: Music Style
        - 1: Melancholy
        - 2: Romantic
        - 3: Rhythmical
    

## [Step 0] Importing Packages

You must specify all the packages you use in this practice in the cell below.



In [1]:
from __future__ import print_function
import os
data_path = ['data']

from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score, f1_score, precision_score, recall_score
import numpy as np

## [Step 1] Read Data

Train dataset is in the 'data' directory


In [2]:
import pandas as pd

# Build the CSV path from the configured data directory and load the training set
filepath = os.path.join(*data_path, 'music_train_data.csv')
data = pd.read_csv(filepath)

In [3]:
# Peek at the first row to check the column layout (feature columns followed by 'answer')
data.head(1)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f340,f341,f342,f343,f344,f345,f346,f347,f348,answer
0,-0.166614,0.284691,-0.011022,-1.028812,0.101653,0.498247,-0.314566,1.208697,-1.503008,-1.457764,...,2.136721,-1.193955,0.040614,1.127366,0.741521,-0.70773,0.077748,0.832992,-1.291423,2


In [4]:
# Report the frame's dimensions and the dtype of every column
print(data.shape, data.dtypes, sep='\n')

(650, 349)
f1        float64
f2        float64
f3        float64
f4        float64
f5        float64
           ...   
f345      float64
f346      float64
f347      float64
f348      float64
answer      int64
Length: 349, dtype: object


In [5]:
# Every column except the last one is a feature; the 'answer' column is the label
features = data.columns[:-1]
X_data, y_data = data[features], data['answer']

In [6]:
# Share of each label, ordered by label value => the classes are imbalanced!
label_share = y_data.value_counts(normalize=True)
print(label_share.sort_index())

1    0.516923
2    0.369231
3    0.113846
Name: answer, dtype: float64


## [Step 2] Data Preprocessing

* Preprocessing Code below 
* You must explain your method in this markdown
* (Important) You must define the transform function for the test data

In [7]:
# Registry of candidate training sets, keyed by name, starting with the raw features
X_data_dict = dict(X_data=X_data)

In [8]:
# Absolute pairwise Pearson correlation between all feature columns.
# A single DataFrame.corr() call replaces one corrwith() pass per column.
X_data_corr = X_data.corr().abs()

# Keep only the strictly-lower triangle: the diagonal (self-correlation) and the
# mirrored upper half are zeroed so that each feature pair is represented once.
lower_triangle = np.tril(np.ones(X_data_corr.shape, dtype=bool), k=-1)
X_data_corr = X_data_corr.where(lower_triangle, 0.)
X_data_corr

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
f1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f2,0.986154,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f3,0.141059,0.137554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f4,0.338368,0.352530,0.067998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f5,0.675279,0.678184,0.110398,0.435819,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
f344,0.401183,0.392504,0.294139,0.048916,0.070123,0.172394,0.196303,0.687886,0.006476,0.361006,...,0.005967,0.017944,0.041522,0.080583,0.693674,0.000000,0.000000,0.000000,0.000000,0.0
f345,0.203257,0.198477,0.352966,0.029907,0.032824,0.302407,0.054753,0.263086,0.037035,0.213334,...,0.132826,0.020030,0.077922,0.010812,0.333309,0.364506,0.000000,0.000000,0.000000,0.0
f346,0.028590,0.024365,0.026298,0.019282,0.209997,0.014193,0.014462,0.214913,0.004694,0.174357,...,0.078074,0.143545,0.069250,0.014253,0.065581,0.139374,0.051037,0.000000,0.000000,0.0
f347,0.282348,0.297530,0.200277,0.197937,0.574398,0.089421,0.377089,0.454674,0.129858,0.467068,...,0.026722,0.078784,0.035134,0.036331,0.769421,0.261522,0.113366,0.031478,0.000000,0.0


In [9]:
# Generate a new dataset by dropping every column whose absolute correlation with
# another column is equal to or larger than the threshold (0.8).
# X_data_corr holds each feature pair once (strictly-lower triangle), so for a
# highly correlated pair only the column-side feature of that entry is dropped.
CORR_THRESHOLD = 0.8

# Column-wise mask instead of a nested Python loop with chained scalar indexing;
# column order is preserved, so drop_col matches the order of X_data's columns.
is_redundant = (X_data_corr >= CORR_THRESHOLD).any(axis=0)
drop_col = X_data_corr.columns[is_redundant].tolist()

# Non-mutating drop: X_data itself is left untouched.
X_data_drop = X_data.drop(columns=drop_col)
X_data_dict['X_data_drop'] = X_data_drop
print(drop_col)
X_data_drop

['f1', 'f14', 'f17', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f39', 'f41', 'f45', 'f47', 'f48', 'f49', 'f50', 'f51', 'f54', 'f55', 'f59', 'f65', 'f66', 'f77', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f316', 'f322', 'f323', 'f324', 'f325']


Unnamed: 0,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,0.284691,-0.011022,-1.028812,0.101653,0.498247,-0.314566,1.208697,-1.503008,-1.457764,0.330351,...,-0.196257,2.136721,-1.193955,0.040614,1.127366,0.741521,-0.707730,0.077748,0.832992,-1.291423
1,0.651829,-1.754836,-0.512545,-1.063596,1.434039,-1.404162,-0.745222,0.054440,0.153028,-0.404961,...,0.012412,-0.803444,-0.327357,1.022505,-1.083422,-0.714399,-1.407135,0.846917,-1.562645,0.365577
2,0.353967,0.749125,-0.123296,-0.809340,0.042847,-0.688971,-0.741502,0.059323,0.995969,0.382129,...,-1.324741,0.423848,-0.200293,-0.265410,-1.182842,-0.416810,0.234092,-1.061020,-1.595486,-0.867771
3,0.151251,1.017266,-0.809429,-0.558230,-0.649047,-0.790528,-0.798548,1.471307,0.045307,1.159451,...,0.214091,0.601954,-0.092321,-1.319499,-0.673082,-0.816716,-4.172430,1.093709,1.106629,0.476974
4,-0.761447,0.138536,2.406821,0.735203,-0.058044,0.358249,1.027844,-0.043828,-1.221248,0.532904,...,0.218412,-0.673082,-0.100535,-0.623208,1.571379,0.270780,-0.792427,-0.402796,1.349432,0.174976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,0.244564,2.173754,0.034836,0.180263,-2.834412,-0.392688,0.684052,-0.745517,0.918739,-1.070555,...,-0.203314,-0.796906,0.086825,-1.014114,0.221252,0.200125,-0.153418,1.596107,0.641362,1.504907
646,-0.940471,-0.717929,0.888751,0.787062,0.897648,1.208942,-0.863827,0.117906,0.355986,1.415710,...,-0.152532,-0.746864,-0.600670,-1.552048,-0.825841,-0.372074,0.319805,-0.981512,-0.956493,0.741365
647,-0.970338,-0.388963,-0.555268,0.342415,0.414480,0.010077,-0.959610,-0.218170,0.607368,0.265552,...,-0.126745,-0.768960,-1.382635,0.147514,-0.699404,-1.162207,1.279939,0.647133,0.191886,0.421133
648,-0.186426,-0.949760,-1.151156,-0.052513,0.516760,0.909057,-0.147741,-0.781873,-0.224130,-0.345255,...,-0.536003,0.103654,1.257128,0.111375,-1.456077,-1.084994,-0.250769,-1.140101,-0.592954,0.498912


In [10]:
# Sample Code - Min Max Scaling
# Warnings raised from sklearn modules are silenced for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore', module='sklearn')


# Fit the scaler on the training features, then wrap the scaled array back into a
# DataFrame so the original column names are kept.
msc = MinMaxScaler()
msc.fit(X_data)
X_data_msc = pd.DataFrame(msc.transform(X_data), columns=X_data.columns)
X_data_dict['X_data_msc'] = X_data_msc
X_data_msc.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,0.630211,0.538339,0.46376,0.040328,0.736837,0.780866,0.28018,0.264939,0.210008,0.169543,...,0.487044,0.518064,0.095777,0.52259,0.586644,0.320691,0.475852,0.228383,0.78003,0.362149
1,0.552919,0.606669,0.201281,0.116883,0.52788,0.914367,0.110937,0.053678,0.436478,0.483335,...,0.518388,0.008166,0.231839,0.684377,0.242407,0.143329,0.379794,0.364343,0.34701,0.594168
2,0.621325,0.551232,0.578177,0.174603,0.573474,0.715899,0.222025,0.05408,0.437189,0.647545,...,0.317538,0.221009,0.251789,0.472166,0.226927,0.179582,0.605205,0.027091,0.341074,0.42147
3,0.652887,0.513503,0.618537,0.072859,0.618504,0.617193,0.20625,0.047912,0.642507,0.46235,...,0.548682,0.251897,0.268742,0.298482,0.3063,0.130865,0.0,0.407966,0.829491,0.609766
4,0.805443,0.343634,0.486271,0.549784,0.850448,0.701505,0.384685,0.245385,0.422189,0.215617,...,0.549331,0.030774,0.267452,0.413211,0.655781,0.263345,0.46422,0.14344,0.873379,0.567479


In [11]:
# Standard scaling: fit on the training features and keep the fitted scaler in
# `ssc`, which transform_test() reuses for the test data.
ssc = StandardScaler()
ssc.fit(X_data)
X_data_ssc = pd.DataFrame(ssc.transform(X_data), columns=X_data.columns)
X_data_dict['X_data_ssc'] = X_data_ssc
X_data_ssc.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,-0.153211,0.273649,-0.019149,-1.051726,0.096108,0.521656,-0.344175,1.144901,-1.505849,-1.468402,...,-0.191912,2.067354,-1.185761,0.030032,1.119665,0.735418,-0.705101,0.047618,0.827437,-1.296636
1,-0.564825,0.63752,-1.757884,-0.528857,-1.076342,1.450394,-1.428461,-0.731226,0.087906,0.160739,...,0.017692,-0.802687,-0.324777,0.990219,-1.114428,-0.72669,-1.395914,0.80117,-1.599077,0.36789
2,-0.200534,0.342309,0.738784,-0.134629,-0.820515,0.069688,-0.716755,-0.727654,0.092904,1.013282,...,-1.325449,0.395334,-0.198536,-0.269227,-1.214895,-0.427836,0.225151,-1.06803,-1.632341,-0.87106
3,-0.032453,0.141397,1.006144,-0.829538,-0.567853,-0.61699,-0.817818,-0.782429,1.537804,0.051791,...,0.220275,0.569192,-0.091264,-1.300015,-0.699762,-0.829442,-4.12724,1.042951,1.104601,0.479793
4,0.779968,-0.763179,0.129974,2.427849,0.733573,-0.030443,0.325361,0.971249,-0.012652,-1.229192,...,0.224616,-0.675434,-0.099424,-0.619116,1.568358,0.262675,-0.788757,-0.423169,1.350533,0.176424


In [12]:
# Skewness is checked on the MinMax-scaled data only, because the original data
# contains non-positive values.
skew_limit = 1.0
skew_vals = X_data_msc.skew()

# One-column frame of skew values, largest first, restricted to |skew| > skew_limit
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame(name='Skew')
             .loc[lambda frame: frame['Skew'].abs() > skew_limit]
            )

print(skew_cols)

# Apply log1p to the skewed columns; all other columns are carried over unchanged.
# NOTE(review): the filter uses abs(Skew), so negatively skewed columns get log1p
# as well - confirm that this is intended.
pd.options.mode.chained_assignment = None
X_data_msc_skew = X_data_msc.assign(
    **{col: np.log1p(X_data_msc[col]) for col in skew_cols.index}
)
X_data_dict['X_data_msc_skew'] = X_data_msc_skew
X_data_msc_skew

           Skew
f65   25.353958
f66   25.154355
f13    9.475031
f116   5.060596
f14    5.037542
...         ...
f70   -2.332200
f140  -2.593141
f179  -3.371065
f16   -6.985449
f328 -21.557009

[111 rows x 1 columns]


Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,0.630211,0.538339,0.463760,0.039536,0.552066,0.577100,0.280180,0.235024,0.210008,0.169543,...,0.487044,0.417436,0.095777,0.522590,0.586644,0.278155,0.475852,0.205698,0.780030,0.362149
1,0.552919,0.606669,0.201281,0.110542,0.423881,0.649387,0.110937,0.052287,0.436478,0.483335,...,0.518388,0.008133,0.231839,0.684377,0.242407,0.133945,0.379794,0.310673,0.347010,0.594168
2,0.621325,0.551232,0.578177,0.160930,0.453286,0.539937,0.222025,0.052668,0.437189,0.647545,...,0.317538,0.199678,0.251789,0.472166,0.226927,0.165160,0.605205,0.026731,0.341074,0.421470
3,0.652887,0.513503,0.618537,0.070327,0.481502,0.480692,0.206250,0.046800,0.642507,0.462350,...,0.548682,0.224660,0.268742,0.298482,0.306300,0.122983,0.000000,0.342146,0.829491,0.609766
4,0.805443,0.343634,0.486271,0.438115,0.615428,0.531513,0.384685,0.219445,0.422189,0.215617,...,0.549331,0.030310,0.267452,0.413211,0.655781,0.233763,0.464220,0.134042,0.873379,0.567479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,0.643986,0.530870,0.792612,0.180697,0.560149,0.266530,0.268045,0.189143,0.320156,0.632500,...,0.485984,0.009257,0.296869,0.348801,0.445555,0.226926,0.551983,0.403310,0.745392,0.753700
646,0.830682,0.310314,0.357356,0.281167,0.620441,0.608594,0.516820,0.040042,0.445707,0.522872,...,0.493612,0.017819,0.188927,0.260164,0.282514,0.169770,0.616977,0.040321,0.456574,0.646787
647,0.834328,0.304755,0.406872,0.104853,0.576620,0.570367,0.330605,0.030042,0.396838,0.571843,...,0.497486,0.014047,0.066153,0.540204,0.302202,0.085055,0.748845,0.284448,0.664148,0.601947
648,0.716458,0.450655,0.322461,0.021944,0.536020,0.578582,0.470240,0.111791,0.314869,0.409862,...,0.436012,0.153133,0.480615,0.534249,0.184382,0.093657,0.538613,0.013027,0.522285,0.612838


In [13]:
# Remove the highly correlated columns (drop_col) from each scaled variant
X_data_msc_drop, X_data_ssc_drop, X_data_msc_skew_drop = (
    scaled.drop(columns=drop_col)
    for scaled in (X_data_msc, X_data_ssc, X_data_msc_skew)
)

# Register the reduced variants alongside the other candidate datasets
X_data_dict.update(
    X_data_msc_drop=X_data_msc_drop,
    X_data_ssc_drop=X_data_ssc_drop,
    X_data_msc_skew_drop=X_data_msc_skew_drop,
)
X_data_msc_drop.shape, X_data_ssc_drop.shape, X_data_msc_skew_drop.shape

((650, 248), (650, 248), (650, 248))

In [14]:
# transform function
# Do not change the function name
def transform_test(X_test_data):
    """Scale test features with `ssc`, the StandardScaler fitted on the training data.

    Only standard scaling is applied (no column dropping, no log1p), which matches
    the model that is fit on `X_data_ssc`.

    Parameters
    ----------
    X_test_data : test feature table, passed straight to `ssc.transform`.

    Returns
    -------
    The value returned by `ssc.transform(X_test_data)`.
    """
    return ssc.transform(X_test_data)

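### Note: if the final model is trained on a skew-logged or column-dropped dataset, the same log1p / drop_col steps must also be applied in `transform_test`!!!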

## [Step 3] Model Training

* Training Code Below
* You must explain your method in this markdown
* (Important) Your model variable should be named 'model' !!! 

## Check Accuracy 

* Check your Train data accuracy

In [15]:
# Function to summarise how well predictions match the true labels

def measure_error(y_true, y_pred, label='', average='micro'):
    """Return the main classification metrics as a labelled pandas Series.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels.
    label : name given to the returned Series (useful when several results are
        concatenated side by side).
    average : averaging mode forwarded to precision, recall and F1. The default
        'micro' keeps the previous behaviour; for single-label multiclass data
        micro-averaged precision/recall/F1 all equal plain accuracy, so pass
        'macro' or 'weighted' to get per-class-sensitive scores on imbalanced data.

    Returns
    -------
    pd.Series with the entries 'accuracy', 'precision', 'recall',
    'balanced accuracy' and 'f1'.
    """
    return pd.Series({'accuracy': accuracy_score(y_true, y_pred),
                      'precision': precision_score(y_true, y_pred, average=average),
                      'recall': recall_score(y_true, y_pred, average=average),
                      'balanced accuracy': balanced_accuracy_score(y_true, y_pred),
                      'f1': f1_score(y_true, y_pred, average=average)},
                     name=label)

In [16]:
# k Nearest Neighbors

In [17]:
# Grid-search the k-NN hyper-parameters on every candidate dataset and report the
# best parameters and the mean 5-fold stratified CV accuracy for each one.
# The loop variable is deliberately NOT called X_data: reusing that name would
# overwrite the raw feature frame with the last dataset in the dictionary and
# break any later re-run of the preprocessing cells.
for data_name, X_candidate in X_data_dict.items():
    knn = KNeighborsClassifier()
    GS_knn = GridSearchCV(knn,
                          param_grid={
                              'n_neighbors': list(range(2, 21)),
                              'p': [1, 2],
                              'weights': ['uniform', 'distance']
                          },
                          scoring='accuracy',
                          refit=True,
                          cv=StratifiedKFold(n_splits=5)
                         )
    GS_knn.fit(X_candidate, y_data)
    print(data_name, GS_knn.best_params_, GS_knn.best_score_)

X_data {'n_neighbors': 12, 'p': 1, 'weights': 'distance'} 0.8861538461538462
X_data_drop {'n_neighbors': 10, 'p': 1, 'weights': 'uniform'} 0.8553846153846154
X_data_msc {'n_neighbors': 16, 'p': 1, 'weights': 'distance'} 0.8892307692307693
X_data_ssc {'n_neighbors': 9, 'p': 1, 'weights': 'uniform'} 0.8876923076923078
X_data_msc_skew {'n_neighbors': 12, 'p': 1, 'weights': 'distance'} 0.8953846153846154
X_data_msc_drop {'n_neighbors': 9, 'p': 1, 'weights': 'uniform'} 0.8569230769230771
X_data_ssc_drop {'n_neighbors': 10, 'p': 1, 'weights': 'uniform'} 0.8553846153846154
X_data_msc_skew_drop {'n_neighbors': 13, 'p': 1, 'weights': 'distance'} 0.8630769230769232


In [18]:
# k-NN with the grid-search winner for the MinMax + log1p dataset
# (CV accuracy 0.8953846153846154 => best of all candidate datasets).
# NOTE: `model` is reassigned in the following cells, so this fit is for comparison only.
# NOTE(review): with weights='distance' a training point is presumably its own
# zero-distance neighbour, so perfect training accuracy is expected - confirm.
model = KNeighborsClassifier(weights='distance', p=1, n_neighbors=12).fit(X_data_msc_skew, y_data)
y_pred = model.predict(X_data_msc_skew)
(
    confusion_matrix(y_true=y_data, y_pred=y_pred),
    measure_error(y_true=y_data, y_pred=y_pred),
)

(array([[336,   0,   0],
        [  0, 240,   0],
        [  0,   0,  74]]),
 accuracy             1.0
 precision            1.0
 recall               1.0
 balanced accuracy    1.0
 f1                   1.0
 Name: , dtype: float64)

In [19]:
# k-NN with the grid-search winner for the MinMax-scaled dataset
# (CV accuracy 0.8892307692307693 => best parameters for this dataset).
# NOTE: `model` is reassigned again in the next cell.
# NOTE(review): with weights='distance' a training point is presumably its own
# zero-distance neighbour, so perfect training accuracy is expected - confirm.
model = KNeighborsClassifier(weights='distance', p=1, n_neighbors=16).fit(X_data_msc, y_data)
y_pred = model.predict(X_data_msc)
(
    confusion_matrix(y_true=y_data, y_pred=y_pred),
    measure_error(y_true=y_data, y_pred=y_pred),
)

(array([[336,   0,   0],
        [  0, 240,   0],
        [  0,   0,  74]]),
 accuracy             1.0
 precision            1.0
 recall               1.0
 balanced accuracy    1.0
 f1                   1.0
 Name: , dtype: float64)

In [20]:
# k-NN with the grid-search winner for the standard-scaled dataset
# (CV accuracy 0.8876923076923078 => best parameters for this dataset).
# This is the last assignment of `model`, i.e. the estimator the test cell uses;
# it pairs with transform_test(), which applies the same StandardScaler (`ssc`).
model = KNeighborsClassifier(weights='uniform', p=1, n_neighbors=9).fit(X_data_ssc, y_data)
y_pred = model.predict(X_data_ssc)
(
    confusion_matrix(y_true=y_data, y_pred=y_pred),
    measure_error(y_true=y_data, y_pred=y_pred),
)

(array([[317,  13,   6],
        [  6, 233,   1],
        [ 30,   3,  41]]),
 accuracy             0.909231
 precision            0.909231
 recall               0.909231
 balanced accuracy    0.822780
 f1                   0.909231
 Name: , dtype: float64)

## Analysis 

* Analyze your model's result
* You may use additional metrics (F1 Score, Confusion matrix) or visualize your results using plots
* Hint : PCA plot will help you understand the dataset (Which class is the most challenging class to classify?)
* Hint : You may also compare different models to choose the best one among the classifiers that we learned this semester

In [11]:
# Your code here

# Test data

* TA will check your model's test data accuracy
* (Important) Do not change the code below

In [12]:
# Load the test CSV; every column in the file is used as a feature
filepath = os.sep.join( ['data', 'music_test_data.csv'])
t_data = pd.read_csv(filepath)
features = t_data.columns
X_t_data = t_data[features]
# Apply the preprocessing defined in transform_test() before predicting
X_t_data = transform_test(X_t_data)

# Predict with the final `model` and write one integer label per line to out.txt
y_pred = model.predict(X_t_data)
np.savetxt('out.txt', y_pred, fmt='%d', delimiter='\n')