# Attribute & Description
    Feature 1：歌曲的原聲程度。越接近1，表示歌曲所包含的電子音樂成份越少。 
    Feature 2：歌曲表現的強度。強度較高的歌曲，會讓人感到有活力、響亮、甚至吵雜。
    Feature 3：根據歌曲的節奏與穩定性，來評斷歌曲是否適合作為舞曲。越接近 0 的值，越不適合；越接近 1 的值越適合。 
    Feature 4：歌曲的流行度，數值是根據播放次數作為依據，數值越高表示越流行。
    Feature 5：歌曲的速度，以每分鐘節拍數 (BPM) 為單位。
    Feature 6：歌曲在有現場觀眾的情況下進行錄製的機率。數值越高表示歌曲越有可能是現場即時錄製，或是演唱會版本。
    Feature 7：歌曲傳達的情緒。數值越高，表示歌曲聽起來越積極 (快樂、輕快)；數值越低，表示歌曲聽起來越消極 (憤怒、悲傷)。
    Feature 8：歌曲的持續時間，單位為毫秒。
    Feature 9：歌曲的響度，以分貝為單位 (dB)。
    Feature 10：歌曲中是否存在「口語」。若數值偏高，可能是脫口秀、Podcast；若數值偏低，則可能是純音樂。
    Feature 11：歌曲的調性 (大調=0，小調=1)。
    Feature 12：歌曲中的樂器演奏所佔比例，數值越高，表示歌曲的樂器演奏佔比越大。
    Feature 13：歌曲的音高，內容是依據標準的 Pitch class 來進行映射。

In [648]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D # matplotlib 3.2.0 後可省略
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file beneath the Kaggle input
# directory, in the order os.walk visits them, and print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2022-ntust-data-science-hw2/submit.csv
/kaggle/input/2022-ntust-data-science-hw2/test_3000.csv
/kaggle/input/2022-ntust-data-science-hw2/train.csv


## Read training data

In [649]:
# Load the training set from the Kaggle input directory and report its
# (rows, columns) dimensions.
dataset = pd.read_csv("../input/2022-ntust-data-science-hw2/train.csv")
print(f"Shape: {dataset.shape}")
# Trailing expression: the notebook renders the first five rows.
dataset.head()

Shape: (40114, 14)


Unnamed: 0,song_id,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13
0,0,0.147,0.798,0.745,46,111.016,0.197,0.388,240000,-5.436,0.0384,1,0.651,F
1,1,0.0658,0.804,0.521,66,143.952,0.0521,0.553,224700,-4.395,0.0569,0,0.0,C
2,2,0.0395,0.96,0.755,67,99.023,0.332,0.661,170440,-3.189,0.123,1,2.4e-05,E
3,3,0.359,0.769,0.592,40,171.94,0.122,0.223,226520,-7.127,0.19,1,0.0143,D
4,4,0.16,0.838,0.769,83,93.996,0.0935,0.602,249609,-5.238,0.0633,0,0.0,D


In [650]:
# Per-column fraction of missing values: null count divided by row count.
print("Missing data ratio: ")
missing_ratio = dataset.isna().sum().div(len(dataset))
print(missing_ratio)
# The recorded output is 0.0 for every column, i.e. no missing data.

Missing data ratio: 
song_id       0.0
Feature 1     0.0
Feature 2     0.0
Feature 3     0.0
Feature 4     0.0
Feature 5     0.0
Feature 6     0.0
Feature 7     0.0
Feature 8     0.0
Feature 9     0.0
Feature 10    0.0
Feature 11    0.0
Feature 12    0.0
Feature 13    0.0
dtype: float64


In [651]:
# Replace the string labels in 'Feature 13' (e.g. 'F', 'C', 'E' in the preview
# above) with the integer codes assigned by LabelEncoder.
# NOTE(review): 'Feature 13' is dropped again in the next cell, so this
# encoding does not reach the clustering step -- confirm whether that is intended.
labelencoder = LabelEncoder()
pitch_codes = labelencoder.fit_transform(dataset['Feature 13'])
dataset['Feature 13'] = pitch_codes
# Trailing expression: the notebook renders the updated frame.
dataset

Unnamed: 0,song_id,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13
0,0,0.14700,0.798,0.745,46,111.016,0.1970,0.388,240000,-5.436,0.0384,1,0.651000,8
1,1,0.06580,0.804,0.521,66,143.952,0.0521,0.553,224700,-4.395,0.0569,0,0.000000,3
2,2,0.03950,0.960,0.755,67,99.023,0.3320,0.661,170440,-3.189,0.1230,1,0.000024,7
3,3,0.35900,0.769,0.592,40,171.940,0.1220,0.223,226520,-7.127,0.1900,1,0.014300,5
4,4,0.16000,0.838,0.769,83,93.996,0.0935,0.602,249609,-5.238,0.0633,0,0.000000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40109,40109,0.00923,0.760,0.761,48,144.977,0.4690,0.492,249667,-6.275,0.3780,0,0.000000,5
40110,40110,0.43000,0.533,0.495,50,79.710,0.1060,0.424,247055,-5.802,0.0295,1,0.000672,3
40111,40111,0.03280,0.823,0.562,43,126.023,0.2120,0.376,194293,-4.732,0.0405,0,0.000002,2
40112,40112,0.82500,0.281,0.420,46,89.423,0.1990,0.792,391613,-15.291,0.0445,0,0.000135,4


In [652]:
# Feature legend: 1 acousticness, 2 intensity, 3 rhythm & stability
# (danceability), 4 popularity, 5 tempo, 6 live recording, 7 mood, 8 duration,
# 9 loudness, 10 spoken word, 11 mode, 12 instrumental ratio, 13 pitch.
# Remove the identifier and Features 11 and 13 in place; the remaining columns
# are the ones summarised by describe() below.
dataset.drop(columns=["song_id", "Feature 11", "Feature 13"], inplace=True)
dataset.describe()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Feature 12
count,40114.0,40114.0,40114.0,40114.0,40114.0,40114.0,40114.0,40114.0,40114.0,40114.0,40114.0
mean,0.306247,0.599714,0.559131,44.464302,119.810783,0.193903,0.456827,245623.2,-9.141892,0.094101,0.180956
std,0.340959,0.264198,0.178867,15.477318,30.622201,0.161792,0.247,110377.4,6.156216,0.101841,0.32507
min,0.0,0.000792,0.0596,0.0,34.347,0.00967,0.0,15509.0,-47.046,0.0223,0.0
25%,0.0202,0.433,0.443,34.0,94.8595,0.0969,0.259,190480.0,-10.851,0.0361,0.0
50%,0.145,0.642,0.57,45.0,119.6935,0.126,0.449,227373.0,-7.29,0.049,0.000159
75%,0.55,0.815,0.688,56.0,140.205,0.243,0.648,275673.2,-5.191,0.0995,0.15
max,0.996,0.999,0.986,99.0,220.276,1.0,0.992,4497994.0,3.744,0.942,0.996


## Standard Scaler

In [653]:
# Standardise every remaining column with StandardScaler. `dataset` is rebound
# to the transformer's output, which later cells index as a NumPy array
# (dataset[:, i]) rather than as a DataFrame.
scalar = StandardScaler().fit(dataset)
dataset = scalar.transform(dataset)
print(dataset.shape)

(40114, 11)


## PCA

In [654]:
# Fit a 7-component PCA on the standardised features, then project the same
# data onto those components for the clustering step.
pca = PCA(n_components=7).fit(dataset)
dataset_pca = pca.transform(dataset)
# Trailing expression: the notebook displays the share of variance explained
# by each retained component.
pca.explained_variance_ratio_

NameError: name 'ˊ' is not defined

## KMeans

In [None]:
# Normal KMeans
kmeans = KMeans(n_clusters = 3, init ='k-means++')
label_k = kmeans.fit_predict(dataset_pca)
print(np.shape(label_k))

In [None]:
# Feature KMeans
kmeans = KMeans(n_clusters = 3, init ='k-means++')
label_f = []
for i in range(0, 11):
    label_f.append(kmeans.fit_predict(np.array(dataset[:,i]).reshape(-1, 1)))
    
print(np.shape(label_f))
print(label_f)

## Read test data

In [None]:
# Load the test pairs; the submission cell reads the 'id', 'col_1' and 'col_2'
# columns from this frame.
test_path = "../input/2022-ntust-data-science-hw2/test_3000.csv"
test_data = pd.read_csv(test_path)
# Trailing expression: the notebook renders the first five rows.
test_data.head()

In [None]:
# Build the submission by weighted majority vote.
#
# For every test pair (col_1, col_2) -- both values are used as indices into
# the label arrays computed above -- each single-feature clustering in
# `label_f` casts one vote and the PCA-based clustering `label_k` casts two.
# 'ans' is 1 when the "same cluster" votes strictly outnumber the "different
# cluster" votes, otherwise 0.
# NOTE(review): presumably col_1/col_2 are row positions in the training set
# -- confirm against the competition description.
records = []

# Walk the rows directly instead of treating the 'id' value as a row position
# (test_data.iloc[id]), which only works when ids are exactly 0..n-1 in order.
for pair_id, s1, s2 in zip(test_data['id'], test_data['col_1'], test_data['col_2']):
    # One vote per single-feature clustering.
    y_count = sum(int(labels[s1] == labels[s2]) for labels in label_f)
    n_count = len(label_f) - y_count
    # The clustering on the PCA projection counts double.
    if label_k[s1] == label_k[s2]:
        y_count += 2
    else:
        n_count += 2
    records.append({'id': int(pair_id), 'ans': int(y_count > n_count)})

# Create the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call. Naming the columns
# explicitly keeps 'id' and 'ans' present even when there are no test rows.
submit = pd.DataFrame(records, columns=['id', 'ans'])
print(submit.dtypes)
print(submit)
submit.to_csv('submission.csv', index=False)
print(submit.loc[:, "ans"].value_counts())
