In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm

In [2]:
# Read the train and test splits from CSV files in the working directory,
# then preview the first rows of the training frame.
train, test = (pd.read_csv(path) for path in ('etctrain.csv', 'etctest.csv'))
train.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t16,t17,t18,t19,t20,t21,t22,t23,t24,group
0,315,188,124,107,133,204,479,1329,1442,1176,...,1905,2295,2624,1956,1577,1361,1168,844,603,4
1,445,310,213,164,165,251,383,873,1034,1110,...,2285,2570,2750,2428,1967,1962,1943,1562,1080,5
2,714,487,311,209,231,268,359,668,906,1137,...,2749,2950,2979,2309,2052,2361,2520,1948,989,1
3,559,247,126,89,135,296,859,1846,1808,1291,...,1791,2151,2235,1835,1288,1054,935,720,427,2
4,312,176,127,107,138,227,524,1352,1473,1150,...,1791,2168,2277,1821,1325,1039,917,641,451,3


## Clustering

In [3]:
# Agglomerative (hierarchical) clustering: Ward linkage, 5 clusters,
# fitted on the first 24 columns of `train`.
clustering_model = AgglomerativeClustering(linkage="ward", n_clusters=5).fit(train.iloc[:, :24])
labels = clustering_model.labels_
# Rows: cluster id assigned by the model; columns: last column of `train`.
pd.crosstab(index=labels, columns=train.iloc[:, -1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,45,0,0,0,1
1,0,35,24,23,0
2,0,11,8,8,9
3,0,0,0,0,36
4,1,2,117,15,0


In [4]:
# K-means with 5 clusters, fitted on the first 24 columns of `train`.
# random_state pins the centroid initialisation so Restart & Run All gives
# the same partition every time; n_init=10 states the number of restarts
# explicitly instead of relying on a default that differs across
# scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(train.iloc[:,:24])
labels = kmeans.labels_
# Rows: cluster id (arbitrary numbering); columns: last column of `train`.
pd.crosstab(labels , train.iloc[:,-1])



group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,6,82,5,0
1,42,0,0,0,0
2,0,12,19,15,3
3,3,1,0,0,43
4,0,29,48,26,0


In [5]:
# Gaussian mixture model with 5 components, fitted on the first 24 columns
# of `train`.
# random_state fixes the initialisation so the component assignment is
# reproducible across kernel restarts.
gm = GaussianMixture(n_components=5, random_state=0)
gm.fit(train.iloc[:,:24])
pred = gm.predict(train.iloc[:,:24])
# Rows: component id (arbitrary numbering); columns: last column of `train`.
pd.crosstab(pred , train.iloc[:,-1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1,122,26,0
1,3,3,0,0,43
2,0,37,2,0,0
3,0,7,25,20,3
4,42,0,0,0,0


## Classification

In [6]:
# Linear discriminant analysis: train on the first 24 columns of `train`
# against its last column, then predict the test rows.
model = LinearDiscriminantAnalysis().fit(train.iloc[:, :24], train.iloc[:, -1])
pred = model.predict(test.iloc[:, :24])
# Confusion table -- rows: predicted label, columns: last column of `test`.
pd.crosstab(index=pred, columns=test.iloc[:, -1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12,0,1,0,0
2,0,20,1,0,0
3,0,0,37,5,0
4,0,0,5,6,0
5,0,0,1,0,12


In [7]:
# Misclassification rate = share of test rows whose prediction differs from
# the label in the last column of `test`. The element-wise comparison is
# independent of the test-set size and stays correct when some class is never
# predicted (a crosstab would then be non-square and its diagonal misaligned).
print('misclassification rate = ', np.mean(pred != test.iloc[:,-1].to_numpy()))

misclassification rate =  0.13


In [8]:
# CART (decision tree): train on the first 24 columns of `train` against its
# last column, then predict the test rows.
# random_state makes the fitted tree, and therefore the confusion table
# below, reproducible across kernel restarts.
model = DecisionTreeClassifier(random_state=0)
model.fit(train.iloc[:,:24], train.iloc[:,-1])
pred = model.predict(test.iloc[:,:24])
# Confusion table -- rows: predicted label, columns: last column of `test`.
pd.crosstab(pred, test.iloc[:,-1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,11,0,0,0,0
2,0,17,2,0,0
3,1,3,36,3,1
4,0,0,7,8,0
5,0,0,0,0,11


In [9]:
# Misclassification rate = share of test rows whose prediction differs from
# the label in the last column of `test`. The element-wise comparison is
# independent of the test-set size and stays correct when some class is never
# predicted (a crosstab would then be non-square and its diagonal misaligned).
print('misclassification rate = ', np.mean(pred != test.iloc[:,-1].to_numpy()))

misclassification rate =  0.17000000000000004


In [10]:
# Neural network (multi-layer perceptron) with default architecture: train on
# the first 24 columns of `train` against its last column, then predict the
# test rows.
# random_state fixes the weight initialisation so the result is reproducible
# across kernel restarts.
# NOTE(review): the inputs are raw values in the hundreds to thousands (see
# train.head()); consider standardising them before the MLP -- confirm
# whether that is wanted for this comparison.
model = MLPClassifier(random_state=0)
model.fit(train.iloc[:,:24], train.iloc[:,-1])
pred = model.predict(test.iloc[:,:24])
# Confusion table -- rows: predicted label, columns: last column of `test`.
pd.crosstab(pred, test.iloc[:,-1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,7,0,1,0,1
2,0,20,2,1,0
3,0,0,42,7,0
4,0,0,0,2,0
5,5,0,0,1,11


In [11]:
# Misclassification rate = share of test rows whose prediction differs from
# the label in the last column of `test`. The element-wise comparison is
# independent of the test-set size and stays correct when some class is never
# predicted (a crosstab would then be non-square and its diagonal misaligned).
print('misclassification rate = ', np.mean(pred != test.iloc[:,-1].to_numpy()))

misclassification rate =  0.18000000000000005


In [12]:
# Support vector machine, linear kernel (C=1): train on the first 24 columns
# of `train` against its last column, then predict the test rows.
model = svm.SVC(C=1, kernel='linear', decision_function_shape='ovo').fit(
    train.iloc[:, :24], train.iloc[:, -1]
)
pred = model.predict(test.iloc[:, :24])
# Confusion table -- rows: predicted label, columns: last column of `test`.
pd.crosstab(index=pred, columns=test.iloc[:, -1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,10,0,0,0,0
2,0,18,2,1,0
3,0,2,37,4,1
4,0,0,5,6,0
5,2,0,1,0,11


In [13]:
# Misclassification rate = share of test rows whose prediction differs from
# the label in the last column of `test`. The element-wise comparison is
# independent of the test-set size and stays correct when some class is never
# predicted (a crosstab would then be non-square and its diagonal misaligned).
print('misclassification rate = ', np.mean(pred != test.iloc[:,-1].to_numpy()))

misclassification rate =  0.18000000000000005


In [14]:
# Support vector machine, degree-3 polynomial kernel (C=1): train on the first
# 24 columns of `train` against its last column, then predict the test rows.
model = svm.SVC(C=1, kernel='poly', degree=3, decision_function_shape='ovo').fit(
    train.iloc[:, :24], train.iloc[:, -1]
)
pred = model.predict(test.iloc[:, :24])
# Confusion table -- rows: predicted label, columns: last column of `test`.
pd.crosstab(index=pred, columns=test.iloc[:, -1])

group,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,10,0,0,0,0
2,0,20,1,0,0
3,0,0,42,4,1
4,0,0,2,7,0
5,2,0,0,0,11


In [15]:
# Misclassification rate = share of test rows whose prediction differs from
# the label in the last column of `test`. The element-wise comparison is
# independent of the test-set size and stays correct when some class is never
# predicted (a crosstab would then be non-square and its diagonal misaligned).
print('misclassification rate = ', np.mean(pred != test.iloc[:,-1].to_numpy()))

misclassification rate =  0.09999999999999998
