In [32]:
import pandas as pd
import numpy as np

In [43]:
from sklearn import datasets, linear_model
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from matplotlib import pyplot as plt

In [44]:
X = pd.read_csv('datasets/Classification/Data1/TrainData1.txt', sep='\s+', header=None)
y = pd.read_csv('datasets/Classification/Data1/TrainLabel1.txt', header=None)
print('X.shape:' + str(X.shape))
print('y.shape:' + str(y.shape))

X.shape:(150, 3312)
y.shape:(150, 1)


In [45]:
N, num_features = X.shape
print('N: ' + str(N))
print('num_features: ' + str(num_features))

N: 150
num_features: 3312


In [46]:
# fill missing values
X = X[X < 1e99]
# linear
X = X.interpolate()
# fill outside values with mean
X = X.fillna(X.mean())
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3302,3303,3304,3305,3306,3307,3308,3309,3310,3311
0,3.824254,1.923762,1.91845,2.352067,3.117298,3.051735,3.307977,3.430222,3.586667,3.605218,...,1.83683,1.85564,1.142389,2.054345,2.808224,1.782186,2.665703,2.468214,2.478581,2.308842
1,3.90419,2.309524,2.15293,2.35003,3.532368,3.524866,3.677791,3.636671,3.696868,3.716764,...,1.951532,1.442323,1.0,2.127914,2.979658,1.961089,2.519027,2.054383,2.689903,2.090928
2,3.750908,1.161068,1.017033,2.347993,3.381889,3.393096,3.509134,3.512466,3.622203,3.60305,...,1.0,1.584105,1.0,1.945321,3.257004,1.965061,2.536066,1.449324,2.60523,1.368659
3,3.809383,1.912355,1.85694,2.498944,3.289406,3.371232,3.541995,3.514497,3.473179,3.62893,...,1.869965,1.481658,1.0,2.155032,3.270371,1.928473,2.618074,2.154013,2.530046,2.185514
4,3.893561,2.094192,1.881271,2.785707,3.344339,3.274417,3.485872,3.516527,3.642358,3.688235,...,1.480725,1.510545,1.0,2.094192,3.246666,1.824516,2.562317,1.942256,2.598517,1.764624


In [47]:
y[0].unique()

array([1, 2, 4, 3, 5])

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, stratify=y)

In [49]:
X_train.shape

(135, 3312)

## KNN(Distance Weighted, Euclidian Distance) ##

In [50]:
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
svm = LinearSVC()
linreg = LinearRegression()

In [51]:
# Cross Validation

kf = KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(X_train):
    X_train_split, y_train_split = X_train.iloc[train_index], y_train.iloc[train_index]
    X_valid_split, y_valid_split = X_train.iloc[test_index], y_train.iloc[test_index]
    knn.fit(X_train_split, y_train_split.values.flatten())
    print('knn: ' + str(knn.score(X_valid_split, y_valid_split)))
    svm.fit(X_train_split, y_train_split.values.flatten())
    print('svm: ' + str(svm.score(X_valid_split, y_valid_split)))
    linreg.fit(X_train_split, y_train_split.values.flatten())
    print('linear regression: ' + str(svm.score(X_valid_split, y_valid_split)))

knn: 1.0




svm: 1.0
linear regression: 1.0
knn: 0.9285714285714286




svm: 1.0
linear regression: 1.0
knn: 0.8571428571428571




svm: 0.9285714285714286
linear regression: 0.9285714285714286
knn: 0.8571428571428571




svm: 0.9285714285714286
linear regression: 0.9285714285714286
knn: 0.9285714285714286




svm: 1.0
linear regression: 1.0
knn: 1.0




svm: 1.0
linear regression: 1.0
knn: 0.9230769230769231




svm: 0.9230769230769231
linear regression: 0.9230769230769231
knn: 1.0




svm: 1.0
linear regression: 1.0
knn: 0.9230769230769231




svm: 1.0
linear regression: 1.0
knn: 0.8461538461538461
svm: 0.8461538461538461
linear regression: 0.8461538461538461




In [55]:
# whole dataset, 90% train, 10% test
knn.fit(X, y.values.flatten())
print('knn: ' + str(knn.score(X_test, y_test)))
svm.fit(X, y.values.flatten())
print('svm: ' + str(svm.score(X_test, y_test)))
linreg.fit(X, y.values.flatten())
print('linear regression: ' + str(linreg.score(X_test, y_test)))

knn: 1.0
svm: 1.0
linear regression: 1.0




## SVM

In [75]:
for i in range(1,6):
    data = 'datasets/Classification/Data{}/TrainData{}.txt'.format(i, i)
    label = 'datasets/Classification/Data{}/TrainLabel{}.txt'.format(i, i)
    X = pd.read_csv(data, sep='\s+', header=None)
    y = pd.read_csv(label, header=None)
    X = X[X < 1e99]
    # fill missing values
    if X.isnull().any().any():
        # change to nan
        X = X[X < 1e99]
        # fill linear-ly
        X = X.interpolate()
        # fill outside values with mean
        X = X.fillna(X.mean())

In [64]:
y.head()

Unnamed: 0,0
0,5
1,5
2,5
3,6
4,5
