In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn import datasets, linear_model
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [6]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [8]:
# Candidate estimators, created once here and reused by later cells.
# The names knn / svm / linreg / logreg are referenced further down.

linreg = LinearRegression()
logreg = LogisticRegression()
svm = LinearSVC()
# distance-weighted k-nearest-neighbours with k = 10
knn = KNeighborsClassifier(weights='distance', n_neighbors=10)

In [None]:
# Run every dataset (5 of them) through multiple models to find the best model.
#
# For each dataset: load features and labels, report basic shape information,
# impute missing values, hold out 10% of the samples, fit every model on the
# remaining 90% and print its score on the held-out 10%.

# Fixed seed so the train/test split (and therefore the printed scores) is
# reproducible across Restart & Run All.
RANDOM_STATE = 0

for i in range(1, 6):
    # put data in pandas dataframe
    data = 'datasets/Classification/Data{}/TrainData{}.txt'.format(i, i)
    label = 'datasets/Classification/Data{}/TrainLabel{}.txt'.format(i, i)
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    X = pd.read_csv(data, sep=r'\s+', header=None)
    y = pd.read_csv(label, header=None)
    print()
    print('''*** Dataset {} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    print('Number of Features: ' + str(X.shape[1]))
    print('Classes: ' + str(y[0].unique()))

    # fill missing values
    # Values >= 1e99 are treated as missing markers and turned into NaN first.
    # This has to happen before the isnull() check; otherwise a dataset whose
    # missing entries are encoded only by the sentinel would never be cleaned.
    X = X[X < 1e99]
    if X.isnull().any().any():
        # fill linearly along each column
        X = X.interpolate()
        # fill whatever interpolation could not reach (e.g. leading NaNs) with the column mean
        X = X.fillna(X.mean())
    # NOTE(review): imputation runs before the split, so fill values are
    # computed from all rows including the held-out ones - confirm this is acceptable.

    # split into 90% train and 10% test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, shuffle=True, random_state=RANDOM_STATE
    )
    # estimators expect 1-D label arrays
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

    # run through each model and print results
    # Models are fit on the training portion only; fitting on the full data
    # and scoring on a subset of it would report inflated, leaked scores.
    # NOTE: LinearRegression.score reports R^2, not classification accuracy,
    # so its number is not directly comparable with the other three.
    models = (
        ('knn: ', knn),
        ('svm: ', svm),
        ('linear regression: ', linreg),
        ('logistic regression ', logreg),
    )
    for prefix, model in models:
        model.fit(X_train, y_train)
        print(prefix + str(model.score(X_test, y_test)))