In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn import datasets, linear_model
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

## All the models

## Run every dataset through every model

In [34]:
# Run every dataset(5 of them) through multiple models to find best model
results_list = [] # list of dataframes with scores
for i in range(1,2):
    # put data in pandas dataframe
    data = 'datasets/Classification/Data{}/TrainData{}.txt'.format(i, i)
    label = 'datasets/Classification/Data{}/TrainLabel{}.txt'.format(i, i)
    X = pd.read_csv(data, sep='\s+', header=None)
    y = pd.read_csv(label, header=None)
    print()
    print('''*** Dataset {} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    print('Number of Features: ' + str(X.shape[1]))
    print('Classes: ' + str(y[0].unique()))
    
    # fill missing values
    if X.isnull().any().any():
        # change to nan
        X = X[X < 1e99]
        # fill linear-ly
        X = X.interpolate()
        # fill outside values with mean
        X = X.fillna(X.mean())

    #, stratify=y
    # split into 90% train and 10% test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)
    
    results = {'knn': [],
               'svm': [],
               'linreg': [],
               'naive_bayes': [],
               'logreg': [],
               'random_forest': []}
    
    # KFold Cross Validation
    kf = KFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(X_train):
        
        # Instantiate Models
        knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
        svm = LinearSVC()
        linreg = LinearRegression()
        naive_bayes = MultinomialNB()
        logreg = LogisticRegression(multi_class='multinomial', solver='sag')
        randomforest = RandomForestClassifier(n_estimators=100)
        
        # Fit and score, store in results dic
        knn.fit(X_train, y_train.values.flatten())
        results['knn'].append(knn.score(X_test, y_test))
        svm.fit(X_train, y_train.values.flatten())
        results['svm'].append(svm.score(X_test, y_test))
        linreg.fit(X_train, y_train.values.flatten())
        results['linreg'].append(linreg.score(X_test, y_test))
        if(i != 4):
            naive_bayes.fit(X_train, y_train.values.flatten())
            results['naive_bayes'].append(naive_bayes.score(X_test, y_test))
        if(i != 1):
            logreg.fit(X_train, y_train.values.flatten())
            results['logreg'].append(logreg.score(X_test, y_test))
        #randomforest.fit(X_train, y_train.values.flatten())
        #results['random_forest'].append(randomforest.score(X_test, y_test))
        
    df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in results.items() ]))    
    results_list.append(df)


*** Dataset 1 ***
Number of Samples: 150
Number of Features: 3312
Classes: [1 2 4 3 5]


In [35]:
results_list[0]

Unnamed: 0,knn,svm,linreg,naive_bayes,logreg,random_forest
0,0.733333,0.733333,-0.034735,0.733333,,
1,0.733333,0.733333,-0.034735,0.733333,,
2,0.733333,0.733333,-0.034735,0.733333,,
3,0.733333,0.733333,-0.034735,0.733333,,
4,0.733333,0.733333,-0.034735,0.733333,,


In [27]:
results_list[1]

Unnamed: 0,knn,svm,linreg,naive_bayes,logreg,random_forest
0,1.0,1.0,0.862046,1.0,1.0,
1,1.0,1.0,0.862046,1.0,1.0,
2,1.0,1.0,0.862046,1.0,1.0,
3,1.0,1.0,0.862046,1.0,1.0,
4,1.0,1.0,0.862046,1.0,1.0,


In [28]:
results_list[2]

Unnamed: 0,knn,svm,linreg,naive_bayes,logreg,random_forest
0,0.322222,0.268254,-0.001209,0.136508,0.18254,
1,0.322222,0.261905,-0.001209,0.136508,0.18254,
2,0.322222,0.244444,-0.001209,0.136508,0.18254,
3,0.322222,0.263492,-0.001209,0.136508,0.18254,
4,0.322222,0.266667,-0.001209,0.136508,0.18254,


In [29]:
results_list[3]

Unnamed: 0,knn,svm,linreg,naive_bayes,logreg,random_forest
0,0.760784,0.509804,0.743503,,0.627451,
1,0.760784,0.498039,0.743503,,0.627451,
2,0.760784,0.537255,0.743503,,0.627451,
3,0.760784,0.545098,0.743503,,0.627451,
4,0.760784,0.486275,0.743503,,0.627451,


In [30]:
results_list[4]

Unnamed: 0,knn,svm,linreg,naive_bayes,logreg,random_forest
0,0.598214,0.455357,0.392804,0.473214,0.553571,
1,0.598214,0.607143,0.392804,0.473214,0.553571,
2,0.598214,0.383929,0.392804,0.473214,0.553571,
3,0.598214,0.348214,0.392804,0.473214,0.553571,
4,0.598214,0.392857,0.392804,0.473214,0.553571,


In [38]:
# Run every dataset(5 of them) through multiple models, with different parameters to find best model
best_estimators = {'knn_best': [],
                   'linreg_best': []}
for i in range(1,6):
    # put data in pandas dataframe
    data = 'datasets/Classification/Data{}/TrainData{}.txt'.format(i, i)
    label = 'datasets/Classification/Data{}/TrainLabel{}.txt'.format(i, i)
    X = pd.read_csv(data, sep='\s+', header=None)
    y = pd.read_csv(label, header=None)
    print()
    print('''*** Dataset {} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    print('Number of Features: ' + str(X.shape[1]))
    print('Classes: ' + str(y[0].unique()))
    
    # fill missing values
    if X.isnull().any().any():
        # change to nan
        X = X[X < 1e99]
        # fill linear-ly
        X = X.interpolate()
        # fill outside values with mean
        X = X.fillna(X.mean())

    #, stratify=y
    # split into 90% train and 10% test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)
    
    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': np.arange(5, 15)}
    knn_gs = GridSearchCV(knn, params_knn, cv=5)
    knn_gs.fit(X_train, y_train.values.flatten())
    
    linreg = LinearRegression()
    linreg.fit(X_train, y_train.values.flatten())
    print(linreg.score(X_test, y_test))
    
    # save best model
    best_estimators['knn_best'].append(knn_gs.best_estimator_)
    best_estimators['linreg_best'].append(linreg)
    

    print()
    print(knn_gs.best_params_)
    print('knn: {}'.format(best_estimators['knn_best'][i-1].score(X_test, y_test)))
    
    print()
    print('linreg: {}'.format(best_estimators['linreg_best'][i-1].score(X_test, y_test)))


*** Dataset 1 ***
Number of Samples: 150
Number of Features: 3312
Classes: [1 2 4 3 5]




-0.03473505573482938
{'n_neighbors': 5}
knn: 0.7333333333333333
linreg: -0.03473505573482938

*** Dataset 2 ***
Number of Samples: 100
Number of Features: 9182
Classes: [ 1  2  3  4  5  6  7  8  9 10 11]




0.8620464590676848
{'n_neighbors': 6}
knn: 0.8
linreg: 0.8620464590676848

*** Dataset 3 ***
Number of Samples: 6300
Number of Features: 13
Classes: [9 1 8 6 2 4 7 5 3]
-0.0012087552677555458
{'n_neighbors': 14}
knn: 0.3333333333333333
linreg: -0.0012087552677555458

*** Dataset 4 ***
Number of Samples: 2547
Number of Features: 112
Classes: [1 2 3 4 5 6 7 8 9]
0.7435026182520863
{'n_neighbors': 5}
knn: 0.7647058823529411
linreg: 0.7435026182520863

*** Dataset 5 ***
Number of Samples: 1119
Number of Features: 11
Classes: [5 6 7 4 8 3]
0.392803765555772
{'n_neighbors': 13}
knn: 0.49107142857142855
linreg: 0.392803765555772


In [28]:
best_estimators['linreg_best']

[LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
          normalize=False)]