In [84]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn import datasets, linear_model
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

## Brute-Force Model Search on Training Data

In [138]:
# Brute-force model search: for each of the 5 datasets, fit several classifiers
# (grid-searching KNN and random forest), score every model on a 10% hold-out
# split, and keep both the fitted models and a one-row score table per dataset.
best_estimators = {'knn_best': [],
                   'svm_best': [],
                   'svc_best': [],
                   'rf_best': [],
                   'dt_best': [],
                   'ensemble_best': [],
                   }
# One single-row DataFrame of hold-out scores per dataset; concatenated in the
# results cell further down.
results_list = []
best_params = {'knn': [],
               'rf': [],}
cv = 3
# Cells at or above the threshold are treated as missing. Dataset 3 uses the
# same 1e8 cut-off that the prediction cell applies to its test file so that
# train and test data are cleaned identically; all other datasets use 1e99.
# NOTE(review): confirm 1e8 is the right sentinel cut-off for TrainData3.
default_missing_threshold = 1e99
missing_thresholds = {3: 1e8}
for i in range(1,6):
    # put data in pandas dataframe
    data = 'datasets/Classification/Data{}/TrainData{}.txt'.format(i, i)
    label = 'datasets/Classification/Data{}/TrainLabel{}.txt'.format(i, i)
    # raw string: '\s' is not a valid escape in a normal string literal
    X = pd.read_csv(data, sep=r'\s+', header=None)
    y = pd.read_csv(label, header=None)
    print(X.shape)
    print()
    print('''*** Dataset {} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    print('Number of Features: ' + str(X.shape[1]))
    print('Classes: ' + str(y[0].unique()))

    # fill missing values
    # change sentinel values to nan
    threshold = missing_thresholds.get(i, default_missing_threshold)
    X = X.where(X < threshold)
    # fill linear-ly
    #X = X.interpolate()
    # fill the resulting gaps with the column mean
    X = X.fillna(X.mean())

    #, stratify=y
    # split into 90% train and 10% test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)
    #X_train = X
    #y_train = y

    # scikit-learn expects 1-D label arrays; flatten once and reuse
    y_train_1d = y_train.values.ravel()
    y_test_1d = y_test.values.ravel()

    results = {'knn': [],
               'svm': [],
               'svc': [],
               'rf': [],
               'dt': [],
               'ensemble': [],}

    # KNN (grid search over the neighbourhood size)
    knn = KNeighborsClassifier(weights='distance')
    params_knn = {'n_neighbors': np.arange(5, 15)}
    knn_gs = GridSearchCV(knn, params_knn, cv=cv)
    knn_gs.fit(X_train, y_train_1d)
    best_estimators['knn_best'].append(knn_gs.best_estimator_)
    results['knn'].append(knn_gs.best_estimator_.score(X_test, y_test_1d))
    best_params['knn'].append(knn_gs.best_params_)
    print(knn_gs.best_params_)

    # SVM
    svm = LinearSVC()
    svm.fit(X_train, y_train_1d)
    best_estimators['svm_best'].append(svm)
    results['svm'].append(svm.score(X_test, y_test_1d))

    # SVC
    svc = SVC()
    svc.fit(X_train, y_train_1d)
    best_estimators['svc_best'].append(svc)
    results['svc'].append(svc.score(X_test, y_test_1d))

    # Random Forest (grid search over the number of trees)
    rf = RandomForestClassifier(random_state=0)
    params_rf = {'n_estimators': [50, 100, 125, 150, 175, 200]}
    rf_gs = GridSearchCV(rf, params_rf, cv=cv)
    rf_gs.fit(X_train, y_train_1d)
    best_estimators['rf_best'].append(rf_gs.best_estimator_)
    results['rf'].append(rf_gs.best_estimator_.score(X_test, y_test_1d))
    print(rf_gs.best_params_)
    best_params['rf'].append(rf_gs.best_params_)

    # Decision Tree
    dt = DecisionTreeClassifier(random_state=0)
    dt.fit(X_train, y_train_1d)
    best_estimators['dt_best'].append(dt)
    results['dt'].append(dt.score(X_test, y_test_1d))

    # Ensemble: fresh, unfitted copies of every model, using the grid-searched
    # hyper-parameters for KNN and random forest
    knn = KNeighborsClassifier(weights='distance', n_neighbors=knn_gs.best_params_['n_neighbors'])
    svm = LinearSVC()
    svc = SVC()
    rf = RandomForestClassifier(n_estimators=rf_gs.best_params_['n_estimators'], random_state=0)
    dt = DecisionTreeClassifier(random_state=0)

    estimators=[('knn', knn),
                ('svm', svm),
                ('svc', svc),
                ('dt', dt),
                ('rf', rf)]

    #create our voting classifier, inputting our models
    ensemble = VotingClassifier(estimators, voting='hard')
    ensemble.fit(X_train, y_train_1d)
    best_estimators['ensemble_best'].append(ensemble)
    results['ensemble'].append(ensemble.score(X_test, y_test_1d))

    # Record this dataset's scores as a one-row table (one column per model);
    # without this, results_list stays empty and the results cell cannot run.
    results_list.append(pd.DataFrame(results))

(150, 3312)

*** Dataset 1 ***
Number of Samples: 150
Number of Features: 3312
Classes: [1 2 4 3 5]




{'n_neighbors': 5}




{'n_estimators': 200}




(100, 9182)

*** Dataset 2 ***
Number of Samples: 100
Number of Features: 9182
Classes: [ 1  2  3  4  5  6  7  8  9 10 11]




{'n_neighbors': 5}




{'n_estimators': 150}




(6300, 13)

*** Dataset 3 ***
Number of Samples: 6300
Number of Features: 13
Classes: [9 1 8 6 2 4 7 5 3]
{'n_neighbors': 14}




{'n_estimators': 175}




(2547, 112)

*** Dataset 4 ***
Number of Samples: 2547
Number of Features: 112
Classes: [1 2 3 4 5 6 7 8 9]




{'n_neighbors': 6}




{'n_estimators': 175}




(1119, 11)

*** Dataset 5 ***
Number of Samples: 1119
Number of Features: 11
Classes: [5 6 7 4 8 3]
{'n_neighbors': 11}




{'n_estimators': 50}




## Save Test Predictions to Disk

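**Before running this section:** to predict with models trained on *all* of the training data, edit the model-search cell above — comment out the `train_test_split` call, use `X_train = X` and `y_train = y` instead — and re-run it.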

In [143]:
# Predict labels for the test file of dataset `i` with that dataset's fitted
# voting ensemble and write them, one integer per line, to a text file.
i = 5
data = 'datasets/Classification/Data{}/TestData{}.txt'.format(i, i)
# raw string: '\s' is not a valid escape in a normal string literal
X = pd.read_csv(data, sep=r'\s+', header=None)
print(X.shape)
print(X.shape[0])
# fill missing values
# change sentinel values to nan: dataset 3 uses a 1e8 cut-off, all others 1e99
threshold = 1e8 if i == 3 else 1e99
X = X[X < threshold]
# fill the resulting gaps with the column mean
# NOTE(review): this is the mean of the *test* columns, not the training columns
X = X.fillna(X.mean())

labels = pd.DataFrame(best_estimators['ensemble_best'][i-1].predict(X))
print(labels.shape[0])
np.savetxt(r'TomyClassification{}.txt'.format(i), labels, fmt='%d')

(480, 11)
480
480


In [None]:
# Stack the per-dataset score tables into one frame and label the rows 1..N
# instead of 0..N-1.
combined_scores = pd.concat(results_list, ignore_index=True)
combined_scores.index = pd.RangeIndex(start=1, stop=len(combined_scores) + 1)
results = combined_scores
results

In [102]:
# First row of `results` (index label 1): per-model scores, highest first.
results.iloc[0].sort_values(ascending=False)

svm         0.866667
ensemble    0.800000
dt          0.800000
knn         0.800000
rf          0.733333
svc         0.733333
Name: 1, dtype: float64

In [103]:
# Second row of `results` (index label 2): per-model scores, highest first.
results.iloc[1].sort_values(ascending=False)

ensemble    1.0
rf          1.0
svm         1.0
knn         1.0
dt          0.7
svc         0.5
Name: 2, dtype: float64

In [104]:
# Third row of `results` (index label 3): per-model scores, highest first.
results.iloc[2].sort_values(ascending=False)

ensemble    0.347619
svc         0.341270
rf          0.334921
knn         0.325397
svm         0.301587
dt          0.293651
Name: 3, dtype: float64

In [105]:
# Fourth row of `results` (index label 4): per-model scores, highest first.
results.iloc[3].sort_values(ascending=False)

rf          0.929412
ensemble    0.878431
dt          0.854902
knn         0.764706
svm         0.466667
svc         0.121569
Name: 4, dtype: float64

In [106]:
# Fifth row of `results` (index label 5): per-model scores, highest first.
results.iloc[4].sort_values(ascending=False)

rf          0.714286
ensemble    0.669643
dt          0.607143
svc         0.580357
knn         0.571429
svm         0.178571
Name: 5, dtype: float64