In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier


In [17]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [18]:
# Split the training table column-wise: 'categories' is the target, every
# other column is a feature.
# NOTE(review): no other column is dropped here, so an identifier column such
# as 'id' (which generate_predictions_file reads from the test table) would be
# used as a feature if train.csv has one — confirm this is intended.
is_label_column = train_df.columns == 'categories'
X_all_train = train_df.loc[:, ~is_label_column]
y_all_train = train_df.loc[:, is_label_column]

In [19]:
# Take the label column straight from train_df instead of re-deriving it from
# y_all_train itself: the self-referential form only works the first time the
# cell runs (once y_all_train is a Series, `.categories` no longer resolves).
y_all_train = train_df['categories']

### KNN

In [77]:
def knn_param_selection(X, y, nfolds, n_components=20):
    """Grid-search ``n_neighbors`` for a scaled, PCA-reduced KNN classifier.

    The searched model is a pipeline of StandardScaler -> PCA ->
    KNeighborsClassifier; only ``knn__n_neighbors`` is tuned.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    nfolds : value forwarded as ``cv`` to ``GridSearchCV``.
    n_components : number of components kept by the PCA step. Defaults to 20,
        the value that used to be hard-coded.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.

    Side effects
    ------------
    Prints the raw ``cv_results_`` dict and the best cross-validation score.
    """
    param_grid = [{'knn__n_neighbors': [3, 5, 10, 15, 20, 25, 30, 35]}]

    # Scaling and PCA are pipeline steps, so the search fits them together
    # with the classifier rather than on the full data up front.
    model = Pipeline([
        ('standardize', preprocessing.StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('knn', KNeighborsClassifier()),
    ])

    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    print(grid_search.cv_results_)
    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)

    return grid_search.best_params_

In [78]:
# Tune the KNN pipeline with 5-fold cross-validation on the full training set.
knn_param_selection(X=X_all_train, y=y_all_train, nfolds=5)

{'mean_fit_time': array([3.41453819, 3.33342352, 3.33659253, 3.33388844, 3.35444303,
       3.35973697, 3.61288295, 3.30878563]), 'std_fit_time': array([0.12125389, 0.01673462, 0.02698342, 0.00772082, 0.0111728 ,
       0.01652937, 0.25376277, 0.03712977]), 'mean_score_time': array([0.20488873, 0.20382757, 0.21690397, 0.25602598, 0.27662616,
       0.27920032, 0.32909608, 0.26586776]), 'std_score_time': array([0.01994925, 0.00730554, 0.00749196, 0.01800806, 0.00380724,
       0.00835077, 0.04041354, 0.00868189]), 'param_knn__n_neighbors': masked_array(data=[3, 5, 10, 15, 20, 25, 30, 35],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'knn__n_neighbors': 3}, {'knn__n_neighbors': 5}, {'knn__n_neighbors': 10}, {'knn__n_neighbors': 15}, {'knn__n_neighbors': 20}, {'knn__n_neighbors': 25}, {'knn__n_neighbors': 30}, {'knn__n_neighbors': 35}], 'split0_test_score': array([0.98141026, 0.98397436, 0.9852564

{'knn__n_neighbors': 25}

### SVC

In [23]:
def svc_param_selection(X, y, nfolds, n_components=2500):
    """Grid-search kernel, ``C`` and ``gamma`` for a scaled, PCA-reduced SVC.

    The searched model is a pipeline of StandardScaler -> PCA -> SVC. Two
    sub-grids are explored: an RBF kernel over ``gamma`` and ``C``, and a
    linear kernel over ``C`` only.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    nfolds : value forwarded as ``cv`` to ``GridSearchCV``.
    n_components : number of components kept by the PCA step. Defaults to
        2500, the value that used to be hard-coded.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    param_grid = [
        {'svm__kernel': ['rbf'], 'svm__gamma': [1e-3, 1e-4, 1],
         'svm__C': [1, 10, 100, 1000]},
        {'svm__kernel': ['linear'], 'svm__C': [0.1, 1, 10, 100, 1000]},
    ]

    model = Pipeline([
        ('standardize', preprocessing.StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('svm', SVC()),
    ])

    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [27]:
# Tune the SVC pipeline with 5-fold cross-validation on the full training set.
svc_param_selection(X=X_all_train, y=y_all_train, nfolds=5)

{'svm__C': 10, 'svm__gamma': 0.0001, 'svm__kernel': 'rbf'}

In [58]:
def svc_linear_param_selection(X, y, nfolds, n_components=None):
    """Grid-search ``C`` for a scaled LinearSVC with an l2 penalty.

    By default the searched model is StandardScaler -> LinearSVC. Passing
    ``n_components`` inserts a PCA step between the two, so the search can be
    run on the same kind of pipeline as one that includes PCA.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    nfolds : value forwarded as ``cv`` to ``GridSearchCV``.
    n_components : optional number of PCA components. ``None`` (default)
        keeps the original behaviour of searching without a PCA step.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.

    Side effects
    ------------
    Prints the raw ``cv_results_`` dict and the best cross-validation score.
    """
    param_grid = [{'svm__penalty': ['l2'], 'svm__C': [0.001, 0.1, 1, 10, 100]}]

    steps = [('standardize', preprocessing.StandardScaler())]
    if n_components is not None:
        # Optional dimensionality reduction; step name matches the other pipelines.
        steps.append(('pca', PCA(n_components=n_components)))
    steps.append(('svm', LinearSVC()))
    model = Pipeline(steps)

    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    print(grid_search.cv_results_)
    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    return grid_search.best_params_

In [59]:
# Tune the LinearSVC pipeline with 5-fold cross-validation on the full training set.
svc_linear_param_selection(X=X_all_train, y=y_all_train, nfolds=5)

{'mean_fit_time': array([ 45.19773569,  61.4805788 ,  62.71985173, 520.12584138,
        63.05641413]), 'std_fit_time': array([1.52283062e+00, 6.13790180e-01, 1.58236168e+00, 9.16623520e+02,
       1.73121300e+00]), 'mean_score_time': array([0.10251689, 0.09179206, 0.08629546, 0.09521718, 0.08727288]), 'std_score_time': array([0.02573509, 0.0030709 , 0.00384219, 0.02124715, 0.00386357]), 'param_svm__C': masked_array(data=[0.001, 0.1, 1, 10, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_svm__penalty': masked_array(data=['l2', 'l2', 'l2', 'l2', 'l2'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'svm__C': 0.001, 'svm__penalty': 'l2'}, {'svm__C': 0.1, 'svm__penalty': 'l2'}, {'svm__C': 1, 'svm__penalty': 'l2'}, {'svm__C': 10, 'svm__penalty': 'l2'}, {'svm__C': 100, 'svm__penalty': 'l2'}], 'split0_test_score': array([0.98589744, 0.98397436, 0.9846153

{'svm__C': 0.001, 'svm__penalty': 'l2'}

### Logistic Regression

In [24]:
def log_param_selection(X, y, nfolds, n_components=1100):
    """Grid-search ``C`` and the penalty type for a scaled, PCA-reduced logistic regression.

    The searched model is a pipeline of StandardScaler -> PCA ->
    LogisticRegression, with the classifier configured as
    ``solver='saga', max_iter=3000, n_jobs=-1``.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    nfolds : value forwarded as ``cv`` to ``GridSearchCV``.
    n_components : number of components kept by the PCA step. Defaults to
        1100, the value that used to be hard-coded.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    param_grid = {'logreg__C': [0.01, 0.1, 1, 10, 100], 'logreg__penalty': ['l1', 'l2']}
    log_reg = LogisticRegression(solver='saga', max_iter=3000, n_jobs=-1)

    model = Pipeline([
        ('standardize', preprocessing.StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('logreg', log_reg),
    ])

    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)

    return grid_search.best_params_

In [25]:
# Tune the logistic-regression pipeline with 5-fold cross-validation on the full training set.
log_param_selection(X=X_all_train, y=y_all_train, nfolds=5)





{'logreg__C': 0.01, 'logreg__penalty': 'l2'}

In [None]:
# Best results: {'logreg__C': 0.01, 'logreg__penalty': 'l2'}

### LDA

In [44]:
def lda_param_selection(X, y, nfolds, n_components=280):
    """Grid-search shrinkage and solver for a scaled, PCA-reduced LDA classifier.

    The searched model is a pipeline of StandardScaler -> PCA ->
    LinearDiscriminantAnalysis; ``lda__shrinkage`` and ``lda__solver``
    ('eigen' or 'lsqr') are tuned.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    nfolds : value forwarded as ``cv`` to ``GridSearchCV``.
    n_components : number of components kept by the PCA step. Defaults to
        280, the value that used to be hard-coded.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.

    Side effects
    ------------
    Prints the raw ``cv_results_`` dict and the best cross-validation score.
    """
    param_grid = {'lda__shrinkage': [0.001, 0.01, 0.1, 0.5, 1], 'lda__solver': ['eigen', 'lsqr']}
    lda_clf = LinearDiscriminantAnalysis()

    model = Pipeline([
        ('standardize', preprocessing.StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('lda', lda_clf),
    ])

    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    print(grid_search.cv_results_)
    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)

    return grid_search.best_params_

In [45]:
# Tune the LDA pipeline with 5-fold cross-validation on the full training set.
lda_param_selection(X=X_all_train, y=y_all_train, nfolds=5)

{'mean_fit_time': array([10.17535458,  9.40955667,  9.58032455,  9.18634529,  9.60617781,
        9.65583901,  9.45207477,  9.74630136, 10.07708864,  9.81479502]), 'std_fit_time': array([0.61336399, 0.21813444, 0.37003548, 0.23466287, 0.73102366,
       0.50747548, 0.44879976, 0.72955638, 0.68363976, 0.44849303]), 'mean_score_time': array([0.26049762, 0.23443823, 0.23392305, 0.24764252, 0.23865905,
       0.24059424, 0.23868427, 0.26641874, 0.2640655 , 0.24395432]), 'std_score_time': array([0.04334986, 0.00809539, 0.01137945, 0.03278618, 0.01584328,
       0.03087698, 0.0080542 , 0.06980639, 0.0385549 , 0.03534209]), 'param_lda__shrinkage': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 0.5, 0.5, 1, 1],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_lda__solver': masked_array(data=['eigen', 'lsqr', 'eigen', 'lsqr', 'eigen', 'lsqr',
                   'eigen', '

{'lda__shrinkage': 0.01, 'lda__solver': 'eigen'}

# Predictions generation

In [5]:
def generate_predictions_file(estimator, test_df, filename):
    """Predict labels for ``test_df`` and save them as ``<filename>.csv``.

    Parameters
    ----------
    estimator : object exposing ``predict``; it receives a copy of the whole
        ``test_df`` (all columns, including 'id').
    test_df : pandas DataFrame that must contain an 'id' column. The caller's
        frame is not modified.
    filename : output path without extension; '.csv' is appended.

    The written file has two columns, 'id' and 'categories', and no index.
    """
    # Work on a copy so the caller's frame never gains a 'categories' column.
    working = test_df.copy()
    working['categories'] = pd.Series(estimator.predict(working), index=working.index)
    working.loc[:, ['id', 'categories']].to_csv(filename + '.csv', index=False)

### KNN

In [80]:
# Final KNN pipeline: standardize, keep 20 principal components, classify with 25 neighbours.
estimators = [
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=20)),
    ('knn', KNeighborsClassifier(n_neighbors=25)),
]
model_knn = Pipeline(estimators)

In [81]:
# Fit the KNN pipeline on the complete training set.
model_knn.fit(X=X_all_train, y=y_all_train)

Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=25, p=2,
           weights='uniform'))])

In [82]:
# Write the KNN predictions to 'knn_pca_20_k_25.csv'.
generate_predictions_file(estimator=model_knn, test_df=test_df, filename='knn_pca_20_k_25')

### SVC

In [61]:
# Final SVC pipeline: standardize, keep 2500 principal components, RBF-kernel SVC (C=10, gamma=1e-4).
estimators = [
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=2500)),
    ('svm', SVC(C=10, gamma=0.0001, kernel='rbf')),
]
model_svc = Pipeline(estimators)

In [62]:
# Fit the SVC pipeline on the complete training set.
# NOTE(review): the output saved under this cell lists only the 'standardize'
# and 'svm' steps, so it appears to predate the 'pca' step in the pipeline
# definition — re-run before trusting the stored result.
model_svc.fit(X=X_all_train, y=y_all_train)

Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [63]:
# Write the SVC predictions to 'svc_rbf_C_10_gamma_0001.csv'.
generate_predictions_file(estimator=model_svc, test_df=test_df, filename='svc_rbf_C_10_gamma_0001')

#### Linear SVC

In [69]:
# Best grid-search result: {'svm__C': 0.001, 'svm__penalty': 'l2'}
# NOTE(review): svc_linear_param_selection searched a StandardScaler -> LinearSVC
# pipeline with no PCA step, while this pipeline adds PCA(n_components=2500) —
# confirm the tuned C is still appropriate for the reduced features.
estimators = [
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=2500)),
    ('svm', LinearSVC(C=0.001, penalty='l2')),
]
model_linear_svc = Pipeline(estimators)

In [70]:
# Fit the LinearSVC pipeline on the complete training set.
model_linear_svc.fit(X=X_all_train, y=y_all_train)

Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2500, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svm', LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [71]:
# Write the LinearSVC predictions to 'svc_linear_C_0_001_l2_penalty_pca_2500.csv'.
generate_predictions_file(
    estimator=model_linear_svc,
    test_df=test_df,
    filename='svc_linear_C_0_001_l2_penalty_pca_2500',
)

### Logistic Regression

In [29]:
# Final logistic-regression pipeline: standardize, keep 1100 principal components,
# l2-penalised saga solver with C=0.01.
estimators = [
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=1100)),
    ('logreg', LogisticRegression(penalty='l2', C=0.01, solver='saga', max_iter=3000, n_jobs=-1)),
]
model_log = Pipeline(estimators)

In [30]:
# Fit the logistic-regression pipeline on the complete training set.
model_log.fit(X=X_all_train, y=y_all_train)



Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=1100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logreg', LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=-1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False))])

In [31]:
# Write the logistic-regression predictions to 'lod_reg_saga_pca_1100_C_0_01.csv'.
# NOTE(review): 'lod_reg' looks like a typo for 'log_reg'; the name is left
# unchanged so the generated file keeps its existing name.
generate_predictions_file(estimator=model_log, test_df=test_df, filename='lod_reg_saga_pca_1100_C_0_01')

### LDA

In [46]:
# Final LDA pipeline: standardize, keep 280 principal components,
# eigen-solver LDA with shrinkage 0.01.
estimators = [
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=280)),
    ('lda', LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.01)),
]
model_lda = Pipeline(estimators)

In [48]:
# Fit the LDA pipeline on the complete training set.
model_lda.fit(X=X_all_train, y=y_all_train)

Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=280, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('lda', LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=0.01,
              solver='eigen', store_covariance=False, tol=0.0001))])

In [50]:
# Write the LDA predictions to 'lda_eigen_pca_280_shrinkage_0_01.csv'.
generate_predictions_file(estimator=model_lda, test_df=test_df, filename='lda_eigen_pca_280_shrinkage_0_01')