In [22]:
import pandas as pd
from sklearn import feature_selection
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC

In [42]:
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [31]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv('test.csv')
X_all_train = train_df.loc[:, train_df.columns != 'categories']
y_all_train = train_df.loc[:, train_df.columns == 'categories']
y_all_train = y_all_train.categories

Accuracy on test set: 0.44


### PCA+LDA for dim reduction

In [32]:
estimators = []
estimators.append(('standardize', preprocessing.StandardScaler()))
estimators.append(('pca', PCA(n_components = 280)))
estimators.append(('lda', LinearDiscriminantAnalysis(n_components = 11, solver='eigen', shrinkage = 0.01)))
model_lda = Pipeline(estimators)

In [35]:
model_lda.fit(X_all_train, y_all_train)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=280, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('lda', LinearDiscriminantAnalysis(n_components=11, priors=None, shrinkage=0.01,
              solver='eigen', store_covariance=False, tol=0.0001))])

In [36]:
def generate_predictions_file(estimator, test_df, filename):
    test_df = test_df.copy()
    y_pred = estimator.predict(test_df)
    test_df['categories'] = pd.Series(y_pred, index=test_df.index)
    submission = test_df[['id','categories']]
    submission.to_csv(filename + '.csv', index=False)

In [37]:
generate_predictions_file(model_lda, test_df, 'lda_proj_11_eigen_pca_280_shrinkage_0_01')

  Xt = transform.transform(Xt)


### QDA

In [40]:
def qda_param_selection(X, y, nfolds):
    param_grid = {'qda__reg_param': [0.0, 0.01, 0.1, 0.5, 1]}
    qda_clf = QuadraticDiscriminantAnalysis()
    
    estimators = []
    
    estimators.append(('standardize', preprocessing.StandardScaler()))
    estimators.append(('pca', PCA(n_components = 280)) )
    estimators.append(('qda', qda_clf))
    model = Pipeline(estimators)
    
    grid_search = GridSearchCV(model, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    print(grid_search.cv_results_)
    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    
    return grid_search.best_params_

In [43]:
qda_param_selection(X_all_train, y_all_train, 5)

{'mean_fit_time': array([11.90819025, 12.64620357, 12.60361042, 12.60761228, 11.2559453 ]), 'std_fit_time': array([0.4299788 , 0.5136865 , 0.78349362, 0.23168439, 0.131557  ]), 'mean_score_time': array([0.51916585, 0.46931939, 0.48046751, 0.46335149, 0.46344347]), 'std_score_time': array([0.10122088, 0.02472834, 0.03569394, 0.02411291, 0.04318073]), 'param_qda__reg_param': masked_array(data=[0.0, 0.01, 0.1, 0.5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'qda__reg_param': 0.0}, {'qda__reg_param': 0.01}, {'qda__reg_param': 0.1}, {'qda__reg_param': 0.5}, {'qda__reg_param': 1}], 'split0_test_score': array([0.97307692, 0.9775641 , 0.98205128, 0.98269231, 0.975     ]), 'split1_test_score': array([0.96794872, 0.96987179, 0.96987179, 0.9724359 , 0.96282051]), 'split2_test_score': array([0.96602564, 0.97179487, 0.97628205, 0.97948718, 0.97115385]), 'split3_test_score': array([0.975     , 0.97692308, 0.98141026, 0.984

{'qda__reg_param': 0.5}

In [44]:
estimators = []
estimators.append(('standardize', preprocessing.StandardScaler()))
estimators.append(('pca', PCA(n_components = 280)))
estimators.append(('qda', QuadraticDiscriminantAnalysis(reg_param = 0.5)))
model_qda = Pipeline(estimators)

In [45]:
model_qda.fit(X_all_train, y_all_train)


Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=280, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('qda', QuadraticDiscriminantAnalysis(priors=None, reg_param=0.5,
               store_covariance=False, store_covariances=None, tol=0.0001))])

In [46]:
generate_predictions_file(model_qda, test_df, 'qda_pca_280_regularization_0_5')