In [3]:
import pandas as pd
import pandas_profiling 
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

  from pandas.core import datetools


In [4]:
import warnings

from sklearn.exceptions import DataConversionWarning

# Silence scikit-learn's DataConversionWarning for the rest of the session.
# NOTE(review): presumably raised because the targets are handed to fit() as a
# one-column DataFrame instead of a 1-D array -- confirm.
warnings.filterwarnings('ignore', category=DataConversionWarning)

### General
- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline.score

### Feature Engineering
- https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
- https://scikit-learn.org/stable/modules/feature_selection.html

### Hyperparameter Tuning
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

### Model Evaluation:
- validation plot: https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py
- scoring functions: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
- cross validation: https://scikit-learn.org/stable/modules/cross_validation.html
### Logistic Regression:

- https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions
- PCA+Log: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

### Linear Models with SGD:
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier
- https://stackoverflow.com/questions/43961225/sgdclassifier-vs-logisticregression-with-sgd-solver-in-scikit-learn-library

### SVM
- https://jakevdp.github.io/PythonDataScienceHandbook/05.07-support-vector-machines.html

In [42]:
# Display the scikit-learn version the notebook is running against.
sklearn.__version__


'0.20.1'

In [8]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

3.6.3


# Data preparation
- test, train split
- use cross validation for hyperparam search and error estimation

In [15]:
def generate_predictions_file(estimator, test_df, filename):
    """Predict with ``estimator`` on ``test_df`` and save the result as a CSV.

    The work is done on a copy of ``test_df``, so the caller's frame is left
    untouched. Every column of that copy is passed to ``estimator.predict``.
    The output file is ``filename + '.csv'`` and contains the columns ``id``
    and ``categories`` (the predictions), written without the index.

    Parameters
    ----------
    estimator : object exposing ``predict(X)``
    test_df : pandas.DataFrame that contains an ``id`` column
    filename : str, output path without the ``.csv`` extension
    """
    scoring_frame = test_df.copy()
    predicted = estimator.predict(scoring_frame)
    # Re-index the predictions on the frame's own index so the rows line up.
    scoring_frame['categories'] = pd.Series(predicted, index=scoring_frame.index)
    scoring_frame[['id', 'categories']].to_csv(filename + '.csv', index=False)

In [7]:
# Read the training data and the test data from CSV files located in the
# current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv('test.csv')

In [8]:
# Number of rows in the training frame.
len(train_df)

7800

In [9]:
# Separate the target column from the remaining columns by name.
# y_all_train is selected with a boolean column mask, so it is a one-column
# DataFrame rather than a Series.
# NOTE(review): every non-target column, including any identifier column,
# stays in X_all_train as a feature -- confirm that this is intended.
label_mask = train_df.columns == 'categories'
X_all_train = train_df.loc[:, ~label_mask]
y_all_train = train_df.loc[:, label_mask]

In [51]:
# Hold out part of the labelled data for local evaluation.
# The original cell read y_train / y_test before any cell assigned them, so it
# failed on a fresh kernel, and it overwrote its own inputs, so a second run
# failed as well. Building the split here makes the cell self-contained and
# safe to re-run.
# NOTE(review): the split parameters (default test size, random_state=0) are an
# assumption -- the cell that produced the original split is not in the
# notebook; adjust if a different split was intended.
X_train, X_test, y_train_frame, y_test_frame = train_test_split(
    X_all_train, y_all_train, random_state=0
)
# Reduce the one-column target frames to plain Series.
y_train = y_train_frame.categories
y_test = y_test_frame.categories

# KNN

In [23]:
# k-nearest-neighbours classifier with all hyper-parameters left at the
# library defaults.
knn = KNeighborsClassifier()

In [24]:
# Baseline pipeline: standardize the features, then classify with KNN.
knn_pipe = Pipeline(steps=[
    ('standardize', preprocessing.StandardScaler()),
    ('knn', knn),
])
fit = knn_pipe.fit(X_all_train, y_all_train)  # train on the full training set

In [25]:
# Write the KNN pipeline's predictions for test_df to
# 'knn_5_k_default_baseline.csv'.
generate_predictions_file(knn_pipe, test_df, 'knn_5_k_default_baseline')

# Logistic Regression

### L2 regularization

In [74]:
# L2-penalised logistic regression fitted with the SAGA solver; the iteration
# cap is raised to 3000 and n_jobs=-1 requests all available cores.
logreg_l2 = LogisticRegression(
    penalty='l2',
    solver='saga',
    max_iter=3000,
    n_jobs=-1,
)

Train model on the whole data set

In [76]:
# Pipeline: standardize the features, then fit the L2 logistic regression.
# (There is no PCA step in this pipeline.)
clf_l2_log_pipe = Pipeline(steps=[
    ('standardize', preprocessing.StandardScaler()),
    ('logistic', logreg_l2),
])
fit = clf_l2_log_pipe.fit(X_all_train, y_all_train)  # train on the full training set



In [77]:
# Write the logistic-regression pipeline's predictions for test_df to
# 'std_log_l2_3000_iter_baseline.csv'.
generate_predictions_file(clf_l2_log_pipe, test_df, 'std_log_l2_3000_iter_baseline')

# LDA

https://www.researchgate.net/publication/308015273_Linear_vs_quadratic_discriminant_analysis_classifier_a_tutorial

In [1]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Linear discriminant analysis estimator.
# NOTE(review): n_components appears to control only the dimensionality of
# LDA's transform(); when LDA is used purely as a classifier it likely has no
# effect on predict() -- confirm against the scikit-learn documentation.
clf_lda = LinearDiscriminantAnalysis(n_components=2) # TODO: use eigen (presumably solver='eigen')

In [11]:
# Pipeline: standardize -> PCA down to 280 components -> LDA classifier.
clf_lda_pca_pipe = Pipeline(steps=[
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=280)),
    ('lda', clf_lda),
])

In [13]:
# Train the standardize -> PCA -> LDA pipeline on the full training set.
fit = clf_lda_pca_pipe.fit(X_all_train, y_all_train) # fit() returns the fitted pipeline itself

In [17]:
# Write the PCA + LDA pipeline's predictions for test_df to
# 'lda_2_c_pca_280_c.csv'.
generate_predictions_file(clf_lda_pca_pipe, test_df, 'lda_2_c_pca_280_c')

Official Test: 0.32542

# SVM
Note on SGDClassifier:
- For each of the K classes, a binary classifier is learned that discriminates between that class and all other K-1 classes. At testing time, we compute the confidence score (i.e. the signed distances to the hyperplane) for each classifier and choose the class with the highest confidence.

In [18]:
from sklearn.svm import SVC

In [19]:
# Support vector classifier with a linear kernel; all other hyper-parameters
# are left at the library defaults.
clf_svc = SVC(kernel='linear')

In [20]:
# Pipeline: standardize the features, then fit the linear-kernel SVC.
# (There is no PCA or logistic-regression step in this pipeline.)
clf_svm_pipe = Pipeline(steps=[
    ('standardize', preprocessing.StandardScaler()),
    ('svc', clf_svc),
])
fit = clf_svm_pipe.fit(X_all_train, y_all_train)  # train on the full training set

In [21]:
# Write the linear-SVC pipeline's predictions for test_df to
# 'svm_linear_kernel_baseline_C_1.csv'.
generate_predictions_file(clf_svm_pipe, test_df, 'svm_linear_kernel_baseline_C_1')

In [91]:
# Pipeline: standardize -> PCA down to 2500 components -> linear-kernel SVC.
# A Pipeline stores the estimator objects it is given without copying them, so
# passing `clf_svc` directly would refit the very instance that `clf_svm_pipe`
# already holds, leaving that earlier pipeline with an SVC trained on PCA
# features. Cloning yields an unfitted estimator with the same hyper-parameters.
clf_svm_pca_pipe = Pipeline(steps=[
    ('standardize', preprocessing.StandardScaler()),
    ('pca', PCA(n_components=2500)),
    ('svc', sklearn.base.clone(clf_svc)),
])
fit = clf_svm_pca_pipe.fit(X_all_train, y_all_train)  # train on the full training set

In [92]:
# Write the PCA + linear-SVC pipeline's predictions for test_df to
# 'svm_linear_pca_2500.csv'.
generate_predictions_file(clf_svm_pca_pipe, test_df, 'svm_linear_pca_2500')

Stochastic Gradient Descent is sensitive to feature scaling, so it is highly recommended to scale your data. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1. Note that the same scaling must be applied to the test vector to obtain meaningful results. 