# <center>Week 10</center>

In [10]:
import pandas as pd
import numpy as np
# Preprocessing
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.compose import make_column_selector
from sklearn import cluster
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.experimental import enable_iterative_imputer  # noqa

In [11]:
# Read train.csv into `df` and test.csv into `test` (both expected in the working directory).
df, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [12]:
# Sanity check: (rows, columns) of the training frame, including the 'label' column.
df.shape

(42000, 785)

In [13]:
# Hold out 33% of the labelled rows for evaluation. The features are every column
# except 'label'; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='label'),
    df['label'],
    test_size=0.33,
    random_state=42,
)


## Data Preprocessing

To speed up the analysis I standardise the columns and try to reduce the number of columns. This drops some information but gains speed. I can do this with `FeatureAgglomeration`, a transformer that reduces the dimensionality of the data set by merging features that are very similar.

In [14]:
# Shared preprocessing step: standardise every numeric column and pass any
# remaining (non-numeric) columns through unchanged.
numeric_columns = make_column_selector(dtype_include=np.number)
col_trans = make_column_transformer(
    (StandardScaler(), numeric_columns),
    remainder='passthrough',
)

# Models


## 1. Random Forest

First, I instantiate the random forest classifier and create a pipeline with it and the preprocessing needed (scaling and feature agglomeration).
As it is not clear how much agglomeration is efficient, the number of clusters can be passed to the grid search to find the best value; in the cells below it is fixed at 32 and the corresponding grid entry is commented out.

In [15]:
# Random forest wrapped in a pipeline: standardise -> merge similar features into
# 32 clusters -> classify. The step names ('preprocess', 'feat_agg', 'model') are
# the prefixes used to address hyper-parameters in a grid search.
# random_state is pinned so repeated runs build the same forest; without it the
# bootstrap samples and per-split feature subsets change on every execution,
# even though the train/test split itself is seeded.
clf_rf = RandomForestClassifier(random_state=42)

clf_rf_pipeline = Pipeline(steps=[
    ('preprocess', col_trans),
    ('feat_agg', cluster.FeatureAgglomeration(n_clusters=32)),
    ('model', clf_rf),
])

In [16]:
# Hyper-parameter grid for the random forest step:
# 5 tree depths (6..10) x 5 forest sizes (800..1200) = 25 candidates.
params_rf = {
    'model__max_depth': list(range(6, 11)),
    'model__max_features': ['sqrt'],
    'model__n_estimators': list(range(800, 1300, 100)),
    # Optionally also tune the amount of feature agglomeration:
    # 'feat_agg__n_clusters': list(range(5, 50, 10)),
}

# 25 candidates x 5 folds = 125 fits of 800+ trees each. Run serially this takes
# a very long time, so the fits are spread over all available cores (n_jobs=-1).
clf_rf_gs = GridSearchCV(clf_rf_pipeline, param_grid=params_rf, cv=5,
                         scoring='accuracy', verbose=10, n_jobs=-1)

In [17]:
# Run the cross-validated grid search on the training split.
# NOTE: long-running cell; the saved output of this cell is a KeyboardInterrupt,
# so the cells below have no recorded results.
clf_rf_gs.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
clf_rf_gs.best_params_

In [None]:
# Predict the held-out split with the fitted grid search (requires the fit cell to have completed).
rf_preds = clf_rf_gs.predict(X_test)

In [None]:
# Share of held-out rows classified correctly by the random forest, rounded to 4 decimals.
rf_accuracy = round(accuracy_score(y_test, rf_preds), 4)
print(f'accuracy score: {rf_accuracy}')

## 2. XGBoost

In [None]:
# NOTE: the XGBoost experiment is disabled. This cell and the two cells after it
# (fit and predict) are commented out; uncomment all three to run it.
#xgb_clf = xgb.XGBClassifier()
#
#params_xgb = {'eta': list(np.linspace(0.01, 0.2, 2)),
#              'max_depth': [i+1 for i in range(2, 8)], 
#             # 'lambda': list(np.linspace(0.01, 1, 5))
#              }
#
#
#xgb_gs = GridSearchCV(xgb_clf, cv = 5, param_grid = params_xgb,
#                      verbose = 10, scoring = 'accuracy')

In [None]:
# Disabled: depends on xgb_gs, which is only defined in the commented-out cell above.
#xgb_gs.fit(X_train, y_train)

In [None]:
# Disabled: needs a fitted xgb_gs. NOTE(review): the name `xgs_preds` looks like a
# typo for `xgb_preds` (compare rf_preds / log_preds / nn_preds) - confirm before re-enabling.
#xgs_preds = xgb_gs.predict(X_test)

## 3. Multinomial Logistic Regression

In [None]:
# L1-penalised logistic regression fitted with the SAGA solver.
# SAGA shuffles the data while optimising, so random_state is pinned to make the
# fitted coefficients (and therefore the reported accuracy) repeatable.
clf_log = LogisticRegression(penalty='l1', solver='saga',
                             max_iter=1000, random_state=42)

# Pipeline: standardise -> feature agglomeration -> classifier. n_clusters is left
# at its default here so it can be set through 'feat_agg__n_clusters' in a grid search.
clf_log_pipeline = Pipeline(steps=[
    ('preprocess', col_trans),
    ('feat_agg', cluster.FeatureAgglomeration()),
    ('model', clf_log),
])


In [None]:
# Grid for the logistic-regression pipeline:
# 10 regularisation strengths (C from 0.01 to 1) x 8 cluster counts (3, 8, ..., 38)
# = 80 candidates.
params_log = {
    'model__C': list(np.linspace(0.01, 1, 10)),
    'feat_agg__n_clusters': list(range(3, 40, 5)),
}

# 80 candidates x 5 folds = 400 fits, so the fits are spread over all available
# cores (n_jobs=-1). error_score='raise' makes a failing fit stop the search
# instead of being scored as NaN.
clf_log_gs = GridSearchCV(clf_log_pipeline, param_grid=params_log, cv=5,
                          scoring='accuracy', verbose=10,
                          error_score='raise', n_jobs=-1)

In [None]:
# Fit the grid search on the training split, then predict the held-out split
# with the fitted search object (fit returns the search itself).
log_preds = clf_log_gs.fit(X_train, y_train).predict(X_test)

In [None]:
# Share of held-out rows classified correctly by the logistic-regression search.
accuracy_score(y_test, log_preds)

## 4. Neural Networks

In [None]:
# Multi-layer perceptron on the standardised features. Unlike the other models,
# this pipeline has no feature-agglomeration step and is fitted directly,
# without a hyper-parameter search.
clf_nn = MLPClassifier(max_iter=300, random_state=1)

nn_steps = [
    ('preprocess', col_trans),
    ('model', clf_nn),
]
nn_pipeline = Pipeline(steps=nn_steps)

nn_pipeline.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the fitted network pipeline and display its accuracy.
nn_preds = nn_pipeline.predict(X_test)
accuracy_score(y_test, nn_preds)