# <center>Week 10

## Loading Packages

In [1]:
import pandas as pd
import numpy as np
# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn import cluster
# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
X_train , X_test,y_train, y_test = train_test_split(df.drop(['label'], axis = 1), 
                                                    df['label'],
                                                     test_size=0.33, random_state=42)


## Data Preprocessing

To increase speed and accuracy of the algorithm I scale the columns using StandardScaler on all the columns. What this function does is centering all the variables around the mean and divides them by the sd.

In [5]:
col_trans = make_column_transformer(
                                    (StandardScaler(),make_column_selector(dtype_include=np.number)),
                                  remainder = 'passthrough')

# Models


## 1. Random Forest

First, I call the random forest classifier, and create a pipeline so that it does all  preprocessing needed: scaling and feature aggregation, and then it run the random forest. To determine the number of clusters to use, i.e., the number of columns that I want the FeatureAggregation algorithm to return, I discussed with Ben that it is reasonable to use 196, which is like aggregating 4 pixels into 1. This can increase the effiency of the algorithms with the negative consequence of reducing the amount of information we have.
To find the optimal number, it would be better include this parameter in the GridSearch, but it takes too long.
I only added this parameter in the KNN gridsearch and found that 196 was the best among the options provided.
I use gridsearch to find the max depth, the max number of features at each node, and the number of estimators.

In [8]:
clf_rf = RandomForestClassifier() 

clf_rf_pipeline = Pipeline(steps = [
    ('preprocess', col_trans),
    ('feat_agg', cluster.FeatureAgglomeration(n_clusters=196)),
    ('model', clf_rf)]
)

In [9]:
params_rf = {'model__max_depth' : [i + 1 for i in range(9, 12)],
             'model__max_features': ['sqrt'],
             'model__n_estimators': [i  for i in range(800, 1300, 100)]},

clf_rf_gs = GridSearchCV(clf_rf_pipeline, cv = 5, param_grid=params_rf, 
                         scoring = 'accuracy', verbose = 10, n_jobs=-1)

In [10]:
clf_rf_gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x1371507c0>)])),
                                       ('feat_agg',
                                        FeatureAgglomeration(n_clusters=196)),
                                       ('model', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=({'model__max_depth': [10, 11, 12],
                          'model__max_features': ['sqrt'],
                          'model__n_estimators': [800, 900, 1000, 1100,
                                                  1200]},),
             scorin

In [11]:
clf_rf_gs.best_params_

{'model__max_depth': 12,
 'model__max_features': 'sqrt',
 'model__n_estimators': 1200}

In [12]:
rf_preds = clf_rf_gs.predict(X_test)

In [13]:
print(f'accuracy score: {round(accuracy_score(y_test, rf_preds), 4)}')

accuracy score: 0.9506


## 2. Multinomial Logistic Regression

An alternative to the tree is a multinomial logistic regression. I use the l1 penalty term to reduce overfitting and I tune it using gridsearch.
I have to change the solver to deal with multiple values in the `y_test` vector.

In [18]:
clf_log = LogisticRegression(penalty = 'l1', solver = 'saga',
                             max_iter= 1000)
clf_log_pipeline = Pipeline(steps = [
    ('preprocess', col_trans),
    ('feat_agg', cluster.FeatureAgglomeration(n_clusters=196)),
    ('model', clf_log)])


In [19]:
params_log = {'model__C' : list(np.linspace(0.01, 1, 10))}

clf_log_gs = GridSearchCV(clf_log_pipeline, cv = 5, 
                          param_grid= params_log, 
                         scoring = 'accuracy', verbose = 10, error_score='raise',
                         n_jobs=-1)

In [None]:
clf_log_gs.fit(X_train, y_train)
log_preds = clf_log_gs.predict(X_test)

In [24]:
clf_log_gs.best_params_

{'model__C': 0.45}

In [52]:
print(f' The accuracy of the logistic regression was: {round(accuracy_score(y_test, log_preds), 4)}')

 The accuracy of the logistic regression was: 0.9183


## 3. Neural Networks

I run two NN, one without any tunning, and one in which I tune the learning rate and the number of layers in each hidden layer. 
This was interesting because the NN without any tunning overperformed the Random Forest and the Logistic Regressions and it was much faster than both.

In [22]:
clf_nn = MLPClassifier(random_state=1, max_iter=300)

nn_pipeline = Pipeline(steps=[
                              ('preprocess', col_trans),
    ('feat_agg', cluster.FeatureAgglomeration(n_clusters=196)),
                              ('model', clf_nn)
])


nn_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1371507c0>)])),
                ('feat_agg', FeatureAgglomeration(n_clusters=196)),
                ('model', MLPClassifier(max_iter=300, random_state=1))])

In [54]:
nn_preds = nn_pipeline.predict(X_test)
print(f"The neural network's accuracy without any tuning is: {round(accuracy_score(y_test, nn_preds), 4)}")

The neural network's accuracy without any tuning is: 0.9674


The NN took only 27 seconds to run and achieve better accuracy than the logistic and the random forest. Both of which required tuning and more than 20 minutes to fit.

### Tuning Neural Networks

In [None]:
params_nn = {'model__alpha':list(np.linspace(0.001, .01, 8)),
'model__hidden_layer_sizes': [(300, 300, ), (400, 400, )]
}

nn_gs = GridSearchCV(nn_pipeline, param_grid = params_nn, cv = 5, n_jobs = -1,
scoring = "accuracy", verbose = 10)

nn_gs.fit(X_train, y_train)
y_nngs = nn_gs.predict(X_test)

In [37]:
nn_gs.best_params_

{'model__alpha': 0.007428571428571429, 'model__hidden_layer_sizes': (400, 400)}

In [58]:
print(f'The accuracy of the NN after tuning the number of neurons and the learning parameter alpha is: {round(accuracy_score(y_test, y_nngs), 4)}')

The accuracy of the NN after tuning the number of neurons and the learning parameter alpha is: 0.9753


## 4. KNN  

I tune the number of neighbors and the nubmer of clusters that the Feature Aggregation should create. It turns out that the initial guess of 196 was the best.
However the algorithm took around 30 mins to fit, and it really slowed down my computer so I couldn't work on my Game Theory paper while this was fitting.

In [6]:
knn_clf = KNeighborsClassifier()

knn_pipeline = Pipeline(steps=[
                              ('preprocess', col_trans),
    ('feat_agg', cluster.FeatureAgglomeration(n_clusters=196)),
                              ('model', knn_clf)
])

In [7]:
knn_params = {'model__n_neighbors':[i for i in range(3, 10)], 'feat_agg__n_clusters': [98, 196, 392]}

knn_gs = GridSearchCV(knn_pipeline, param_grid= knn_params, n_jobs=-1, scoring='accuracy',
cv = 5, verbose = 10)

In [None]:
knn_gs.fit(X_train, y_train)

In [9]:
knn_gs.best_params_

{'feat_agg__n_clusters': 196, 'model__n_neighbors': 5}

In [10]:
knn_preds = knn_gs.predict(X_test)
accuracy_score(y_test, knn_preds)

0.9424963924963925

## Prepare Output

The best model was the NN with the tuned parameters.

In [49]:
nn_preds = nn_gs.predict(test)

In [50]:
sub_nn = pd.DataFrame({"ImageId":[i + 1 for i in range(len(nn_preds))], "Label": nn_preds})

sub_nn.to_csv('submission_nn.csv', index = False)