In [None]:
# Cheat sheet: https://www.datacamp.com/community/blog/scikit-learn-cheat-sheet

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# TODO review most popular/useful models: RF, decision tree, logistic regr.

In [81]:
# TODO what are all the possible steps?

# TODO add feature selection steps
# TODO add sequential selection

# TODO plot feature importances with random forests

# TODO example with OHE -> pandas get_dummies?
# TODO what is feature extraction?


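# plot_roc_curve only supports binary classifiers (confusion_matrix also handles multi-class).
# NOTE: plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2 (replaced by RocCurveDisplay).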
from sklearn.metrics import plot_roc_curve 
from sklearn.metrics import confusion_matrix

# TODO add a binary classification example

# TODO find most useful fns in sklearn

# TODO what to do with mixed feature types

# TODO add clustering algorithm examples, meanshift, etc.
    # TODO use iris for this
# TODO check some time series algorithms
from sklearn.model_selection import TimeSeriesSplit
# MAYBE check out the column transformer (for preprocessing)

# Classification

In [82]:
from sklearn.datasets import load_digits

# NOTE: despite the variable name, this is scikit-learn's bundled digits
# dataset (load_digits), requested in its pandas-backed form.
data_mnist = load_digits(as_frame=True)
X, y = data_mnist['data'], data_mnist['target']

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Two-stage pipeline: PCA for dimensionality reduction, followed by a
# gradient-boosted classifier.
# Alternative tried earlier: LogisticRegression() as the 'clf' step, tuned
# through 'clf__C' over np.logspace(-4, 4, 4).
pipeline_steps = [('pca', PCA()), ('clf', XGBClassifier())]
pipeline = Pipeline(steps=pipeline_steps)

# Grid keys follow the '<step name>__<parameter>' convention.
param_grid = dict(
    pca__n_components=[5, 15, 30, 45, 64],  # keep the first N components with the highest variance
    clf__reg_alpha=np.logspace(start=-4, stop=4, num=4),
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Predictions on the held-out split, used by the metrics cell below.
y_pred = grid_search.predict(X_test)

{'clf__reg_alpha': 0.0001, 'pca__n_components': 30}


In [86]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# The digits task is multi-class (classes 0-9), so precision, recall and F1
# are computed per class and combined with average='macro', i.e. the
# unweighted mean over classes.
print("F1 score:", f1_score(y_test, y_pred, average='macro'))  # macro-averaged F1
print("Accuracy score:", accuracy_score(y_test, y_pred))  # fraction of correctly classified samples (no averaging involved)
print("Precision score:", precision_score(y_test, y_pred, average='macro'))  # macro-averaged precision
print("Recall score:", recall_score(y_test, y_pred, average='macro'))  # macro-averaged recall

from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 one-vs-rest confusion matrix per class. It is printed explicitly
# because a bare expression in the middle of a cell is evaluated and then
# silently discarded (only a cell's last expression is displayed).
print(multilabel_confusion_matrix(y_test, y_pred))  # TODO display these matrices more readably

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

F1 score: 0.963395421713836
Accuracy score: 0.9638888888888889
Precision score: 0.9644667442528794
Recall score: 0.9631173571943286
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        33
           1       0.96      0.96      0.96        28
           2       0.92      1.00      0.96        33
           3       0.97      0.97      0.97        34
           4       1.00      1.00      1.00        46
           5       0.92      0.96      0.94        47
           6       1.00      0.97      0.99        35
           7       1.00      1.00      1.00        34
           8       0.93      0.93      0.93        30
           9       0.97      0.93      0.95        40

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.96      0.96      0.96       360



# Regression
Regression example, Boston housing prices

In [78]:

from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn or a replacement dataset.
data_boston = load_boston()

# Wrap the raw feature matrix in a DataFrame so the columns carry their names.
X = pd.DataFrame(data=data_boston['data'], columns=data_boston.feature_names)
y = data_boston['target']

# Same 80/20 split and seed as the classification example above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import Normalizer # TODO check what this does exactly

from sklearn.preprocessing import LabelEncoder

from sklearn.decomposition import PCA
# RandomForestRegressor lives in sklearn.ensemble; importing it from
# sklearn.linear_model raises ImportError and aborts the whole cell.
from sklearn.ensemble import RandomForestRegressor

# Regression pipeline: standardise the features, project them with PCA, then
# fit a random forest. The step names are the prefixes used for
# '<step name>__<parameter>' keys in a GridSearchCV parameter grid.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('randomForestRegressor', RandomForestRegressor())
]

pipeline = Pipeline(steps)

In [73]:
# Hyperparameter optimisation
# Grid keys must use the '<step name>__<parameter>' form and match the step
# names of the regression pipeline, e.g. 'randomForestRegressor__n_estimators'.
# The grid is currently empty, so the search only cross-validates the pipeline
# with its default settings and best_params_ is expected to be empty.
# TODO add real candidate values.
parameters = {}

grid = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=5
)

grid.fit(X_train, y_train)
print(grid.best_params_)

# Predict with the regression search fitted above. `grid_search` is the digits
# classifier from the classification section and was fitted on different
# features, so it must not be used here.
y_pred = grid.predict(X_test)

In [86]:
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# TODO plot residuals in regression
# Residual = true target minus predicted target, element-wise over the test
# split. `y_pred` is whatever the preceding grid-search cell assigned, so that
# cell must have been run first.
residuals = y_test - y_pred

score = 1.0
{'logisticRegression__C': 0.1, 'logisticRegression__penalty': 'l2'}
