In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.svm import SVC

from helpers import PipelineHelper

# Load the sample table; the first CSV column is used as the row index.
data = pd.read_csv(
    "../data/sdd_IOW.csv",
    index_col=0,
)

In [4]:
# List every column label so positional slices used later can be checked against names.
data.columns

Index(['Frequency', 'FrequencyA500', 'FrequencyB500', 'MPmass', 'Mass', 'LON',
       'LAT', 'Split', 'MP_D50', 'Concentration', 'ConcentrationA500',
       'ConcentrationB500', 'ConcentrationA500_div_B500', 'MassConcentration',
       'Depth', 'Dist_Marina', 'Dist_WWTP', 'Dist_WWTP2', 'SAMPLE TYPE ',
       'TEXTURAL GROUP ', 'SEDIMENT NAME ', 'MoM_ari_MEAN', 'MoM_ari_SORTING',
       'MoM_ari_SKEWNESS', 'MoM_ari_KURTOSIS', 'MoM_geo_MEAN',
       'MoM_geo_SORTING', 'MoM_geo_SKEWNESS', 'MoM_geo_KURTOSIS',
       'MoM_log_MEAN', 'MoM_log_SORTING', 'MoM_log_SKEWNESS',
       'MoM_log_KURTOSIS', 'FW_geo_MEAN', 'FW_geo_SORTING', 'FW_geo_SKEWNESS',
       'FW_geo_KURTOSIS', 'FW_log_MEAN', 'FW_log_SORTING', 'FW_log_SKEWNESS',
       'FW_log_KURTOSIS', 'FW_des_MEAN', 'FW_des_SORTING', 'FW_des_SKEWNESS',
       'FW_des_KURTOSIS', 'MODE 1 (µm)', 'MODE 2 (µm)', 'MODE 3 (µm)',
       'MODE 1 (f)', 'MODE 2 (f)', 'MODE 3 (f)', 'D10 (µm)', 'D50 (µm)',
       'D90 (µm)', '(D90 div D10) (µm)', '(D90 -

In [5]:
# Feature matrix: select the column with a list so the result stays
# two-dimensional (n_samples, 1). scikit-learn scalers and estimators reject
# a 1-D Series passed as X ("Expected 2D array, got 1D array instead").
X = data[["Concentration"]]

# Targets: every column from positional index 14 ('Depth') to the end.
# NOTE(review): this slice contains continuous and free-text columns, which a
# classifier scored with 'accuracy' is unlikely to accept, and the roles of
# X and y look inverted — confirm the intended prediction target.
y = data.iloc[:, 14:]

In [7]:
# Preview the first five rows of the target frame (5 is the pandas default).
y.head(n=5)

Unnamed: 0_level_0,Depth,Dist_Marina,Dist_WWTP,Dist_WWTP2,SAMPLE TYPE,TEXTURAL GROUP,SEDIMENT NAME,MoM_ari_MEAN,MoM_ari_SORTING,MoM_ari_SKEWNESS,...,perc COARSE SILT,perc MEDIUM SILT,perc FINE SILT,perc V FINE SILT,perc CLAY,OM_D50,TOC,Hg,TIC,regio_sep
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Schlei_S10,0.8,824,27.5,27.5,"Unimodal, Poorly Sorted",Slightly Gravelly Sand,Slightly Very Fine Gravelly Fine Sand,277.8,219.9,2.603,...,1.0,0.8,0.8,0.5,0.0,73.5,0.3995,5.0,0.21,WWTP
Schlei_S10_15cm,0.8,824,27.5,27.5,"Unimodal, Poorly Sorted",Slightly Gravelly Muddy Sand,Slightly Very Fine Gravelly Very Coarse Silty ...,269.0,223.7,2.104,...,2.0,2.0,2.0,1.3,0.3,27.1,0.1685,16.2791,0.225,WWTP
Schlei_S11,2.9,379,750.0,750.0,"Bimodal, Very Poorly Sorted",Sandy Mud,Very Fine Sandy Very Coarse Silt,53.97,91.8,4.056,...,14.1,12.2,12.3,8.0,5.8,45.4,10.8215,221.3,0.645,WWTP
Schlei_S13,3.0,1320,1605.0,1605.0,"Bimodal, Very Poorly Sorted",Sandy Mud,Very Fine Sandy Very Coarse Silt,59.68,87.61,5.527,...,9.8,9.7,9.6,7.0,6.5,73.9,6.9315,172.9,0.465,inner
Schlei_S14,2.3,950,2933.0,2933.0,"Bimodal, Very Poorly Sorted",Muddy Sand,Very Coarse Silty Medium Sand,213.1,196.7,1.143,...,3.6,4.4,5.0,3.5,4.2,99.5,1.0503,31.2,0.39,inner


In [None]:

# Two-step pipeline. Each step is wrapped in a PipelineHelper so the concrete
# scaler / classifier is picked by the grid search through the
# '<step>__selected_model' parameter.
pipe = Pipeline(
    steps=[
        (
            "scaler",
            # NOTE(review): optional=True presumably lets the search also try
            # running without any scaler — confirm against helpers.PipelineHelper.
            PipelineHelper(
                [
                    ("std", StandardScaler()),
                    ("max", MaxAbsScaler()),
                ],
                optional=True,
            ),
        ),
        (
            "classifier",
            PipelineHelper(
                [
                    # Candidates that are currently switched off:
                    # ("svm", SVC()),
                    # ("rf", RandomForestClassifier()),
                    # ("ada", AdaBoostClassifier()),
                    # ("gb", GradientBoostingClassifier()),
                    ("knn", KNeighborsClassifier()),
                    # ("nb_pipe", Pipeline([
                    #     # Naive Bayes needs positive numbers
                    #     ("scaler", MinMaxScaler()),
                    #     ("nb", MultinomialNB()),
                    # ])),
                ]
            ),
        ),
    ]
)

# Search space. Keys inside each generate() call are prefixed with the name of
# the wrapped model they belong to ('std__', 'knn__', ...).
params = {
    "scaler__selected_model": pipe.named_steps["scaler"].generate(
        {
            "std__with_mean": [True, False],
            "std__with_std": [True, False],
            # 'max' gets no entries here, so it is used with its default parameters
        }
    ),
    "classifier__selected_model": pipe.named_steps["classifier"].generate(
        {
            # Grids for the switched-off candidates:
            # "svm__C": [0.1, 1.0],
            # "svm__kernel": ["linear", "rbf"],
            # "rf__n_estimators": [10, 20, 50, 100, 150],
            # "rf__max_features": ["sqrt", "log2"],
            # "rf__min_samples_split": [2, 5, 10],
            # "rf__min_samples_leaf": [1, 2, 4],
            # "rf__bootstrap": [True, False],
            # "ada__n_estimators": [10, 20, 40, 100],
            # "ada__algorithm": ["SAMME", "SAMME.R"],
            # "gb__n_estimators": [10, 20, 50, 100],
            # "gb__criterion": ["friedman_mse", "squared_error"],
            # "gb__max_features": ["sqrt", None],
            "knn__n_neighbors": [2, 3, 5, 7, 10],
            "knn__leaf_size": [1, 2, 3, 5],
            "knn__weights": ["uniform", "distance"],
            "knn__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
            # "nb_pipe__nb__fit_prior": [True, False],
            # "nb_pipe__nb__alpha": [0.1, 0.2],
        }
    ),
}


In [None]:
# Exhaustive cross-validated search over `params`, scored by accuracy and
# run on all available cores (n_jobs=-1) with progress messages (verbose=1).
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
grid.fit(X, y)


In [None]:
# Report the winning parameter combination and its mean cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")