In [1]:
import os
os.chdir('..')

In [2]:
import warnings
import pickle
import numpy as np
import pandas as pd
import os
import json

from sklearn.preprocessing import StandardScaler, RobustScaler

from datetime import datetime
from sklearn.model_selection import StratifiedKFold, ParameterGrid

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, \
    RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR
from sklearn.neighbors import KNeighborsClassifier


In [3]:
# The following function will be used to split the samples ID-wise. 
# This function is only called once before the main loop for training with cross-validation.
def split_train_test(labels_df):
    from sklearn.model_selection import StratifiedKFold   
    # First, get all unique samples and their category
    unique_samples = []
    unique_cats = []
    for sample, cat in zip(labels_df['sample'], labels_df['category']):
        if sample not in unique_samples:
            unique_samples += [sample]
            unique_cats += [cat]

    # StratifiedKFold with n_splits of 5 to ranmdomly split 80/20.
    # Used only once for train/test split.
    # The train split needs to be split again into train/valid sets later
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    train_inds, test_inds = next(skf.split(unique_samples, unique_cats))
    
    # After the samples are split, we get the duplicates of all samples.
    train_samples = [unique_samples[s] for s in train_inds]
    test_samples = [unique_samples[s] for s in test_inds]
    train_cats = [unique_cats[ind] for ind in train_inds]

    return train_samples, test_samples, train_cats


In [4]:
params_sgd = {
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty': ['elasticnet'],
    'l1_ratio': [0, 0.15, 0.5, 0.85, 1.0],
    'max_iter': [10000],
    'class_weight': ['balanced']
}

param_grid_rf = {
    'max_depth': [300],
    'max_features': [100],
    'min_samples_split': [300],
    'n_estimators': [100],
    'criterion': ['entropy'],
    'min_samples_leaf': [3],
    'oob_score': [False],
    'class_weight': ['balanced']
}
param_grid_rfr = {
    'max_depth': [300],
    'max_features': [100],
    'min_samples_split': [300],
    'n_estimators': [100],
    'min_samples_leaf': [3],
    'oob_score': [False],
}
param_grid_lda = {
}
param_grid_qda = {
}
param_grid_logreg = {
    # 'max_iter': [10000],
    'solver': ['saga'],
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced']
}
param_grid_linsvc = {
    'max_iter': [10000],
    'C': [1]
}
param_grid_svc = {
    'max_iter': [10000],
    'C': [1],
    'kernel': ['linear'],
    'probability': [True]
}
param_grid_ada = {
    'base_estimator': [LinearSVC(max_iter=10000)],
    'learning_rate': (1)
}
param_grid_bag = {
    'base_estimator': [
        LinearSVC(max_iter=10000)],
    'n_estimators': [10]
}

param_grid_voting = {
    'voting': ('soft', 'hard'),
}
rf = RandomForestClassifier(max_depth=300, max_features=100, min_samples_split=300, n_estimators=100)
gnb = GaussianNB()
cnb = CategoricalNB()
lr = LogisticRegression(max_iter=4000)
lsvc = LinearSVC(max_iter=10000)
estimators_list = [('rf', rf),
                   ('lr', lr),
                   ('lsvc', lsvc),
                   ('gnb', gnb),
                   ]


In [5]:
np.random.seed(42)

warnings.filterwarnings('ignore')

DIR = 'src/models/sklearn/'


In [6]:
def get_estimators_list():
    rf = RandomForestClassifier(max_depth=300, max_features=100, min_samples_split=300, n_estimators=100)
    gnb = GaussianNB()
    lr = LogisticRegression(max_iter=4000)
    lsvc = SVC(kernel='linear', probability=True)
    estimators_list = [('gnb', gnb),
                       ('lr', lr),
                       ('lsvc', lsvc)
                       ]
    return estimators_list


In [7]:
# Function to retrieve the data from a csv file.
def ms_data(fname):
    from sklearn.preprocessing import minmax_scale
    mat_data = pd.read_csv(fname)
    labels = mat_data.index.values
    categories = [int(lab.split('_')[1]) for lab in labels]
    labels = [lab.split('_')[0] for lab in labels]
    mat_data = np.asarray(mat_data)
    mat_data = minmax_scale(mat_data, axis=0, feature_range=(0, 1))
    mat_data = mat_data.astype("float32")
    return mat_data, labels, categories


In [8]:
models = {
    "LogisticRegression": [LogisticRegression, param_grid_logreg],
    "BaggingClassifier": [BaggingClassifier, param_grid_bag],
    "LinearSVC": [LinearSVC, param_grid_linsvc],
    "SVCLinear": [SVC, param_grid_svc],
    "Gaussian_Naive_Bayes": [GaussianNB, {}],
    "SGDClassifier": [SGDClassifier, params_sgd],
    "KNeighbors": [KNeighborsClassifier, {}],
    # "AdaBoost_Classifier": [AdaBoostClassifier, param_grid_ada],
    "LDA": [LinearDiscriminantAnalysis, param_grid_lda],
    "QDA": [QuadraticDiscriminantAnalysis, param_grid_qda],
    "RandomForestClassifier": [RandomForestClassifier, param_grid_rf],
    # "Voting_Classifier": [VotingClassifier, param_grid_voting],
}


In [9]:
intensities_csv = 'data/canis_intensities.csv'
verbose = 1

In [10]:
data, labels, samples = ms_data('data/canis_intensities.csv')
data[np.isnan(data)] = 0

for i, label in enumerate(labels):
    if label != 'Normal':
        labels[i] = 'Not Normal'

nb_classes = len(set(labels))


In [11]:
categories = pd.Categorical(labels).codes
labels_df = pd.concat([
    pd.DataFrame(np.array(samples).reshape([-1, 1])),
    pd.DataFrame(np.array(categories).reshape([-1, 1])),
    pd.DataFrame(np.array(labels).reshape([-1, 1])),
], 1)
labels_df.columns = ['sample', 'category', 'label']
labels_df

Unnamed: 0,sample,category,label
0,1,1,Not Normal
1,1,1,Not Normal
2,1,1,Not Normal
3,1,1,Not Normal
4,1,1,Not Normal
...,...,...,...
2223,9,1,Not Normal
2224,9,1,Not Normal
2225,9,1,Not Normal
2226,9,1,Not Normal


In [15]:
all_train_samples, test_samples, train_cats = split_train_test(labels_df)
all_train_indices = [s for s, lab in enumerate(labels_df['sample']) if lab in all_train_samples]
test_indices = [s for s, lab in enumerate(labels_df['sample']) if lab in test_samples]

x_test = data[test_indices]
y_test = labels_df['category'][test_indices]


In [19]:
x_test = data[test_indices]
y_test = labels_df['category'][test_indices].tolist()
all_x_train = data[all_train_indices]
all_y_train = labels_df['category'][all_train_indices].tolist()

estimators_list = get_estimators_list()
best_params = {}
scaler = StandardScaler()
for name, (model, param_grid) in zip(models.keys(), models.values()):
    best_scores_train = []
    best_scores_valid = []
    for g in ParameterGrid(param_grid):
        print('h_params:', g)
        best_score = np.inf
        # 3-fold CV
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
        for i, (train_samples, valid_samples) in enumerate(skf.split(all_train_samples, train_cats)):
            # Just plot the first iteration, it will already be crowded if doing > 100 optimization iterations
            if verbose:
                print(f"CV: {i}")

            # Now valid_samples and train_samples are indices of all_train_samples.
            # So we first need to get the correct samples indices
            valid_samples = [all_train_samples[s] for s in valid_samples]
            train_samples = [all_train_samples[s] for s in train_samples]
            
            
            # Next, we get all the indices for all replicates of each set
            train_indices = [s for s, lab in enumerate(labels_df['sample'].tolist()) if lab in train_samples]
            valid_indices = [s for s, lab in enumerate(labels_df['sample'].tolist()) if lab in valid_samples]

            x_train = data[train_indices]
            y_train = labels_df['category'][train_indices]
            x_valid = data[valid_indices]
            y_valid = labels_df['category'][valid_indices]
            
            scaler = StandardScaler()
            scaler.fit(x_train)
            x_train = scaler.transform(x_train)
            x_valid = scaler.transform(x_valid)

            dir_name = f"saved_models/sklearn/"
            os.makedirs(dir_name, exist_ok=True)
            pickle.dump(scaler, open(f"{dir_name}/scaler.sav", 'wb'))

            m = model()
            m.set_params(**g)
            m = m.fit(x_train, y_train)
            valid_score = m.score(x_valid, y_valid)
            score_train = m.score(x_train, y_train)
            score_valid = m.score(x_valid, y_valid)
            best_scores_train += [score_train]
            best_scores_valid += [score_valid]

        # save if best
        if np.mean(best_scores_valid) < best_score:
            best_score = valid_score
            best_grid = g
    best_params[name] = best_grid
    best_model = model()
    best_model.set_params(**best_grid)
    best_model.fit(all_x_train, all_y_train)
    score_train = best_model.score(all_x_train, all_y_train)
    score_test = best_model.score(all_x_train, all_y_train)

    print(f"Best model\n"
          f"Train score: {score_train}, "
          f"Best Valid score: {score_test}"
          )
    os.makedirs(dir_name, exist_ok=True)
    filename = f"{dir_name}/{name}.sav"
    pickle.dump(model, open(filename, 'wb'))

    # TODO find best grid according to all cv iterations
    best_model = model()
    best_model.set_params(**best_grid)
    best_model.fit(X=all_x_train, y=all_y_train)
    score_test = best_model.score(x_test, y_test)
    best_params[name]['train_acc_mean'] = np.mean(best_scores_train)
    best_params[name]['train_acc_std'] = np.std(best_scores_train)
    best_params[name]['valid_acc_mean'] = np.mean(best_scores_valid)
    best_params[name]['valid_acc_std'] = np.std(best_scores_valid)
    best_params[name]['test_acc'] = score_test

    print(
        f"Best model\n"
        f"Train score: {np.mean(best_scores_train)} +- {np.std(best_scores_train)}\n"
        f"Valid score: {np.mean(best_scores_valid)} +- {np.std(best_scores_valid)}\n"
        f"Test score: {score_test}\n"
    )
for name in best_params.keys():
    for param in best_params[name].keys():
        best_params[name][param] = str(best_params[name][param])

json.dump(best_params, open('saved_models/sklearn/best_params.json', 'w'))


h_params: {'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'saga'}
CV: 0
CV: 1
CV: 2
h_params: {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}
CV: 0
CV: 1
CV: 2
Best model
Train score: 0.8504074505238649, Best Valid score: 0.8504074505238649
Best model
Train score: 0.8482211362053901 +- 0.05130741558593411
Valid score: 0.7978201321949293 +- 0.030475901216540296
Test score: 0.703921568627451

h_params: {'base_estimator': LinearSVC(max_iter=10000), 'n_estimators': 10}
CV: 0
CV: 1
CV: 2
Best model
Train score: 0.9941792782305006, Best Valid score: 0.9941792782305006
Best model
Train score: 0.9805409673581308 +- 0.002356648600947422
Valid score: 0.8473026364935139 +- 0.04229599591201576
Test score: 0.8254901960784313

h_params: {'C': 1, 'max_iter': 10000}
CV: 0
CV: 1
CV: 2
Best model
Train score: 0.9965075669383003, Best Valid score: 0.9965075669383003
Best model
Train score: 1.0 +- 0.0
Valid score: 0.8628514325925133 +- 0.035845329366991315
Test score: 0.82941176470

In [None]:
final_models = {
    "SVCLinear": SVC(max_iter=10000, kernel='linear', probability=True),
    "RandomForestClassifier": RandomForestClassifier(max_depth=300, max_features=100, min_samples_split=300,
                                                     n_estimators=100, class_weight='balanced',
                                                     criterion='entropy'),
    "Bagging_Classifier":
        BaggingClassifier(
            base_estimator=LinearSVC(max_iter=4000), n_estimators=100),
    "LogisticRegression": LogisticRegression(max_iter=10000, penalty='l2', class_weight='balanced'),
    "LinearSVC": GridSearchCV(estimator=LinearSVC(max_iter=4000), param_grid={}, n_jobs=-1, cv=5),
    # "Voting_Classifier": VotingClassifier(estimators=estimators_list, voting='hard')
}
# final_train(final_models, args)
