<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/dphi/ds92/notebooks/01_linear_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install -U optuna

In [2]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
optuna.logging.set_verbosity(optuna.logging.ERROR)

optuna.__version__

'3.0.2'

In [3]:
import gc
import os
import time
import warnings

# Automatic garbage collection is on by default in CPython; this only matters
# if it was disabled earlier in the session.
gc.enable()
# NOTE(review): this silences every warning category for the whole notebook,
# which presumably includes solver convergence warnings from the iterative
# linear models fitted below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.pipeline import make_pipeline
from sklearn import linear_model as lm

In [4]:
# Single seed reused for CV splits, samplers and estimators.
SEED = 130322
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning it
# here does not change hash randomization of the already-running kernel; it is
# only inherited by processes spawned afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seeds NumPy's legacy global random generator.
np.random.seed(SEED)

In [5]:
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/dphi/ds92/data'

# Read the raw train and test splits (in that order) straight from the repo.
train, test = (
    pd.read_csv(f'{DATA_URL}/raw/{split}_dataset.csv')
    for split in ('train', 'test')
)

In [6]:
TARGET = 'class'
# Fit the encoder on the original labels, then overwrite the column with the
# integer codes. `labels` is kept so that predictions can be mapped back with
# `inverse_transform` when submission files are written.
# NOTE: re-running this cell on the already-encoded column would re-fit the
# encoder on the integer codes and lose the original label names.
labels = LabelEncoder().fit(train[TARGET])
train[TARGET] = labels.transform(train[TARGET])

In [7]:
# Class balance of the encoded target, shown as fractions of all training rows.
train[TARGET].value_counts(normalize=True)

0    0.6444
1    0.3556
Name: class, dtype: float64

Feature sets from EDA notebook:

In [8]:
# Every column of the test set is a candidate feature.
features = test.columns.tolist()

# Reduced based on correlation and ANOVA F-test
corr_features = [
    col for col in features
    if col not in ('fConc', 'fConc1', 'fM3Trans', 'fDist')
]

# Reduced based on mutual information score
mi_features = [
    col for col in features
    if col not in ('fConc1', 'fDist')
]

In [9]:
# Short name -> list of feature columns; the short name becomes part of each
# model configuration's name.
feature_sets = dict(all=features, corr=corr_features, mi=mi_features)

# Modeling

In [10]:
def get_best_threshold(y_true, pred_probs, step=0.01):
    """Pick the probability cut-off that maximizes accuracy on the given labels.

    Candidate thresholds are ``0, step, 2 * step, ...`` up to (but excluding) 1.
    A sample is predicted as class 1 when its probability is >= the threshold.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels encoded as 0/1.
    pred_probs : array-like of shape (n_samples,)
        Predicted probability (or score) of class 1 for each sample.
    step : float, default=0.01
        Spacing of the candidate threshold grid.

    Returns
    -------
    float
        The candidate threshold with the highest accuracy. When several
        candidates tie, the smallest one is returned.

    Raises
    ------
    ValueError
        If ``y_true`` and ``pred_probs`` do not have the same number of samples.
    """
    # Coerce once up front so Series/lists are compared purely by position.
    y_true = np.asarray(y_true).ravel()
    pred_probs = np.asarray(pred_probs).ravel()
    if y_true.shape != pred_probs.shape:
        raise ValueError(
            f'y_true and pred_probs must have the same length, '
            f'got {y_true.shape[0]} and {pred_probs.shape[0]}'
        )

    candidate_thresholds = np.arange(0, 1, step)
    # Accuracy is the share of matching labels; computing it directly avoids
    # re-validating the same inputs once per candidate threshold.
    candidate_scores = [
        np.mean((pred_probs >= t).astype('int') == y_true)
        for t in candidate_thresholds
    ]
    # argmax returns the first maximum, i.e. the smallest tied threshold.
    return candidate_thresholds[np.argmax(candidate_scores)]

In [11]:
def cross_validate(model, feature_set, thresholding, n_splits=5):
    """Return the mean stratified K-fold validation accuracy of `model`.

    The model is re-fitted on every training fold of the global `train` frame
    and scored on the matching validation fold.

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `predict` (and `predict_proba` when
        `thresholding` is True).
    feature_set : list of str
        Columns of `train` used as model inputs.
    thresholding : bool
        If True, class-1 probabilities are binarized with the cut-off returned
        by `get_best_threshold`; otherwise `model.predict` is used.
    n_splits : int, default=5
        Number of stratified folds.

    Returns
    -------
    float
        Mean accuracy over the validation folds.
    """
    scores = []
    X, y = train[feature_set], train[TARGET]

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # split() yields positional indices, so both X and y are indexed by
        # position; label-based .loc only agrees on a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        if thresholding:
            val_probs = model.predict_proba(X_val)[:, 1]
            # NOTE: the threshold is chosen and scored on the same validation
            # fold, so the resulting accuracy is an optimistic estimate.
            best_threshold = get_best_threshold(y_val, val_probs)
            val_preds = (val_probs >= best_threshold).astype('int')
        else:
            val_preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_preds))
    return np.mean(scores)

In [12]:
def tune_params(objective, preprocessor, feature_set, n_trials, direction, thresholding):
    """Run an Optuna study for `objective` and return the finished study.

    `objective` is called as
    `objective(trial, preprocessor, feature_set, thresholding)` for each of the
    `n_trials` trials; `direction` tells Optuna whether the returned value is
    minimized or maximized. Sampling uses a TPE sampler seeded with `SEED`.
    """
    def _bound_objective(trial):
        # Fix every argument except the trial that Optuna supplies.
        return objective(trial, preprocessor, feature_set, thresholding)

    # NOTE: a pruner only acts on trials that report intermediate values; the
    # objectives defined in this notebook return a single final score.
    study = optuna.create_study(
        sampler=TPESampler(seed=SEED),
        pruner=HyperbandPruner(),
        direction=direction,
    )
    study.optimize(_bound_objective, n_trials=n_trials, gc_after_trial=True)
    return study

In [13]:
def cross_validate_predict(model, feature_set, thresholding, n_splits=5):
    """Cross-validate `model` and collect out-of-fold and test predictions.

    For every stratified fold the model is re-fitted on the training part of
    the global `train` frame, then used to predict both the validation part
    and the whole global `test` frame. Per-fold and average validation
    accuracy are printed.

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `predict` (and `predict_proba` when
        `thresholding` is True).
    feature_set : list of str
        Columns of `train`/`test` used as model inputs.
    thresholding : bool
        If True, class-1 probabilities are binarized with the cut-off returned
        by `get_best_threshold` for that fold; otherwise `model.predict` is used.
    n_splits : int, default=5
        Number of stratified folds.

    Returns
    -------
    oof_preds : pandas.Series
        Out-of-fold predictions keyed by the positional row number in `train`.
    test_preds : pandas.Series
        Row-wise majority vote (mode) of the per-fold test predictions.
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X, y = train[feature_set], train[TARGET]
    X_test = test[feature_set]

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # split() yields positional indices, so both X and y are indexed by
        # position; label-based .loc only agrees on a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)

        if thresholding:
            val_probs = model.predict_proba(X_val)[:, 1]
            test_probs = model.predict_proba(X_test)[:, 1]
            # The cut-off tuned on this fold's validation part is reused for
            # this fold's test predictions.
            best_threshold = get_best_threshold(y_val, val_probs)
            val_preds = (val_probs >= best_threshold).astype('int')
            test_preds_fold = (test_probs >= best_threshold).astype('int')
        else:
            val_preds = model.predict(X_val)
            test_preds_fold = model.predict(X_test)

        oof_preds.update(dict(zip(val_idx, val_preds)))
        test_preds[f'fold{fold}'] = test_preds_fold

        score = accuracy_score(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: {score:.4f}', end=' | ')
        _ = gc.collect()
    print(f'Avg = {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    # With an even number of folds a row can have tied votes; mode() then pads
    # with NaN and returns floats. Column 0 holds the smallest mode; cast it
    # back to int so it stays usable as an encoded class label.
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')

    return oof_preds, test_preds['mode']

In [14]:
def run_experiment(objective, preprocessor, classifier, feature_set, n_trials=5, thresholding=False):
    """Tune one model configuration, then cross-validate it with the best settings.

    Step 1 runs `tune_params` with `objective` (maximizing accuracy) and prints
    the best trial, its value and its hyperparameters. Step 2 applies those
    hyperparameters to `classifier` via `set_params`, wraps it in a pipeline
    with `preprocessor`, and calls `cross_validate_predict`.

    Returns
    -------
    tuple
        `(oof_preds, test_preds)` exactly as returned by `cross_validate_predict`.
    """
    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        objective=objective,
        preprocessor=preprocessor,
        feature_set=feature_set,
        n_trials=n_trials,
        direction='maximize', #metric is accuracy -> higher is better
        thresholding=thresholding
    )
    tuning_seconds = time.time() - tuning_started
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.4f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:15} - {value}')
    print(f'[Time taken: {tuning_seconds:.2f}s]')

    print('-----Cross-validation and prediction-----')
    cv_started = time.time()
    # The passed-in classifier instance is configured in place.
    classifier.set_params(**study.best_params)
    tuned_model = make_pipeline(preprocessor, classifier)
    predictions = cross_validate_predict(tuned_model, feature_set, thresholding)
    cv_seconds = time.time() - cv_started
    print(f'[Time taken: {cv_seconds:.2f}s]\n')

    oof_preds, test_preds = predictions
    return oof_preds, test_preds

In [15]:
# Empty containers that get one column per model configuration:
# out-of-fold predictions on the train set and cross-validated test predictions.
oof_preds, test_preds = pd.DataFrame(), pd.DataFrame()

**Preprocessors**

In [16]:
# Short name -> feature transformer; the short name becomes part of each
# model configuration's name.
preprocessors = dict(
    ss=StandardScaler(),
    qt=QuantileTransformer(
        n_quantiles=100,
        output_distribution='normal',
        random_state=SEED,
    ),
)

In [17]:
# Number of Optuna trials run for every model configuration below.
N_TRIALS = 50

### RidgeClassifier

In [18]:
def ridge_objective(trial, preprocessor, feature_set, thresholding):
    """Optuna objective: cross-validated accuracy of a RidgeClassifier pipeline.

    Fixed settings are registered as single-choice categoricals so that they
    are part of the trial's parameters alongside the tuned ones (`alpha`,
    `class_weight`). The suggest calls are kept in their original order.
    """
    params = {}
    params['max_iter'] = trial.suggest_categorical('max_iter', [5000])
    params['solver'] = trial.suggest_categorical('solver', ['svd'])
    params['alpha'] = trial.suggest_float('alpha', 1e-2, 1e2, log=True)
    params['class_weight'] = trial.suggest_categorical('class_weight', [None, 'balanced'])
    params['random_state'] = trial.suggest_categorical('random_state', [SEED])

    classifier = lm.RidgeClassifier(**params)
    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, feature_set, thresholding)

In [19]:
%%time
clf = 'ridge'
for fset in feature_sets:
    for proc in preprocessors:
        model_name = f'{clf}_{proc}_{fset}'
        print(f'Model config: {model_name}')
        oof_preds[model_name], test_preds[model_name] = run_experiment(
            objective=ridge_objective,
            preprocessor=preprocessors[proc],
            classifier=lm.RidgeClassifier(),
            feature_set=feature_sets[fset],
            n_trials=N_TRIALS,
            thresholding=False  #since RidgeClassifier only has predict()
        )

Model config: ridge_ss_all
----------Hyperparameter tuning----------
Best trial: 10 -> Best value: 0.7878
Best hyperparameters:
max_iter        - 5000
solver          - svd
alpha           - 5.890119895047674
class_weight    - balanced
random_state    - 130322
[Time taken: 10.97s]
-----Cross-validation and prediction-----
Fold #0: 0.7870 | Fold #1: 0.7810 | Fold #2: 0.7930 | Fold #3: 0.7770 | Fold #4: 0.8010 | Avg = 0.7878 +/- 0.0085
[Time taken: 3.53s]

Model config: ridge_qt_all
----------Hyperparameter tuning----------
Best trial: 1 -> Best value: 0.8146
Best hyperparameters:
max_iter        - 5000
solver          - svd
alpha           - 0.13023545564166325
class_weight    - None
random_state    - 130322
[Time taken: 23.91s]
-----Cross-validation and prediction-----
Fold #0: 0.8090 | Fold #1: 0.8150 | Fold #2: 0.8130 | Fold #3: 0.8190 | Fold #4: 0.8170 | Avg = 0.8146 +/- 0.0034
[Time taken: 1.92s]

Model config: ridge_ss_corr
----------Hyperparameter tuning----------
Best trial: 0 -

### LogisticRegression

In [20]:
def logreg_objective(trial, preprocessor, feature_set, thresholding):
    """Optuna objective: cross-validated accuracy of a LogisticRegression pipeline.

    Tunes `penalty`, `C` and `class_weight` (plus `l1_ratio` when the penalty
    is elastic net); the remaining settings are registered as single-choice
    categoricals. The suggest calls are kept in their original order.
    """
    # Keyword arguments are evaluated left to right, preserving the order of
    # the suggest calls.
    params = dict(
        max_iter=trial.suggest_categorical('max_iter', [5000]),
        solver=trial.suggest_categorical('solver', ['saga']),
        penalty=trial.suggest_categorical('penalty', ['elasticnet', 'l1', 'l2', 'none']),
        C=trial.suggest_float('C', 1e-2, 1e2, log=True),
        class_weight=trial.suggest_categorical('class_weight', [None, 'balanced']),
        n_jobs=trial.suggest_categorical('n_jobs', [-1]),
        random_state=trial.suggest_categorical('random_state', [SEED]),
    )
    # The mixing ratio is only sampled for the elastic-net penalty.
    uses_elasticnet = params['penalty'] == 'elasticnet'
    if uses_elasticnet:
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.05, 0.95, step=0.05)

    classifier = lm.LogisticRegression(**params)
    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, feature_set, thresholding)

In [21]:
%%time
clf = 'logreg'
for fset in feature_sets:
    for proc in preprocessors:
        model_name = f'{clf}_{proc}_{fset}'
        print(f'Model config: {model_name}')
        oof_preds[model_name], test_preds[model_name] = run_experiment(
            objective=logreg_objective,
            preprocessor=preprocessors[proc],
            classifier=lm.LogisticRegression(),
            feature_set=feature_sets[fset],
            n_trials=N_TRIALS,
            thresholding=True #since LogisticRegression has predict_proba()
        )

Model config: logreg_ss_all
----------Hyperparameter tuning----------
Best trial: 41 -> Best value: 0.7918
Best hyperparameters:
max_iter        - 5000
solver          - saga
penalty         - elasticnet
C               - 0.011429245646048854
class_weight    - None
n_jobs          - -1
random_state    - 130322
l1_ratio        - 0.4
[Time taken: 43.60s]
-----Cross-validation and prediction-----
Fold #0: 0.7900 | Fold #1: 0.7960 | Fold #2: 0.7910 | Fold #3: 0.7880 | Fold #4: 0.7940 | Avg = 0.7918 +/- 0.0029
[Time taken: 2.32s]

Model config: logreg_qt_all
----------Hyperparameter tuning----------
Best trial: 10 -> Best value: 0.8290
Best hyperparameters:
max_iter        - 5000
solver          - saga
penalty         - l2
C               - 0.6978616525115373
class_weight    - None
n_jobs          - -1
random_state    - 130322
[Time taken: 50.65s]
-----Cross-validation and prediction-----
Fold #0: 0.8190 | Fold #1: 0.8260 | Fold #2: 0.8250 | Fold #3: 0.8330 | Fold #4: 0.8420 | Avg = 0.8290 

### SGDClassifier: hinge (= linear SVM)

In [35]:
def sgdhinge_objective(trial, preprocessor, feature_set, thresholding):
    """Optuna objective: cross-validated accuracy of an SGDClassifier with hinge loss.

    Tunes `penalty`, `alpha` and `class_weight` (plus `l1_ratio` when the
    penalty is elastic net); the remaining settings are registered as
    single-choice categoricals. The suggest calls are kept in their original
    order.
    """
    params = {}
    params['max_iter'] = trial.suggest_categorical('max_iter', [5000])
    params['loss'] = trial.suggest_categorical('loss', ['hinge'])
    params['penalty'] = trial.suggest_categorical('penalty', ['elasticnet', 'l1', 'l2'])
    params['alpha'] = trial.suggest_float('alpha', 5e-5, 5e2, log=True)
    params['learning_rate'] = trial.suggest_categorical('learning_rate', ['optimal'])
    params['early_stopping'] = trial.suggest_categorical('early_stopping', [True])
    params['validation_fraction'] = trial.suggest_categorical('validation_fraction', [0.1])
    params['n_iter_no_change'] = trial.suggest_categorical('n_iter_no_change', [10])
    params['class_weight'] = trial.suggest_categorical('class_weight', [None, 'balanced'])
    params['n_jobs'] = trial.suggest_categorical('n_jobs', [-1])
    params['random_state'] = trial.suggest_categorical('random_state', [SEED])
    # The mixing ratio is only sampled for the elastic-net penalty.
    if params['penalty'] == 'elasticnet':
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.05, 0.95, step=0.05)

    classifier = lm.SGDClassifier(**params)
    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, feature_set, thresholding)

In [36]:
%%time
clf = 'sgdhinge'
for fset in feature_sets:
    for proc in preprocessors:
        model_name = f'{clf}_{proc}_{fset}'
        print(f'Model config: {model_name}')
        oof_preds[model_name], test_preds[model_name] = run_experiment(
            objective=sgdhinge_objective,
            preprocessor=preprocessors[proc],
            classifier=lm.SGDClassifier(),
            feature_set=feature_sets[fset],
            n_trials=N_TRIALS,
            thresholding=False #no probability estimates for hinge loss
        )

Model config: sgdhinge_ss_all
----------Hyperparameter tuning----------
Best trial: 33 -> Best value: 0.7862
Best hyperparameters:
max_iter        - 5000
loss            - hinge
penalty         - l2
alpha           - 0.008143976821283289
learning_rate   - optimal
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - balanced
n_jobs          - -1
random_state    - 130322
[Time taken: 16.41s]
-----Cross-validation and prediction-----
Fold #0: 0.7800 | Fold #1: 0.7910 | Fold #2: 0.7870 | Fold #3: 0.7830 | Fold #4: 0.7900 | Avg = 0.7862 +/- 0.0042
[Time taken: 1.85s]

Model config: sgdhinge_qt_all
----------Hyperparameter tuning----------
Best trial: 31 -> Best value: 0.8196
Best hyperparameters:
max_iter        - 5000
loss            - hinge
penalty         - elasticnet
alpha           - 0.006086818498448863
learning_rate   - optimal
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
random_

### SGDClassifier: modified_huber

In [37]:
def sgdhuber_objective(trial, preprocessor, feature_set, thresholding):
    """Optuna objective: cross-validated accuracy of an SGDClassifier with modified_huber loss.

    Tunes `penalty`, `alpha` and `class_weight` (plus `l1_ratio` when the
    penalty is elastic net); the remaining settings are registered as
    single-choice categoricals. The suggest calls are kept in their original
    order.
    """
    # Keyword arguments are evaluated left to right, preserving the order of
    # the suggest calls.
    params = dict(
        max_iter=trial.suggest_categorical('max_iter', [5000]),
        loss=trial.suggest_categorical('loss', ['modified_huber']),
        penalty=trial.suggest_categorical('penalty', ['elasticnet', 'l1', 'l2']),
        alpha=trial.suggest_float('alpha', 5e-5, 5e2, log=True),
        learning_rate=trial.suggest_categorical('learning_rate', ['optimal']),
        early_stopping=trial.suggest_categorical('early_stopping', [True]),
        validation_fraction=trial.suggest_categorical('validation_fraction', [0.1]),
        n_iter_no_change=trial.suggest_categorical('n_iter_no_change', [10]),
        class_weight=trial.suggest_categorical('class_weight', [None, 'balanced']),
        n_jobs=trial.suggest_categorical('n_jobs', [-1]),
        random_state=trial.suggest_categorical('random_state', [SEED]),
    )
    # The mixing ratio is only sampled for the elastic-net penalty.
    if params['penalty'] == 'elasticnet':
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.05, 0.95, step=0.05)

    classifier = lm.SGDClassifier(**params)
    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, feature_set, thresholding)

In [38]:
%%time
clf = 'sgdhuber'
for fset in feature_sets:
    for proc in preprocessors:
        model_name = f'{clf}_{proc}_{fset}'
        print(f'Model config: {model_name}')
        oof_preds[model_name], test_preds[model_name] = run_experiment(
            objective=sgdhuber_objective,
            preprocessor=preprocessors[proc],
            classifier=lm.SGDClassifier(),
            feature_set=feature_sets[fset],
            n_trials=N_TRIALS,
            thresholding=True
        )

Model config: sgdhuber_ss_all
----------Hyperparameter tuning----------
Best trial: 35 -> Best value: 0.7918
Best hyperparameters:
max_iter        - 5000
loss            - modified_huber
penalty         - l1
alpha           - 0.05061351153973985
learning_rate   - optimal
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
random_state    - 130322
[Time taken: 24.64s]
-----Cross-validation and prediction-----
Fold #0: 0.7910 | Fold #1: 0.7930 | Fold #2: 0.7890 | Fold #3: 0.7890 | Fold #4: 0.7970 | Avg = 0.7918 +/- 0.0030
[Time taken: 1.97s]

Model config: sgdhuber_qt_all
----------Hyperparameter tuning----------
Best trial: 36 -> Best value: 0.8272
Best hyperparameters:
max_iter        - 5000
loss            - modified_huber
penalty         - l2
alpha           - 0.12128426127769294
learning_rate   - optimal
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
ran

### Perceptron

In [39]:
def perceptron_objective(trial, preprocessor, feature_set, thresholding):
    """Optuna objective: cross-validated accuracy of a Perceptron pipeline.

    Tunes `penalty`, `alpha`, `eta0` and `class_weight` (plus `l1_ratio` when
    the penalty is elastic net); the remaining settings are registered as
    single-choice categoricals. The suggest calls are kept in their original
    order.
    """
    params = {}
    params['max_iter'] = trial.suggest_categorical('max_iter', [5000])
    params['penalty'] = trial.suggest_categorical('penalty', ['elasticnet', 'l1', 'l2'])
    params['alpha'] = trial.suggest_float('alpha', 5e-5, 5e2, log=True)
    params['eta0'] = trial.suggest_float('eta0', 1e-2, 1e2, log=True)
    params['early_stopping'] = trial.suggest_categorical('early_stopping', [True])
    params['validation_fraction'] = trial.suggest_categorical('validation_fraction', [0.1])
    params['n_iter_no_change'] = trial.suggest_categorical('n_iter_no_change', [10])
    params['class_weight'] = trial.suggest_categorical('class_weight', [None, 'balanced'])
    params['n_jobs'] = trial.suggest_categorical('n_jobs', [-1])
    params['random_state'] = trial.suggest_categorical('random_state', [SEED])
    # The mixing ratio is only sampled for the elastic-net penalty.
    if params['penalty'] == 'elasticnet':
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.05, 0.95, step=0.05)

    classifier = lm.Perceptron(**params)
    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, feature_set, thresholding)

In [40]:
%%time
clf = 'perceptron'
for fset in feature_sets:
    for proc in preprocessors:
        model_name = f'{clf}_{proc}_{fset}'
        print(f'Model config: {model_name}')
        oof_preds[model_name], test_preds[model_name] = run_experiment(
            objective=perceptron_objective,
            preprocessor=preprocessors[proc],
            classifier=lm.Perceptron(),
            feature_set=feature_sets[fset],
            n_trials=N_TRIALS,
            thresholding=False
        )

Model config: perceptron_ss_all
----------Hyperparameter tuning----------
Best trial: 14 -> Best value: 0.7008
Best hyperparameters:
max_iter        - 5000
penalty         - l2
alpha           - 0.343214217162308
eta0            - 0.013283355958811353
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
random_state    - 130322
[Time taken: 15.47s]
-----Cross-validation and prediction-----
Fold #0: 0.7030 | Fold #1: 0.7540 | Fold #2: 0.6520 | Fold #3: 0.6820 | Fold #4: 0.7130 | Avg = 0.7008 +/- 0.0338
[Time taken: 1.84s]

Model config: perceptron_qt_all
----------Hyperparameter tuning----------
Best trial: 30 -> Best value: 0.7466
Best hyperparameters:
max_iter        - 5000
penalty         - elasticnet
alpha           - 0.0017520333809086647
eta0            - 0.2309444875431106
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
random_state    - 130322
l1_ratio

### Passive Aggressive Classifier

In [48]:
def passagg_objective(trial, preprocessor, feature_set, thresholding):
    """Optuna objective: cross-validated accuracy of a PassiveAggressiveClassifier pipeline.

    Tunes `C` and `class_weight`; the remaining settings are registered as
    single-choice categoricals. The suggest calls are kept in their original
    order.
    """
    # Keyword arguments are evaluated left to right, preserving the order of
    # the suggest calls.
    params = dict(
        max_iter=trial.suggest_categorical('max_iter', [5000]),
        C=trial.suggest_float('C', 1e-2, 1e2, log=True),
        loss=trial.suggest_categorical('loss', ['squared_hinge']),
        early_stopping=trial.suggest_categorical('early_stopping', [True]),
        validation_fraction=trial.suggest_categorical('validation_fraction', [0.1]),
        n_iter_no_change=trial.suggest_categorical('n_iter_no_change', [10]),
        class_weight=trial.suggest_categorical('class_weight', [None, 'balanced']),
        n_jobs=trial.suggest_categorical('n_jobs', [-1]),
        random_state=trial.suggest_categorical('random_state', [SEED]),
    )

    classifier = lm.PassiveAggressiveClassifier(**params)
    pipeline = make_pipeline(preprocessor, classifier)
    return cross_validate(pipeline, feature_set, thresholding)

In [49]:
%%time
clf = 'passagg'
for fset in feature_sets:
    for proc in preprocessors:
        model_name = f'{clf}_{proc}_{fset}'
        print(f'Model config: {model_name}')
        oof_preds[model_name], test_preds[model_name] = run_experiment(
            objective=passagg_objective,
            preprocessor=preprocessors[proc],
            classifier=lm.PassiveAggressiveClassifier(),
            feature_set=feature_sets[fset],
            n_trials=N_TRIALS,
            thresholding=False
        )

Model config: passagg_ss_all
----------Hyperparameter tuning----------
Best trial: 20 -> Best value: 0.7516
Best hyperparameters:
max_iter        - 5000
C               - 0.010064884977692291
loss            - squared_hinge
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
random_state    - 130322
[Time taken: 14.78s]
-----Cross-validation and prediction-----
Fold #0: 0.7760 | Fold #1: 0.7400 | Fold #2: 0.7160 | Fold #3: 0.7680 | Fold #4: 0.7580 | Avg = 0.7516 +/- 0.0215
[Time taken: 1.85s]

Model config: passagg_qt_all
----------Hyperparameter tuning----------
Best trial: 20 -> Best value: 0.7958
Best hyperparameters:
max_iter        - 5000
C               - 0.010064884977692291
loss            - squared_hinge
early_stopping  - True
validation_fraction - 0.1
n_iter_no_change - 10
class_weight    - None
n_jobs          - -1
random_state    - 130322
[Time taken: 23.44s]
-----Cross-validation and prediction-----
Fold #0: 0.

# Submission files

In [50]:
# Colab-only step: mount Google Drive at /content/drive (forcing a remount if
# it is already mounted) so that submission files can be written under it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [62]:
# Notebook number; used to give each notebook its own submissions folder.
NOTEBOOK = '01'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/dphi/ds92/submissions/nb_{NOTEBOOK}'
# exist_ok=True makes the cell safe to re-run and replaces the separate
# check-then-create steps with a single call.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [63]:
def create_submission_files(test_preds, path=SUBMISSION_PATH):
    """Write one submission CSV per model configuration plus a majority-vote file.

    Parameters
    ----------
    test_preds : pandas.DataFrame
        One column of encoded class predictions per model configuration; the
        column name becomes the file name.
    path : str, default=SUBMISSION_PATH
        Output directory; created if it does not exist yet.

    Each file has a single `class` column holding the original labels
    recovered through `labels.inverse_transform`. The extra file
    `all_models_mode.csv` contains the row-wise mode over all columns.
    """
    # Only the default directory is created elsewhere in the notebook, so make
    # sure a custom `path` exists as well.
    os.makedirs(path, exist_ok=True)

    def _write(encoded_preds, file_stem):
        # Integer codes are required to index the encoder's classes; the cast
        # is a no-op for predictions that are already integers.
        sub = pd.DataFrame({
            'class': labels.inverse_transform(encoded_preds.astype('int'))
        })
        sub.to_csv(os.path.join(path, f'{file_stem}.csv'), index=False)

    for model_config in test_preds.columns:
        _write(test_preds[model_config], model_config)

    # mode() pads tied rows with NaN (turning values into floats); column 0
    # holds the smallest mode of each row.
    _write(test_preds.mode(axis=1)[0], 'all_models_mode')

In [64]:
# Preview the collected test-set predictions: one column per model configuration.
test_preds.head()

Unnamed: 0,ridge_ss_all,ridge_qt_all,ridge_ss_corr,ridge_qt_corr,ridge_ss_mi,ridge_qt_mi,logreg_ss_all,logreg_qt_all,logreg_ss_corr,logreg_qt_corr,logreg_ss_mi,logreg_qt_mi,sgdhinge_ss_all,sgdhinge_qt_all,sgdhinge_ss_corr,sgdhinge_qt_corr,sgdhinge_ss_mi,sgdhinge_qt_mi,sgdhuber_ss_all,sgdhuber_qt_all,sgdhuber_ss_corr,sgdhuber_qt_corr,sgdhuber_ss_mi,sgdhuber_qt_mi,perceptron_ss_all,perceptron_qt_all,perceptron_ss_corr,perceptron_qt_corr,perceptron_ss_mi,perceptron_qt_mi,passagg_ss_all,passagg_qt_all,passagg_ss_corr,passagg_qt_corr,passagg_ss_mi,passagg_qt_mi
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,0,1,0
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [65]:
# Write one CSV per model configuration, plus the all-model majority vote,
# into SUBMISSION_PATH (the function's default output directory).
create_submission_files(test_preds)