<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/analytics_olympiad22/notebooks/01_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

**Utils**

In [1]:
import gc
import os
import time
import warnings

# Turn on automatic garbage collection.
gc.enable()
# Silences every warning category for the rest of the session, including
# warnings raised by third-party libraries.
warnings.filterwarnings(action='ignore')

**Data analysis**

In [2]:
import numpy as np
import pandas as pd

# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Display floats with 4 digits of precision.
pd.set_option('display.precision', 4)

**Modeling**

In [3]:
%%capture
!pip install -U optuna

In [4]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

optuna.__version__

'3.0.3'

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, brier_score_loss

**Reproducibility**

In [6]:
# Single seed shared by the CV splitter, the Optuna sampler and the classifier.
SEED = 2311
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects processes spawned later, not this
# kernel -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seeds NumPy's legacy global random generator.
np.random.seed(SEED)

**Data**

In [7]:
# Base URL of the competition data folder; the CSVs are read straight from GitHub.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/analytics_olympiad22/data'

# Two versions of the training set and one test set from the processed/ folder.
train_full = pd.read_csv(f'{DATA_URL}/processed/train_proc.csv')
train_clip = pd.read_csv(f'{DATA_URL}/processed/train_clip.csv')
test = pd.read_csv(f'{DATA_URL}/processed/test_proc.csv')
# Submission template from the raw/ folder; copied and filled in when writing submissions.
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

In [8]:
TARGET = 'OUTCOME'

In [9]:
# Candidate model inputs: every test column apart from the three excluded here.
features = test.columns[
    ~test.columns.isin(['ID', 'POSTAL_CODE', 'ANNUAL_MILEAGE'])
].tolist()

# Columns treated as numerical.
num_features = [
    'ID_COUNT',
    'CREDIT_SCORE',
    'ANNUAL_MILEAGE_K',
    'DUIS',
    'SPEEDING_VIOLATIONS',
    'PAST_ACCIDENTS',
    'TOTAL_PAST_INCIDENTS',
]

# Whatever is left in `features` is treated as categorical.
cat_features = [col for col in features if col not in num_features]

# Only these two nominal features get one-hot encoded; all other nominal
# categorical features are binary, so they need no one-hot encoding.
one_hot_features = ['TYPE_OF_VEHICLE', 'POSTAL_CODE_REGION']

In [10]:
frames = (train_full, train_clip, test)

# Compact integer storage for all categorical columns.
for frame in frames:
    frame[cat_features] = frame[cat_features].astype('int8')

# The one-hot columns get their own dtype so that make_column_selector
# can tell them apart from the rest.
for frame in frames:
    frame[one_hot_features] = frame[one_hot_features].astype('category')

In [19]:
# Feature subsets compared in the experiments below; each list is used to
# select the columns of the train/test frames passed to run_experiment.
original_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
                     'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 
                     'CHILDREN', 'POSTAL_CODE_SUBREGION', 'ANNUAL_MILEAGE_K', 
                     'SPEEDING_VIOLATIONS', 'DUIS', 'PAST_ACCIDENTS', 'TYPE_OF_VEHICLE']

# Subset containing none of the columns listed in num_features; the binned /
# range / HAS_* columns appear here instead.
cat_only_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
                     'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
                     'TYPE_OF_VEHICLE', 'IS_ID_REPEATED', 'CREDIT_SCORE_BINS', 
                     'POSTAL_CODE_REGION', 'ANNUAL_MILEAGE_RANGE', 'HAS_PRIOR_DUIS', 
                     'HAS_PRIOR_SPEEDING_VIOLATIONS', 'HAS_PAST_ACCIDENTS',
                     'HAS_PAST_INCIDENTS']

#based on F-test and Chi2-test
reduced_features = ['GENDER', 'DRIVING_EXPERIENCE', 'POSTAL_CODE_SUBREGION']

#based on mutual information score
mi_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
               'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
               'TYPE_OF_VEHICLE', 'ID_COUNT', 'CREDIT_SCORE_BINS', 'POSTAL_CODE_REGION', 
               'POSTAL_CODE_SUBREGION', 'ANNUAL_MILEAGE_RANGE', 'TOTAL_PAST_INCIDENTS', 
               'HAS_PAST_INCIDENTS']

# Modeling

In [20]:
NUM_FOLDS = 5

In [21]:
def objective(trial, data, preprocessor, base_params):
    """Optuna objective: mean cross-validated log loss of a logistic-regression pipeline.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters of this evaluation.
    data : tuple
        ``(X, y)`` -- a DataFrame of features and the matching target Series.
    preprocessor : transformer
        First step of the pipeline, applied before the classifier.
    base_params : dict
        Fixed ``LogisticRegression`` keyword arguments shared by all trials.

    Returns
    -------
    float
        Log loss averaged over ``NUM_FOLDS`` stratified folds (lower is better).
    """
    X, y = data

    # NOTE(review): scikit-learn documents `intercept_scaling` as only used by
    # the 'liblinear' solver; if base_params keeps another solver this
    # dimension is presumably a no-op -- confirm.
    param_grid = {
        'penalty': trial.suggest_categorical(
            'penalty', ['l1', 'l2', 'elasticnet', 'none']
        ),
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'intercept_scaling': trial.suggest_float(
            'intercept_scaling', 0.1, 10, step=0.1
        ),
        'class_weight': trial.suggest_categorical(
            'class_weight', ['balanced', None]
        )
    }
    # l1_ratio is only sampled for the elastic-net penalty.
    if param_grid['penalty'] == 'elasticnet':
        param_grid['l1_ratio'] = trial.suggest_float(
            'l1_ratio', 0.05, 0.95, step=0.05
        )

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(**base_params, **param_grid))
    ])

    scores = []
    cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # cv.split yields positional indices, so X and y are both sliced with
        # .iloc; label-based .loc would misalign rows (or raise) whenever the
        # index of X is not the default 0..n-1 range.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        val_probs = model.predict_proba(X_val)[:, 1]
        scores.append(log_loss(y_val, val_probs))

    return np.mean(scores)

In [22]:
def tune_params(data, preprocessor, base_params, n_trials, direction):
    """Run an Optuna study over ``objective`` and return the finished study.

    ``data``, ``preprocessor`` and ``base_params`` are forwarded unchanged to
    ``objective`` on every trial; ``n_trials`` is the number of trials to run
    and ``direction`` is passed to ``optuna.create_study``.

    NOTE(review): a pruner acts on intermediate values reported by the
    objective; the objective in this notebook reports none, so the
    HyperbandPruner presumably never prunes a trial -- confirm.
    """
    def _evaluate(trial):
        # Bind the fixed arguments so Optuna only has to supply the trial.
        return objective(trial, data, preprocessor, base_params)

    sampler = TPESampler(seed=SEED)  # seeded so the search is repeatable
    pruner = HyperbandPruner()
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction=direction)
    study.optimize(func=_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

In [28]:
def cross_validate_predict(model, data):
    """Cross-validate ``model`` and predict the test set once per fold.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict_proba``
        Refitted from scratch on the training part of every fold.
    data : tuple
        ``(X, y, X_test)`` -- training features, training target, test features.

    Returns
    -------
    oof_probs : pd.Series
        Out-of-fold probability of the positive class for every training row,
        indexed by the row's position in ``X`` and sorted by that position.
    test_probs : pd.DataFrame
        One column ``fold{i}`` of test-set probabilities per fold, plus a
        ``mean`` column averaging them row-wise.

    Side effects: prints the log loss of every fold and their mean/std.
    """
    oof_probs = {}   # out-of-fold predicted probabilities on train set
    test_probs = {}  # predicted probabilities on test set for each fold
    scores = []      # scores on validation set

    X, y, X_test = data

    cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so X and y are both sliced with
        # .iloc; label-based .loc would misalign rows (or raise) whenever the
        # index of X is not the default 0..n-1 range.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        val_probs = model.predict_proba(X_val)[:, 1]
        oof_probs.update(zip(val_idx, val_probs))
        test_probs[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        score = log_loss(y_val, val_probs)
        scores.append(score)
        print(f'Fold #{fold}: {score:.6f}')
        _ = gc.collect()
    print(f'Avg = {np.mean(scores):.6f} +/- {np.std(scores):.6f}\n')

    oof_probs = pd.Series(oof_probs).sort_index()
    test_probs = pd.DataFrame.from_dict(test_probs)
    test_probs['mean'] = test_probs.mean(axis=1)

    return oof_probs, test_probs

In [29]:
def run_experiment(data, n_trials=5):
    """Tune a logistic-regression pipeline with Optuna, then cross-validate it.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)``. The tuning step sees ``(X, y)``; the full tuple is
        handed to ``cross_validate_predict``.
    n_trials : int, default 5
        Number of Optuna trials for the hyperparameter search.

    Returns
    -------
    tuple
        ``(oof_preds, test_preds)`` as returned by ``cross_validate_predict``
        for a pipeline built with the best hyperparameters found.

    Side effects: prints the best trial, its hyperparameters and the time
    taken by each of the two stages.
    """
    X, y, _ = data  # the test features are only used by cross_validate_predict

    # LogisticRegression settings that stay fixed across all trials.
    base_params = dict(
        max_iter=10000,
        solver='saga',
        dual=False,
        tol=2e-4,
        fit_intercept=True,
        n_jobs=-1,
        random_state=SEED,
    )

    # Columns of dtype 'category' are one-hot encoded; every other column is
    # min-max scaled to [0, 1].
    category_columns = make_column_selector(dtype_include='category')
    other_columns = make_column_selector(dtype_exclude='category')
    encoder = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
    scaler = MinMaxScaler(feature_range=(0, 1))
    preprocessor = ColumnTransformer(
        transformers=[
            ('nominal_cat', encoder, category_columns),
            ('numerical', scaler, other_columns),
        ],
        remainder='passthrough',
        n_jobs=-1,
    )

    # ---- Stage 1: hyperparameter search ----
    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        data=(X, y),
        preprocessor=preprocessor,
        base_params=base_params,
        n_trials=n_trials,
        direction='minimize',  # metric is logloss -> lower is better
    )
    tuning_seconds = time.time() - tuning_started

    best_params = study.best_params
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.6f}')
    print('Best hyperparameters:')
    for name, value in best_params.items():
        print(f'{name:15} - {value}')
    print(f'[Time taken: {tuning_seconds:.2f}s]')

    # ---- Stage 2: cross-validation and test prediction with the best setup ----
    print('-----Cross-validation and prediction-----')
    cv_started = time.time()
    classifier = LogisticRegression(**base_params, **best_params)
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    oof_preds, test_preds = cross_validate_predict(model, data)
    cv_seconds = time.time() - cv_started
    print(f'[Time taken: {cv_seconds:.2f}s]\n')

    return oof_preds, test_preds

### Trial run

In [30]:
%%time
# Short smoke run (3 trials) with INFO-level Optuna logging so that every
# trial's value and parameters are printed.
optuna.logging.set_verbosity(optuna.logging.INFO)
op, tp = run_experiment(
    data=(train_full[original_features], train_full[TARGET], test[original_features]),
    n_trials=3
)

[32m[I 2022-11-04 05:59:02,052][0m A new study created in memory with name: no-name-2ae69b84-978e-4dbd-bac5-46c0406a0df7[0m


----------Hyperparameter tuning----------


[32m[I 2022-11-04 05:59:10,584][0m Trial 0 finished with value: 0.681043691278779 and parameters: {'penalty': 'l2', 'C': 0.00025643698116528354, 'intercept_scaling': 7.1, 'class_weight': None}. Best is trial 0 with value: 0.681043691278779.[0m
[32m[I 2022-11-04 05:59:15,903][0m Trial 1 finished with value: 0.693095141072452 and parameters: {'penalty': 'l1', 'C': 0.0015649840907716501, 'intercept_scaling': 8.3, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.681043691278779.[0m
[32m[I 2022-11-04 05:59:39,869][0m Trial 2 finished with value: 0.6931120958690793 and parameters: {'penalty': 'l2', 'C': 2.403161227248477, 'intercept_scaling': 7.7, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.681043691278779.[0m


Best trial: 0 -> Best value: 0.681044
Best hyperparameters:
penalty         - l2
C               - 0.00025643698116528354
intercept_scaling - 7.1
class_weight    - None
[Time taken: 37.94s]
-----Cross-validation and prediction-----
Fold #0: 0.681017
Fold #1: 0.681012
Fold #2: 0.681094
Fold #3: 0.681043
Fold #4: 0.681052
Avg = 0.681044 +/- 0.000030

[Time taken: 9.92s]

CPU times: user 42.2 s, sys: 3.03 s, total: 45.2 s
Wall time: 47.9 s


In [31]:
# Only show Optuna errors from here on; the full experiments run 100 trials each.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [32]:
%%time
# Experiment 01: train_full rows, original_features columns.
op1, tp1 = run_experiment(
    data=(train_full[original_features], train_full[TARGET], test[original_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 51 -> Best value: 0.681040
Best hyperparameters:
penalty         - l2
C               - 0.0004364976239962741
intercept_scaling - 9.1
class_weight    - None
[Time taken: 1729.18s]
-----Cross-validation and prediction-----
Fold #0: 0.681005
Fold #1: 0.681001
Fold #2: 0.681102
Fold #3: 0.681038
Fold #4: 0.681053
Avg = 0.681040 +/- 0.000037

[Time taken: 9.82s]

CPU times: user 26min 51s, sys: 1min 3s, total: 27min 54s
Wall time: 28min 59s


In [33]:
%%time
# Experiment 02: train_clip rows, original_features columns.
op2, tp2 = run_experiment(
    data=(train_clip[original_features], train_clip[TARGET], test[original_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 41 -> Best value: 0.681025
Best hyperparameters:
penalty         - l2
C               - 0.00040039930745587354
intercept_scaling - 8.3
class_weight    - None
[Time taken: 1742.28s]
-----Cross-validation and prediction-----
Fold #0: 0.680971
Fold #1: 0.681060
Fold #2: 0.681082
Fold #3: 0.681091
Fold #4: 0.680923
Avg = 0.681025 +/- 0.000067

[Time taken: 8.81s]

CPU times: user 27min 1s, sys: 1min 3s, total: 28min 5s
Wall time: 29min 11s


In [34]:
%%time
# Experiment 03: train_full rows, cat_only_features columns.
op3, tp3 = run_experiment(
    data=(train_full[cat_only_features], train_full[TARGET], test[cat_only_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 82 -> Best value: 0.681046
Best hyperparameters:
penalty         - l2
C               - 0.0003028779297664595
intercept_scaling - 8.9
class_weight    - None
[Time taken: 1024.11s]
-----Cross-validation and prediction-----
Fold #0: 0.681145
Fold #1: 0.680978
Fold #2: 0.681068
Fold #3: 0.681022
Fold #4: 0.681017
Avg = 0.681046 +/- 0.000057

[Time taken: 10.67s]

CPU times: user 14min 59s, sys: 1min 2s, total: 16min 1s
Wall time: 17min 14s


In [35]:
%%time
# Experiment 04: train_clip rows, cat_only_features columns.
op4, tp4 = run_experiment(
    data=(train_clip[cat_only_features], train_clip[TARGET], test[cat_only_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 72 -> Best value: 0.681020
Best hyperparameters:
penalty         - l2
C               - 0.00031915269748580236
intercept_scaling - 8.9
class_weight    - None
[Time taken: 981.62s]
-----Cross-validation and prediction-----
Fold #0: 0.680969
Fold #1: 0.681055
Fold #2: 0.681088
Fold #3: 0.681086
Fold #4: 0.680905
Avg = 0.681020 +/- 0.000072

[Time taken: 9.85s]

CPU times: user 14min 19s, sys: 1min 1s, total: 15min 21s
Wall time: 16min 31s


In [36]:
%%time
# Experiment 05: train_full rows, mi_features columns.
op5, tp5 = run_experiment(
    data=(train_full[mi_features], train_full[TARGET], test[mi_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 77 -> Best value: 0.681053
Best hyperparameters:
penalty         - l2
C               - 0.000320494516419467
intercept_scaling - 7.8
class_weight    - None
[Time taken: 999.18s]
-----Cross-validation and prediction-----
Fold #0: 0.681034
Fold #1: 0.681019
Fold #2: 0.681097
Fold #3: 0.681062
Fold #4: 0.681054
Avg = 0.681053 +/- 0.000027

[Time taken: 11.06s]

CPU times: user 14min 13s, sys: 1min 1s, total: 15min 15s
Wall time: 16min 50s


In [37]:
%%time
# Experiment 06: train_clip rows, mi_features columns.
op6, tp6 = run_experiment(
    data=(train_clip[mi_features], train_clip[TARGET], test[mi_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 91 -> Best value: 0.681045
Best hyperparameters:
penalty         - l2
C               - 0.00026170426707950605
intercept_scaling - 7.7
class_weight    - None
[Time taken: 1261.60s]
-----Cross-validation and prediction-----
Fold #0: 0.681005
Fold #1: 0.681077
Fold #2: 0.681076
Fold #3: 0.681109
Fold #4: 0.680955
Avg = 0.681045 +/- 0.000056

[Time taken: 9.46s]

CPU times: user 18min 53s, sys: 1min, total: 19min 53s
Wall time: 21min 11s


In [38]:
%%time
# Experiment 07: train_full rows, reduced_features columns.
op7, tp7 = run_experiment(
    data=(train_full[reduced_features], train_full[TARGET], test[reduced_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 81 -> Best value: 0.681038
Best hyperparameters:
penalty         - l2
C               - 0.0016659478765338111
intercept_scaling - 1.7000000000000002
class_weight    - None
[Time taken: 321.28s]
-----Cross-validation and prediction-----
Fold #0: 0.681018
Fold #1: 0.680919
Fold #2: 0.681108
Fold #3: 0.681087
Fold #4: 0.681061
Avg = 0.681038 +/- 0.000067

[Time taken: 4.42s]

CPU times: user 4min 44s, sys: 54.7 s, total: 5min 39s
Wall time: 5min 25s


In [39]:
%%time
# Experiment 08: train_clip rows, reduced_features columns.
op8, tp8 = run_experiment(
    data=(train_clip[reduced_features], train_clip[TARGET], test[reduced_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 30 -> Best value: 0.681015
Best hyperparameters:
penalty         - l2
C               - 0.0031313614951399126
intercept_scaling - 10.0
class_weight    - None
[Time taken: 289.56s]
-----Cross-validation and prediction-----
Fold #0: 0.680985
Fold #1: 0.681057
Fold #2: 0.681066
Fold #3: 0.681031
Fold #4: 0.680936
Avg = 0.681015 +/- 0.000049

[Time taken: 3.98s]

CPU times: user 4min 18s, sys: 54.4 s, total: 5min 12s
Wall time: 4min 53s


# Generating submission files

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [42]:
# Two-character notebook id; submissions of this notebook live in their own folder.
NOTEBOOK = '01'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/analytics_olympiad22/submissions/nb_{NOTEBOOK}'
# exist_ok=True makes the cell safe to re-run and avoids the separate
# "check, then create" step.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [43]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: str):
    """Write one submission CSV per column of ``test_preds``.

    Parameters
    ----------
    test_preds : pd.DataFrame
        Test-set predictions, one column per variant to submit.
    expt_num : str
        Experiment label used as the file-name prefix (the callers pass
        zero-padded strings such as ``'01'``). It is only interpolated into
        the file name, so any value that formats to text works.

    Each file is a copy of ``sample_sub`` whose ``TARGET`` column is replaced
    by one column of ``test_preds`` and is saved as
    ``{SUBMISSION_PATH}/{expt_num}_{col}.csv`` without the index.
    """
    for col in test_preds.columns:
        sub = sample_sub.copy()
        # Assigning a Series aligns on index labels, not on row position.
        sub[TARGET] = test_preds[col]
        sub.to_csv(f'{SUBMISSION_PATH}/{expt_num}_{col}.csv', index=False)

In [44]:
# Write the submission files of all eight experiments; the file-name prefix
# is the experiment's two-digit number ('01' ... '08').
experiment_preds = (tp1, tp2, tp3, tp4, tp5, tp6, tp7, tp8)
for expt_idx, preds in enumerate(experiment_preds, start=1):
    create_submission_files(preds, f'{expt_idx:02d}')