# Setup

In [1]:
import gc
import numpy as np
import pandas as pd

gc.enable()
pd.set_option('display.max_columns', None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline

SEED = 55

In [2]:
DATA_DIR = '/kaggle/input/playground-series-s4e1'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

original = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')

# Data exploration

In [3]:
train.shape, test.shape, original.shape

((165034, 14), (110023, 13), (10002, 14))

In [4]:
train.sample(random_state=SEED)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
91586,91586,15697567,Trevisani,762,France,Female,36.0,8,0.0,2,1.0,1.0,151325.24,0


In [5]:
original.sample(random_state=SEED)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
8776,8777,15695383,Griffin,567,Spain,Male,44.0,9,0.0,2,1.0,0.0,87677.15,0


**Target distribution:**

In [6]:
TARGET = 'Exited'

with pd.option_context('display.precision', 3):
    display(pd.concat(
        [
            train[TARGET].value_counts(normalize=True).rename('Train'),
            original[TARGET].value_counts(normalize=True).rename('Original')
        ],
        axis=1
    ))

Unnamed: 0_level_0,Train,Original
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.788,0.796
1,0.212,0.204


**Dropping irrelevant column(s):**

In [7]:
train.id.nunique(), test.id.nunique(), original.RowNumber.nunique()

(165034, 110023, 10000)

Same as number of rows i.e, all unique values.

In [8]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
original = original.drop('RowNumber', axis=1)

In [9]:
train.CustomerId.nunique(), test.CustomerId.nunique(), original.CustomerId.nunique()

(23221, 19698, 10000)

Repeats in train and test.

In [10]:
train.CustomerId.value_counts()[:5]

CustomerId
15682355    121
15570194     99
15585835     98
15595588     91
15648067     90
Name: count, dtype: int64

In [11]:
train.loc[train.CustomerId == 15682355].sample(5, random_state=SEED)

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
95668,15682355,Chienezie,775,France,Female,56.0,9,124049.8,4,0.0,1.0,163801.53,1
50768,15682355,Chuang,684,France,Male,36.0,3,128770.86,1,0.0,1.0,81723.8,0
160819,15682355,L?,773,Spain,Male,38.0,6,138597.54,1,1.0,0.0,52314.71,0
141112,15682355,Hsiung,586,Spain,Male,42.0,1,0.0,2,1.0,0.0,137571.05,0
103658,15682355,K?,662,France,Male,28.0,9,0.0,2,1.0,1.0,10928.3,0


Different customers with same CustomerId. Most probably a result of synthetic data creation (since original data has all unique CustomerIds).  
We should focus on the customer's account information instead of their identity.

In [12]:
train = train.drop(['CustomerId', 'Surname'], axis=1)
test = test.drop(['CustomerId', 'Surname'], axis=1)
original = original.drop(['CustomerId', 'Surname'], axis=1)

**Feature types (unique values):**

In [13]:
features = test.columns

In [14]:
pd.concat(
    [
        train[features].nunique().rename('Train'),
        test[features].nunique().rename('Test'),
        original[features].nunique().rename('Original')
    ],
    axis=1
).T

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
Train,457,3,2,71,11,30075,4,2,2,55298
Test,454,3,2,74,11,22513,4,2,2,41670
Original,460,3,2,73,11,6382,4,2,2,9999


In [15]:
cat_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
num_features = [f for f in features if f not in cat_features]

In [16]:
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')
original[cat_features] = original[cat_features].astype('category')

**Missing values:**

In [17]:
train.isna().sum().sum(), test.isna().sum().sum(), original.isna().sum().sum()

(0, 0, 4)

In [18]:
missing_rows = original.isna().any(axis=1)
original[missing_rows]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0
6,822,,Male,50.0,7,0.0,2,1.0,1.0,10062.8,0
8,501,France,Male,44.0,4,142051.07,2,0.0,,74940.5,0
9,684,France,Male,,2,134603.88,1,1.0,1.0,71725.73,0


* Only 4 values missing over all three datasets
* Original dataset has values missing completely at random
* For categorical columns -> fill with target-grouped most frequent
* For numerical column -> fill with target-grouped median

In [19]:
original.groupby(TARGET)[cat_features].describe()

Unnamed: 0_level_0,Geography,Geography,Geography,Geography,Gender,Gender,Gender,Gender,HasCrCard,HasCrCard,HasCrCard,HasCrCard,IsActiveMember,IsActiveMember,IsActiveMember,IsActiveMember
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
Exited,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,7963,3,France,4204,7964,2,Male,4559,7963.0,2.0,1.0,5631.0,7963.0,2.0,1.0,4415.0
1,2038,3,Germany,815,2038,2,Female,1139,2038.0,2.0,1.0,1425.0,2038.0,2.0,0.0,1303.0


In [20]:
original['HasCrCard'].fillna(value=1, inplace=True)
original['Geography'].fillna(value='France', inplace=True)
original['IsActiveMember'].fillna(value=1, inplace=True)

In [21]:
original.groupby(TARGET)['Age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7963.0,37.408644,10.125235,18.0,31.0,36.0,41.0,92.0
1,2038.0,44.836605,9.759368,18.0,38.0,45.0,51.0,84.0


In [22]:
original['Age'].fillna(value=36, inplace=True)

**Duplicates:**

Within train dataset -

In [23]:
train.duplicated().sum(), train.drop(TARGET, axis=1).duplicated().sum()

(123, 175)

In [24]:
train = train.drop_duplicates(keep='first', ignore_index=True)

In [25]:
dup_rows = train.drop(TARGET, axis=1).duplicated(keep=False)
train[dup_rows].sort_values(by='CreditScore')[:6]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
39850,468,Spain,Male,26.0,1,131643.25,1,1.0,0.0,64436.16,0
111582,468,Spain,Male,26.0,1,131643.25,1,1.0,0.0,64436.16,1
127331,498,Germany,Male,25.0,8,121702.73,1,1.0,1.0,132210.49,1
152754,498,Germany,Male,25.0,8,121702.73,1,1.0,1.0,132210.49,0
61518,504,Germany,Female,43.0,7,102365.49,1,1.0,1.0,194690.77,1
68681,504,Germany,Female,43.0,7,102365.49,1,1.0,1.0,194690.77,0


In [26]:
train = train.drop_duplicates(subset=features, keep=False, ignore_index=True)

Within original dataset -

In [27]:
original.duplicated().sum(), original.drop(TARGET, axis=1).duplicated().sum()

(2, 2)

In [28]:
dup_rows = original.duplicated(keep=False)
original[dup_rows]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9998,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
9999,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
10000,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0
10001,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0


In [29]:
original = original.drop_duplicates(keep='first', ignore_index=True)

Between train and original:

In [30]:
pd.concat([train, original]).duplicated().sum()

416

In [31]:
pd.concat([train.drop(TARGET, axis=1), original.drop(TARGET, axis=1)]).duplicated().sum()

1082

In [32]:
labeled = pd.concat([train, original], axis=0, ignore_index=True)
dup_rows = labeled.duplicated(subset=features, keep=False)
labeled[dup_rows].sort_values(by='CreditScore')[:10]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
174431,350,France,Female,40.0,0,111098.85,1,1.0,1.0,172321.21,1
106540,350,France,Female,40.0,0,111098.85,1,1.0,1.0,172321.21,0
172961,367,Spain,Male,42.0,6,93608.28,1,1.0,0.0,168816.73,1
39342,367,Spain,Male,42.0,6,93608.28,1,1.0,0.0,168816.73,0
165749,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69,1
105905,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69,0
174017,382,Spain,Male,36.0,0,0.0,1,1.0,1.0,179540.73,1
146664,382,Spain,Male,36.0,0,0.0,1,1.0,1.0,179540.73,0
168338,408,France,Female,84.0,8,87873.39,1,0.0,0.0,188484.52,1
147244,408,France,Female,84.0,8,87873.39,1,0.0,0.0,188484.52,0


These need to be removed if we include original data for training our model.  
Can be handled in our cross-validation framework.

# Preprocessing

In [33]:
gender_mapping = {'Female': 0, 'Male': 1}

train['Gender'] = train['Gender'].replace(gender_mapping).astype('int8')
test['Gender'] = test['Gender'].replace(gender_mapping).astype('int8')
original['Gender'] = original['Gender'].replace(gender_mapping).astype('int8')

In [34]:
train = pd.get_dummies(train, columns=['Geography'], dtype='int8')
test = pd.get_dummies(test, columns=['Geography'], dtype='int8')
original = pd.get_dummies(original, columns=['Geography'], dtype='int8')

In [35]:
features = list(test.columns)
cat_features = [f for f in features if f not in num_features]

train[cat_features] = train[cat_features].astype('int8')
test[cat_features] = test[cat_features].astype('int8')
original[cat_features] = original[cat_features].astype('int8')

# Cross-validation framework

In [36]:
def comp_metric(y_true, y_pred):
    return roc_auc_score(y_true, y_pred)

In [37]:
def custom_cv(features, model, extend=False, folds=7, seed=SEED, verbose=True):
    oof_preds = {}
    test_preds = {}
    
    X_test = test[features]
    
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, train[TARGET])):
        X_train, X_val = train.iloc[train_ids], train.iloc[val_ids]
        if extend:
            X_train = pd.concat([X_train, original], axis=0) \
                        .drop_duplicates(keep=False, ignore_index=True)
        
        y_train, y_val = X_train.pop(TARGET), X_val.pop(TARGET)
        X_train, X_val = X_train[features], X_val[features]
        
        model.fit(X_train, y_train)
        
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(dict(zip(val_ids, val_preds)))
        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]
        
        if verbose:
            score = comp_metric(y_val, val_preds)
            print(f'Fold #{fold}: {score:.4f}', end=' | ')
            
        _ = gc.collect()
    
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.4f}\n')
    
    return oof_preds, test_preds

**Submission files:**

In [38]:
def create_submission_files(test_preds, config, notebook='00'):
    sub = sample_sub.copy()
    sub[TARGET] = test_preds['mean']
    sub.to_csv(f'{notebook}_{config}.csv', index=False)

**Model:**

In [39]:
num_preprocessor = make_pipeline(RobustScaler(), MinMaxScaler())
preprocessor = make_column_transformer(
    # numerical only; binary categorical does not need any preprocessing 
    (num_preprocessor, make_column_selector(dtype_exclude='int8')),
    remainder='passthrough',
    n_jobs=-1)

classifier = LogisticRegression(
    C=0.1, 
    max_iter=1000, 
    n_jobs=-1, 
    random_state=SEED)

model = make_pipeline(preprocessor, classifier)

# Experiments

In [40]:
op = {} # Train-set OOF predictions
tp = {} # Test-set predictions

### Train-only vs Train + Original

In [41]:
%%time

model_name = 'logreg'
dataset = 'trn'
folds = 7
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=model, 
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8183 | Fold #1: 0.8217 | Fold #2: 0.8109 | Fold #3: 0.8184 | Fold #4: 0.8191 | Fold #5: 0.8145 | Fold #6: 0.8216 | OOF score: 0.8178

CPU times: user 5.86 s, sys: 3.5 s, total: 9.36 s
Wall time: 19.7 s


In [42]:
%%time

model_name = 'logreg'
dataset = 'ext'
folds = 7
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=model, 
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8185 | Fold #1: 0.8216 | Fold #2: 0.8109 | Fold #3: 0.8187 | Fold #4: 0.8191 | Fold #5: 0.8145 | Fold #6: 0.8217 | OOF score: 0.8179

CPU times: user 6.43 s, sys: 3.35 s, total: 9.78 s
Wall time: 17.9 s


**No significant gain after including original data.** (A lot of rows were removed from both train and original to eliminate conflicting duplicates. Other options can be explored.)

### Folds: 7, 10, 15

In [43]:
%%time

model_name = 'logreg'
dataset = 'ext'
folds = 10
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=model, 
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8203 | Fold #1: 0.8186 | Fold #2: 0.8184 | Fold #3: 0.8125 | Fold #4: 0.8188 | Fold #5: 0.8184 | Fold #6: 0.8197 | Fold #7: 0.8156 | Fold #8: 0.8142 | Fold #9: 0.8225 | OOF score: 0.8179

CPU times: user 8.25 s, sys: 4.69 s, total: 12.9 s
Wall time: 25 s


In [44]:
%%time

model_name = 'logreg'
dataset = 'ext'
folds = 15
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=model, 
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8224 | Fold #1: 0.8172 | Fold #2: 0.8188 | Fold #3: 0.8200 | Fold #4: 0.8073 | Fold #5: 0.8189 | Fold #6: 0.8225 | Fold #7: 0.8093 | Fold #8: 0.8237 | Fold #9: 0.8214 | Fold #10: 0.8154 | Fold #11: 0.8165 | Fold #12: 0.8119 | Fold #13: 0.8241 | Fold #14: 0.8187 | OOF score: 0.8179

CPU times: user 11.8 s, sys: 6.86 s, total: 18.7 s
Wall time: 36.1 s


**No difference in score on changing number of folds for CV.**

### Class weights default vs balanced:

In [45]:
bal_classifier = LogisticRegression(
    C=0.1, 
    max_iter=1000, 
    n_jobs=-1, 
    random_state=SEED, 
    class_weight='balanced')

bal_model = make_pipeline(preprocessor, bal_classifier)

In [46]:
%%time

model_name = 'bal_logreg'
dataset = 'trn'
folds = 7
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=bal_model, 
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8193 | Fold #1: 0.8222 | Fold #2: 0.8126 | Fold #3: 0.8205 | Fold #4: 0.8196 | Fold #5: 0.8154 | Fold #6: 0.8222 | OOF score: 0.8188

CPU times: user 5.91 s, sys: 3.14 s, total: 9.05 s
Wall time: 17.8 s


In [47]:
%%time

model_name = 'bal_logreg'
dataset = 'ext'
folds = 7
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=bal_model, 
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8192 | Fold #1: 0.8220 | Fold #2: 0.8125 | Fold #3: 0.8205 | Fold #4: 0.8194 | Fold #5: 0.8152 | Fold #6: 0.8221 | OOF score: 0.8187

CPU times: user 6.54 s, sys: 3.16 s, total: 9.71 s
Wall time: 18.7 s


**Balanced class weights perform better.**

### Best config -> different seeds

In [48]:
%%time

model_name = 'bal_logreg'
dataset = 'trn'
folds = 7
seed = 2311
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=bal_model, 
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8203 | Fold #1: 0.8187 | Fold #2: 0.8189 | Fold #3: 0.8193 | Fold #4: 0.8152 | Fold #5: 0.8188 | Fold #6: 0.8208 | OOF score: 0.8188

CPU times: user 6.05 s, sys: 3.17 s, total: 9.22 s
Wall time: 18 s


In [49]:
%%time

model_name = 'bal_logreg'
dataset = 'trn'
folds = 7
seed = 152
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = custom_cv(
    features=features, 
    model=bal_model, 
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

Fold #0: 0.8196 | Fold #1: 0.8199 | Fold #2: 0.8213 | Fold #3: 0.8211 | Fold #4: 0.8171 | Fold #5: 0.8141 | Fold #6: 0.8188 | OOF score: 0.8188

CPU times: user 5.85 s, sys: 3.12 s, total: 8.97 s
Wall time: 17.5 s


**Stable across different seeds.**

In [50]:
!head 00_bal_logreg_ext_f7_s55.csv

id,Exited
165034,0.09227912938672819
165035,0.7995017654969944
165036,0.395045171646322
165037,0.43139058793584933
165038,0.7002709782720153
165039,0.19640922101008423
165040,0.24310314622415924
165041,0.4353001497204647
165042,0.7902257016124773


**Time to submit!**