In [1]:
!pip install -U autowoe

Collecting autowoe
  Downloading AutoWoE-1.3.2-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.7/215.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting StrEnum<0.5.0,>=0.4.7
  Downloading StrEnum-0.4.10-py3-none-any.whl (7.7 kB)
Collecting sphinx
  Downloading sphinx-5.3.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting imagesize>=1.3
  Downloading imagesize-1.4.1-py2.py3-none-any.whl (8.8 kB)
Collecting sphinxcontrib-serializinghtml>=1.1.5
  Downloading sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.0/94.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinxcontrib-jsmath
  Downloading sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl (5.1 kB)
Collecting sphinxcontrib-devhelp
  Downloading sphinxcontrib_devhelp-1.0.2-p

# Imports 

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

from autowoe import AutoWoE, ReportDeco

# Data loading

In [3]:
# Directory that holds the competition CSV files read by this notebook.
INPUT_PATH = '../input/tabular-playground-series-apr-2021/'

train_csv_path = INPUT_PATH + 'train.csv'
train_data = pd.read_csv(train_csv_path)
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.00,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.00,0,0,427635,7.76,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,1,2,"Bell, Adele",female,62.00,0,0,PC 15008,14.86,D17243,C
99996,99996,0,2,"Brown, Herman",male,66.00,0,0,13273,11.15,,S
99997,99997,0,3,"Childress, Charles",male,37.00,0,0,,9.95,,S
99998,99998,0,3,"Caughlin, Thomas",male,51.00,0,1,458654,30.92,,S


In [4]:
# Test set: read from the same input directory as the training file.
test_csv_path = INPUT_PATH + 'test.csv'
test_data = pd.read_csv(test_csv_path)
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C
...,...,...,...,...,...,...,...,...,...,...,...
99995,199995,3,"Cash, Cheryle",female,27.0,0,0,7686,10.12,,Q
99996,199996,1,"Brown, Howard",male,59.0,1,0,13004,68.31,,S
99997,199997,3,"Lightfoot, Cameron",male,47.0,0,0,4383317,10.87,,S
99998,199998,1,"Jacobsen, Margaret",female,49.0,1,2,PC 26988,29.68,B20828,C


In [5]:
# Sample submission: its 'Survived' column is overwritten with predictions later on.
submission_csv_path = INPUT_PATH + 'sample_submission.csv'
submission = pd.read_csv(submission_csv_path)
submission

Unnamed: 0,PassengerId,Survived
0,100000,1
1,100001,1
2,100002,1
3,100003,1
4,100004,1
...,...,...
99995,199995,1
99996,199996,1
99997,199997,1
99998,199998,1


In [6]:
# Mean of the 'Survived' target over the training rows.
target_mean = train_data['Survived'].mean()
print('TRAIN TARGET MEAN = {:.3f}'.format(target_mean))

TRAIN TARGET MEAN = 0.428


# Extra features creation

In [7]:
def create_extra_features(data):
    """Return a copy of ``data`` with the engineered feature columns added.

    The input frame is never modified: all work happens on a copy, so the
    function can be called repeatedly on the same object.

    Expects the columns ``PassengerId``, ``Name``, ``Cabin``, ``Ticket``,
    ``SibSp`` and ``Parch``.

    Transformations applied to the copy:
      * ``Cabin``      -> first character of its string form, stripped
                          (a missing value becomes ``'n'``, from ``'nan'``).
      * ``Ticket``     -> first whitespace-separated token when the ticket has
                          at least two tokens, otherwise NaN.
      * ``FamilySize`` -> ``SibSp + Parch + 1``.
      * ``FirstName``  -> text before the first comma of ``Name``.
      * ``Counter_Name`` / ``Counter_FirstName`` / ``Counter_Surname`` ->
        per-value count of non-null ``PassengerId`` within the frame.
      * ``Name`` and the temporary ``Surname`` column are dropped.

    NOTE(review): the rows displayed in this notebook look like
    "Family, Given", so ``FirstName`` appears to hold the family name and
    ``Surname`` the given name. The column names are kept unchanged because
    ``FirstName`` is used as a model feature downstream.

    Returns:
        A new DataFrame; ``data`` itself is left untouched.
    """
    def ticket_prefix(value):
        # Only multi-token tickets (e.g. "CA 457703") carry a prefix.
        tokens = str(value).split()
        return tokens[0] if len(tokens) > 1 else np.nan

    def name_part(value, position):
        # Safe lookup: a name without a comma yields '' instead of raising IndexError.
        parts = str(value).split(',')
        return parts[position] if position < len(parts) else ''

    # Work on a copy so the caller's frame is not mutated.
    data = data.copy()

    # [:1] instead of [0] so an empty string does not raise.
    data['Cabin'] = data['Cabin'].map(lambda x: str(x)[:1].strip())
    data['Ticket'] = data['Ticket'].map(ticket_prefix)

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    data['FirstName'] = data['Name'].map(lambda x: name_part(x, 0))
    data['Surname'] = data['Name'].map(lambda x: name_part(x, 1))

    for col in ['Name', 'FirstName', 'Surname']:
        counts = data.groupby(col)['PassengerId'].count().to_dict()
        data['Counter_' + col] = data[col].map(counts)

    return data.drop(columns=['Name', 'Surname'])


# Engineer features on train and test stacked together, so the Counter_*
# columns are computed over both sets, then split back by row position.
n_train_rows = len(train_data)
stacked = pd.concat([train_data, test_data]).reset_index(drop=True)
all_df = create_extra_features(stacked)
train_data = all_df.iloc[:n_train_rows]
test_data = all_df.iloc[n_train_rows:]
print(train_data.shape, test_data.shape)

(100000, 16) (100000, 16)


In [8]:
# Preview the first rows of the training part after feature engineering.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FirstName,Counter_Name,Counter_FirstName,Counter_Surname
0,0,1.0,1,male,,2,0,,27.14,C,S,3,Oconnor,1,47,51
1,1,0.0,3,male,,0,0,,13.35,n,S,1,Bryan,1,54,31
2,2,0.0,3,male,0.33,1,2,CA,71.29,n,S,4,Owens,2,165,1128
3,3,0.0,3,male,19.0,0,0,A.,13.04,n,S,1,Kramer,2,53,4736
4,4,1.0,3,male,25.0,0,0,,7.76,n,S,1,Bond,1,39,3662


# Splitting data

In [9]:
# Hold out 20% of the training rows for validation, stratified on the target
# and seeded so the split is repeatable.
tr_data, val_data = train_test_split(
    train_data,
    test_size=0.2,
    stratify=train_data['Survived'],
    random_state=13,
)
print(tr_data.shape, val_data.shape)

(80000, 16) (20000, 16)


# Set up an interpretable AutoWoE model

Here we set up the model with the `ReportDeco` decorator, which lets us build an automatic report (see the Bonus 1 section).

In [10]:
# Constructor arguments for AutoWoE, gathered in one place.
autowoe_params = {
    'monotonic': False,
    'vif_th': 20.,
    'imp_th': 0,
    'th_const': 32,
    'force_single_split': True,
    'min_bin_size': 0.005,
    'oof_woe': True,
    'n_folds': 10,
    'n_jobs': 4,
    'regularized_refit': True,
    'verbose': 2,
}

# Wrap the model in ReportDeco so generate_report() can be called on it after fitting.
auto_woe = ReportDeco(AutoWoE(**autowoe_params))

# Model training

In [11]:
%%time
auto_woe.fit(tr_data, 
             target_name="Survived")

 features [] contain too many nans or identical values
[LightGBM] [Info] Number of positive: 27368, number of negative: 36632
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5955
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.427625 -> initscore=-0.291548
[LightGBM] [Info] Start training from score -0.291548
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[25]	val_set's auc: 0.851732
 features ['Counter_Name'] have low importance
PassengerId processing...
Pclass processing...
Sex processing...
Age processing...
SibSp processing...
Parch processing...
Ticket processing...
Fare processing...
Cabin processing...
Embarked processing...
FamilySize processing...
FirstName processing...
Counter_FirstName processing...
Counter_Surname processing...
di

In [12]:
# Score the held-out split and threshold the probabilities at 0.5.
val_pred = auto_woe.predict_proba(val_data)
val_labels = (val_pred > 0.5).astype(int)
val_accuracy = accuracy_score(val_data['Survived'], val_labels)
print("ACC_SCORE = {:.5f}".format(val_accuracy))

ACC_SCORE = 0.77605


# Bonus 1 - Automatic report generation for trained model

In [13]:
# Metadata passed to the report generator.
report_params = dict(
    output_path="./AUTOWOE_REPORT_Validation",
    report_name="AutoWoE automatic report for Syntanic dataset model",
    report_version_id=1,
    city="Moscow",
    model_aim="Here we want to build a model to solve TPS April 2021 competition",
    model_name="Syntanic_AutoWoE_model",
    # "zakazchik" is a transliterated Russian key: the group that asked for the model.
    zakazchik="Kaggle",
    high_level_department="Google",
    ds_name="Alexander Ryzhkov",
    target_descr="Human survived in Titanic disaster",
    non_target_descr="(Sad news) Human not survived in Titanic disaster",
)

auto_woe.generate_report(report_params)

Successfully wrote ./AUTOWOE_REPORT_Validation/autowoe_report.html.


#### The generated report is [here](./AUTOWOE_REPORT_Validation/autowoe_report.html). P.S. It is interactive: to expand a subtree, click the black triangle to the left of the text.

# Bonus 2 - Automatic SQL inference query generation for trained model

As our model is interpretable, we can generate an SQL query for it automatically. With this query you can obtain model predictions inside the database, without Python at all.

All you need to do is set `table_name` to the table that holds the initial data.

In [14]:
# Build the SQL text for the fitted model, then print it.
sql_query = auto_woe.get_sql_inference_query(table_name='TABLE_NAME')
print(sql_query)

SELECT
  1 / (1 + EXP(-(
    -0.281
    -0.838*WOE_TAB.Sex
    -0.489*WOE_TAB.Embarked
    -0.189*WOE_TAB.Fare
    -0.422*WOE_TAB.Pclass
    -0.407*WOE_TAB.Cabin
    -0.334*WOE_TAB.Ticket
    -0.46*WOE_TAB.Parch
  ))) as PROB,
  WOE_TAB.*
FROM 
    (SELECT
    CASE
      WHEN Sex == 'female' THEN -1.191
      WHEN Sex == 'male' THEN 1.055
      ELSE 0
    END AS Sex,
    CASE
      WHEN Embarked == 'C' THEN -1.386
      WHEN Embarked == 'Q' THEN -0.7
      WHEN Embarked == 'S' THEN 0.482
      ELSE 0
    END AS Embarked,
    CASE
      WHEN (Fare IS NULL OR Fare = 'NaN') THEN 0
      WHEN Fare <= 12.635 THEN 0.732
      WHEN Fare <= 28.175 THEN -0.127
      WHEN Fare <= 48.595 THEN -0.56
      WHEN Fare <= 111.41 THEN -0.196
      ELSE -1.17
    END AS Fare,
    CASE
      WHEN Pclass == 1 THEN -0.609
      WHEN Pclass == 2 THEN -0.397
      WHEN Pclass == 3 THEN 0.82
      ELSE 0
    END AS Pclass,
    CASE
      WHEN Cabin == 'A' THEN 0.66
      WHEN Cabin IN ('B', 'E') THEN -0.821
 

# Train two separate models, one per Sex, on the full training set

In [15]:
def fit_autowoe(data):
    """Create an AutoWoE model with fixed settings, fit it on ``data``, and return it.

    ``data`` must contain the ``Survived`` column, which is used as the target.
    """
    settings = dict(
        monotonic=False,
        vif_th=20.,
        imp_th=0,
        th_const=32,
        force_single_split=True,
        min_bin_size=0.01,
        oof_woe=True,
        n_folds=10,
        n_jobs=4,
        regularized_refit=True,
        verbose=2,
    )
    model = AutoWoE(**settings)
    model.fit(data, target_name="Survived")
    return model

In [16]:
# One model per Sex value, each fitted on the matching subset of the training rows.
male_rows = train_data['Sex'] == 'male'
female_rows = train_data['Sex'] == 'female'

male_model = fit_autowoe(train_data[male_rows])
print('=' * 50)
female_model = fit_autowoe(train_data[female_rows])

 features ['Sex'] contain too many nans or identical values
[LightGBM] [Info] Number of positive: 9226, number of negative: 35665
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4426
[LightGBM] [Info] Number of data points in the train set: 44891, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.205520 -> initscore=-1.352144
[LightGBM] [Info] Start training from score -1.352144
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[21]	val_set's auc: 0.756667
 features [] have low importance
PassengerId processing...
Pclass processing...
Age processing...
SibSp processing...
Parch processing...
Ticket processing...
Fare processing...
Cabin processing...
Embarked processing...
FamilySize processing...
FirstName processing...
Counter_Name processing...
Counter_FirstName processing...
Counter_Surname processing...
dic

In [17]:
# Score every test row with both models; the matching one is picked per row afterwards.
male_pred, female_pred = (
    model.predict_proba(test_data) for model in (male_model, female_model)
)

In [18]:
# Take the male model's output for male rows and the female model's output otherwise.
test_is_male = test_data['Sex'] == 'male'
preds = np.where(test_is_male, male_pred, female_pred)

In [19]:
# Display the combined per-row predictions.
preds

array([0.12912243, 0.56385311, 0.87204776, ..., 0.09941357, 0.79255767,
       0.91256468])

# Create submissions

In [20]:
# Turn probabilities into 0/1 labels with a 0.5 cut-off and write the submission file.
survived_flags = (preds > 0.5).astype(int)
submission['Survived'] = survived_flags
submission.to_csv('AutoWoE_submission.csv', index=False)

In [21]:
# Share of test rows labelled 1 in the submission.
submission['Survived'].mean()

0.32391