Reference:  
https://www.kaggle.com/peterhurford/why-not-logistic-regression  
https://www.kaggle.com/superant/oh-my-cat  

I have just entered this competition and read the kernels above. This notebook is adapted from those references. Their approach, simple feature engineering followed by logistic regression, is elegant. I will keep exploring more advanced approaches.

In [None]:
%%time

import pandas as pd
import numpy as np

# Read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/cat-in-the-dat/train.csv',
                     '../input/cat-in-the-dat/test.csv')
)

# Report (rows, columns) of each frame, one per line
print(train.shape, test.shape, sep='\n')

In [None]:
%%time

# Pull the label and the row ids out of the frames so that only feature
# columns remain; ``pop`` returns the column and removes it in place.
target = train.pop('target').values
train_id = train.pop('id').tolist()
test_id = test.pop('id').tolist()

print(train.shape, test.shape, sep='\n')

Some simple preprocessing and feature engineering

In [None]:
def reduce_dim(df, column):
    """Collapse one-off categories of ``column`` and re-encode it as integers.

    Every value that occurs exactly once in ``df[column]`` is replaced by the
    shared placeholder ``-1``. The resulting values are then mapped to
    consecutive integer codes ``0, 1, 2, ...`` in order of first appearance.

    Args:
        df: DataFrame to modify in place.
        column: Label of the column to encode.

    Returns:
        The same ``df`` object, with ``column`` replaced by its integer codes.
    """
    values = df[column]

    # Build a positional (NumPy) mask of the values seen only once. Working by
    # position instead of by index label keeps the result correct when the
    # index holds duplicate labels, where a label-based write would also
    # overwrite unrelated rows that happen to share the label.
    seen_once = (~values.duplicated(keep=False)).to_numpy()
    values = values.mask(seen_once, -1)

    # Re-index: code each distinct value by its order of first appearance.
    codes = {value: code for code, value in enumerate(values.unique())}
    df[column] = values.map(codes)
    return df


def data_treatment(df):
    """Encode binary columns and add simple engineered features, in place.

    * Every column whose name starts with ``'bin'`` is mapped to ``int8``
      codes 0/1. The two levels are sorted first, so the encoding is
      deterministic (e.g. ``0 -> 0, 1 -> 1`` and ``'F' -> 0, 'T' -> 1``)
      instead of depending on which value happens to appear first.
    * ``ord_5`` is truncated to its first character.
    * ``isweekend`` is added as an ``int8`` flag that is 1 where ``day >= 5``.

    Args:
        df: DataFrame with string column labels that contains at least the
            ``ord_5`` and ``day`` columns.

    Returns:
        The same ``df`` object after modification.

    Raises:
        ValueError: If a ``bin*`` column has more than two distinct values,
            or contains missing values (the ``int8`` cast cannot hold NaN).
    """
    for col in list(df.columns):
        if not col.startswith('bin'):
            continue
        levels = sorted(df[col].dropna().unique())
        if len(levels) > 2:
            raise ValueError(
                f'column {col!r} is not binary: found {len(levels)} distinct values'
            )
        # A constant column has a single level, which simply becomes 0.
        encoding = {level: code for code, level in enumerate(levels)}
        df[col] = df[col].map(encoding).astype('int8')

    df['ord_5'] = df['ord_5'].str[0]
    df['isweekend'] = (df['day'] >= 5).astype('int8')

    return df

In [None]:
%%time

# Preprocessing
# Stack train on top of test so both receive identical encodings.
# ``ignore_index=True`` gives every row a unique label; without it the labels
# of the two frames overlap and any label-based write hits rows in both.
whole = pd.concat([train, test], ignore_index=True)
whole = data_treatment(whole)

# Treat everything between the first five columns and the last one as
# categorical. The last column is the ``isweekend`` flag appended by
# data_treatment; the first five are presumably the ``bin_*`` features
# (TODO confirm against the CSV header).
cat_cols = whole.columns[5:-1]
# Keep the remaining columns in frame order; a set difference would yield a
# column order that can change from one run to the next.
non_cat_cols = [col for col in whole.columns if col not in cat_cols]
for category in cat_cols:
    whole = reduce_dim(whole, category)

Rather than using pd.get_dummies(), sklearn's OneHotEncoder can be much faster because it produces sparse output.

In [None]:
%%time

# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

# Fit the encoder on the combined frame so train and test share one set of
# one-hot columns, then append the non-categorical columns and convert to CSR
# so the matrix can be sliced by row.
enc = OneHotEncoder(handle_unknown='ignore').fit(whole[cat_cols])
whole_ohe = hstack((enc.transform(whole[cat_cols]), whole[non_cat_cols])).tocsr()

# The combined frame was built train-first, so split at the train row count.
train_ohe = whole_ohe[:len(train)]
test_ohe = whole_ohe[len(train):]

print(train_ohe.shape, test_ohe.shape, sep='\n')

In [None]:
%%time

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression

# Model
def run_cv_model(train, test, target, model_fn, params=None, eval_fn=None,
                 model_type='logreg', n_splits=5):
    """Run stratified K-fold cross-validation and average the test predictions.

    For each fold the model is trained through ``model_fn``; its validation
    predictions fill the out-of-fold vector and its test predictions are
    averaged over all folds.

    Args:
        train: Training feature matrix; must support row selection with an
            integer index array (``train[idx]``).
        test: Test feature matrix, passed unchanged to ``model_fn`` each fold.
        target: Array of labels aligned with the rows of ``train``.
        model_fn: Callable ``(trn_X, trn_y, val_X, val_y, test, params)``
            returning ``(pred_trn, pred_val, pred_test)``.
        params: Settings forwarded to ``model_fn``. ``None`` means an empty
            dict (a fresh one per call, never a shared mutable default).
        eval_fn: Optional metric ``(y_true, y_pred) -> score``. When ``None``
            no scores are computed or printed and the score lists stay empty.
        model_type: Label stored under ``'model_type'`` in the result.
        n_splits: Number of cross-validation folds.

    Returns:
        dict with keys ``'model_type'``, ``'pred_oof'`` (out-of-fold
        predictions for ``train``), ``'pred_test'`` (fold-averaged test
        predictions), ``'trn_scores'`` and ``'cv_scores'`` (per-fold scores).
    """
    if params is None:
        params = {}

    kf = StratifiedKFold(n_splits=n_splits)
    fold_splits = kf.split(train, target)

    trn_scores = []
    cv_scores = []
    pred_oof = np.zeros((train.shape[0]))
    pred_full_test = 0

    for i, (dev_index, val_index) in enumerate(fold_splits, 1):
        print(f'Start fold {i}/{n_splits}')
        trn_X, val_X = train[dev_index], train[val_index]
        trn_y, val_y = target[dev_index], target[val_index]
        pred_trn, pred_val, pred_test = model_fn(trn_X, trn_y, val_X, val_y, test, params)
        pred_oof[val_index] = pred_val
        # Each fold contributes an equal share to the final test prediction.
        pred_full_test += pred_test / n_splits
        if eval_fn is not None:
            trn_sc = eval_fn(trn_y, pred_trn)
            cv_sc = eval_fn(val_y, pred_val)
            trn_scores.append(trn_sc)
            cv_scores.append(cv_sc)
            print(f'trn score {i}: {trn_sc}')
            print(f'cv score {i}: {cv_sc}')
            print()

    # The summary needs the metric; skip it instead of calling ``None``.
    if eval_fn is not None:
        print(f'trn scores : {trn_scores}')
        print(f'trn mean score : {np.mean(trn_scores)}')
        print(f'trn std score : {np.std(trn_scores)}')
        print()

        print(f'oof cv scores : {eval_fn(target, pred_oof)}')
        print(f'cv scores : {cv_scores}')
        print(f'cv mean score : {np.mean(cv_scores)}')
        print(f'cv std score : {np.std(cv_scores)}')
        print()

    results = {'model_type': model_type,
               'pred_oof': pred_oof, 'pred_test': pred_full_test,
               'trn_scores': trn_scores, 'cv_scores': cv_scores}

    return results


def runLR(train_X, train_y, val_X, val_y, test_X, params):
    """Fit a logistic regression and predict on train, validation and test.

    Args:
        train_X: Features used to fit the model.
        train_y: Labels used to fit the model.
        val_X: Validation features.
        val_y: Validation labels; accepted for signature compatibility but
            not used here.
        test_X: Test features.
        params: Keyword arguments unpacked into ``LogisticRegression``.

    Returns:
        Tuple ``(pred_trn, pred_val, pred_test)`` holding column 1 of
        ``predict_proba`` for each of the three feature sets.
    """
    print('Training Logistic Regression...')
    classifier = LogisticRegression(**params).fit(train_X, train_y)

    predictions = []
    for step, features in enumerate((train_X, val_X, test_X), start=1):
        print(f'Predicting {step}/3...')
        predictions.append(classifier.predict_proba(features)[:, 1])

    return tuple(predictions)


# Logistic-regression settings; run_cv_model hands them to runLR, which
# unpacks them into the LogisticRegression constructor.
lr_params = dict(solver='lbfgs', C=0.1, max_iter=1000)
results = run_cv_model(
    train_ohe,
    test_ohe,
    target,
    model_fn=runLR,
    params=lr_params,
    eval_fn=auc,
)

In [None]:
# Make submission: pair each test id with its fold-averaged prediction and
# write the table to disk without the DataFrame index.
submission = pd.DataFrame({'id': test_id})
submission['target'] = results['pred_test']
submission.to_csv('submission.csv', index=False)