# <b>Categorical Feature Encoding Challenge</b>

## 1. Import Libraries

In [2]:
import numpy as np
import pandas as pd
import os
import random

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

### 1-1. Fixed Seed

In [3]:
def seed_everything(seed: int=42):
    """Seed the random number sources this notebook relies on.

    Seeds NumPy's global generator and Python's ``random`` module, and
    writes the seed (as a string) to the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value used for every seeding call (default 42).
    """
    seed_text = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Environment values must be strings, hence the conversion above.
    os.environ['PYTHONHASHSEED'] = seed_text


seed_everything(seed=42)

## 2. Load Data
`train.csv`: 학습 데이터, 타겟: target<br>
`test.csv`: 테스트 데이터 <br>

In [4]:
# Read both competition files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Print the training frame's shape, then display its first rows as the cell output.
print(train.shape)
train.head()

(300000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [6]:
# Print the test frame's shape, then display its first rows as the cell output.
print(test.shape)
test.head()

(200000, 24)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,300000,0,0,1,T,Y,Blue,Triangle,Axolotl,Finland,...,9d117320c,3c49b42b8,2,Novice,Warm,j,P,be,5,11
1,300001,0,0,0,T,N,Red,Square,Lion,Canada,...,46ae3059c,285771075,1,Master,Lava Hot,l,A,RP,7,5
2,300002,1,0,1,F,Y,Blue,Square,Dog,China,...,b759e21f0,6f323c53f,2,Expert,Freezing,a,G,tP,1,12
3,300003,0,0,1,T,Y,Red,Star,Cat,China,...,0b6ec68ff,b5de3dcc4,1,Contributor,Lava Hot,b,Q,ke,2,3
4,300004,0,1,1,F,N,Red,Trapezoid,Dog,China,...,f91f3b1ee,967cfa9c9,3,Grandmaster,Lava Hot,l,W,qK,4,11


In [7]:
# Pull the label and the row identifiers out of the frames. `pop` returns the
# column and removes it from the frame in place, so afterwards `train` and
# `test` hold feature columns only.
target = train.pop('target')
train_id = train.pop('id')
test_id = test.pop('id')

# Both frames should now report the same number of columns.
for frame in (train, test):
    print(frame.shape)

(300000, 23)
(200000, 23)


## 3. Preprocessing

### 3-1. One Hot Encode


In [8]:
%%time

# Stack train on top of test so both are encoded against one shared set of
# dummy columns.
traintest = pd.concat([train, test], axis=0)
dummies = pd.get_dummies(
    traintest,
    columns=traintest.columns,
    drop_first=True,
    sparse=True,
)

# The first `n_train` rows came from `train`; everything after is `test`.
n_train = train.shape[0]
train_ohe, test_ohe = dummies.iloc[:n_train, :], dummies.iloc[n_train:, :]

for encoded in (train_ohe, test_ohe):
    print(encoded.shape)

(300000, 16529)
(200000, 16529)
CPU times: user 14.4 s, sys: 554 ms, total: 15 s
Wall time: 25.7 s


In [9]:
%%time

def _to_csr(encoded):
    """Return ``encoded`` as a SciPy CSR matrix.

    A pandas DataFrame is converted through its ``.sparse`` accessor
    (COO first, then CSR). Any other object is returned unchanged, which
    keeps this cell safe to re-run: on a second execution ``train_ohe`` and
    ``test_ohe`` are no longer DataFrames and have no ``.sparse`` accessor.
    """
    if isinstance(encoded, pd.DataFrame):
        return encoded.sparse.to_coo().tocsr()
    return encoded


train_ohe = _to_csr(train_ohe)
test_ohe = _to_csr(test_ohe)

CPU times: user 3.72 s, sys: 103 ms, total: 3.82 s
Wall time: 3.89 s


## 4. Train Model
- 평가지표: `area under the ROC curve (AUC)`

### 4-1. Train LogisticRegression Model

In [10]:
%%time

# Model
def _take_rows(data, positions):
    """Select the rows of ``data`` at the integer positions in ``positions``.

    Objects exposing ``.iloc`` (pandas Series/DataFrame) are indexed through
    it so the lookup is positional regardless of their index labels; anything
    else (e.g. NumPy arrays, SciPy sparse matrices) is indexed directly.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def run_cv_model(train, test, target, model_fn, params=None, eval_fn=None, label='model', n_splits=5):
    """Cross-validate ``model_fn`` with K folds and average its test predictions.

    Folds come from ``KFold(n_splits=n_splits)``; no shuffling is requested.

    Args:
        train: Training features. Must expose ``shape[0]`` and support row
            selection by an array of integer positions.
        test: Test features, passed unchanged to ``model_fn`` on every fold.
        target: Training labels, aligned row-for-row with ``train``.
        model_fn: Callable invoked as
            ``model_fn(dev_X, dev_y, val_X, val_y, test, params)`` that
            returns ``(pred_val_y, pred_test_y)``.
        params: Optional dict of model parameters. Each fold receives its own
            shallow copy. ``None`` means no parameters.
        eval_fn: Optional callable ``eval_fn(val_y, pred_val_y)`` returning a
            score for the fold. When ``None`` no scores are collected.
        label: Name used in progress messages and stored in the result.
        n_splits: Number of folds (default 5).

    Returns:
        dict with keys ``'label'``, ``'train'`` (out-of-fold predictions, one
        per training row), ``'test'`` (test predictions averaged over the
        folds) and ``'cv'`` (list of per-fold scores, empty without
        ``eval_fn``).
    """
    base_params = {} if params is None else params
    kf = KFold(n_splits=n_splits)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0]))
    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print('Started {} fold {}/{}'.format(label, i, n_splits))
        # Positional selection: fold indices are row positions, not labels.
        dev_X, val_X = _take_rows(train, dev_index), _take_rows(train, val_index)
        dev_y, val_y = _take_rows(target, dev_index), _take_rows(target, val_index)
        # Fresh copy per fold so model_fn cannot leak changes between folds.
        params2 = base_params.copy()
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print(label + ' cv score {}: {}'.format(i, cv_score))
    print('{} cv scores : {}'.format(label, cv_scores))
    # Mean/std of an empty list is nan, so only report them when scores exist.
    if cv_scores:
        print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
        print('{} cv std score : {}'.format(label, np.std(cv_scores)))
    # Average the accumulated test predictions over the folds actually run.
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label,
              'train': pred_train, 'test': pred_full_test,
              'cv': cv_scores}
    return results

def runLR(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a logistic regression and score two feature sets with it.

    Args:
        train_X: Features the model is fitted on.
        train_y: Labels the model is fitted on.
        test_X: First feature set to score.
        test_y: Labels for ``test_X``; accepted but not used here.
        test_X2: Second feature set to score.
        params: Keyword arguments forwarded to ``LogisticRegression``.

    Returns:
        Tuple ``(pred_test_y, pred_test_y2)``: column 1 of ``predict_proba``
        for ``test_X`` and for ``test_X2`` respectively.
    """
    print('Train LR')
    clf = LogisticRegression(**params)
    clf.fit(train_X, train_y)

    def second_column_proba(features):
        # Keep only column 1 of the probability matrix.
        return clf.predict_proba(features)[:, 1]

    print('Predict 1/2')
    first_scores = second_column_proba(test_X)
    print('Predict 2/2')
    second_scores = second_column_proba(test_X2)
    return first_scores, second_scores

# Keyword arguments handed to LogisticRegression on every fold.
lr_params = dict(C=0.125, max_iter=800, random_state=42, solver='liblinear')

# Cross-validate on the one-hot matrices, scoring each fold with `auc`.
results = run_cv_model(
    train_ohe, test_ohe, target,
    model_fn=runLR, params=lr_params, eval_fn=auc, label='lr',
)

Started lr fold 1/5
Train LR
Predict 1/2
Predict 2/2
lr cv score 1: 0.801360631753113
Started lr fold 2/5
Train LR
Predict 1/2
Predict 2/2
lr cv score 2: 0.800274568021601
Started lr fold 3/5
Train LR
Predict 1/2
Predict 2/2
lr cv score 3: 0.8073645523597908
Started lr fold 4/5
Train LR
Predict 1/2
Predict 2/2
lr cv score 4: 0.8033286420617307
Started lr fold 5/5
Train LR
Predict 1/2
Predict 2/2
lr cv score 5: 0.8038792753163382
lr cv scores : [0.801360631753113, 0.800274568021601, 0.8073645523597908, 0.8033286420617307, 0.8038792753163382]
lr cv mean score : 0.8032415339025146
lr cv std score : 0.002439436925828438
CPU times: user 52.8 s, sys: 38.3 s, total: 1min 31s
Wall time: 54.8 s


## 5. Submission

In [11]:
# Pair each test id with the test prediction stored in the CV results,
# then write the frame to disk without the index column.
submission_columns = {'id': test_id, 'target': results['test']}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [12]:
# Display the first rows of the submission frame as the cell output.
submission.head()

Unnamed: 0,id,target
0,300000,0.338533
1,300001,0.701896
2,300002,0.12641
3,300003,0.421605
4,300004,0.859565
