# Categorical Feature Encoding Challenge II

## xLearn version

This kernel is adapted from the [libffm_model](https://www.kaggle.com/ogrellier/libffm-model) kernel.

Thanks!


---------------------

## install xlearn

ref.  
https://www.kaggle.com/nadare/xlearn-model-cv-42-lb


In [None]:
import os
# Create the scratch directory. exist_ok=True keeps the cell re-runnable:
# plain os.mkdir raises FileExistsError once the directory is already there.
os.makedirs("./working", exist_ok=True)
# NOTE(review): USER is set before the install, presumably because the
# xlearn build reads it; confirm before removing.
os.environ['USER'] = 'root'
# Install xlearn from the source tree shipped under ../input.
os.system('pip install ../input/xlearn/xlearn/xlearn-0.40a1/')

import xlearn as xl

-------------------

In [None]:
import numpy as np
import pandas as pd
import scipy
import gc,os
from collections import Counter
from sklearn.model_selection import KFold,StratifiedKFold,RepeatedKFold,RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
import category_encoders as ce

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 50
# Column groups of the dataset, one list per feature family.
BIN_COL = ['bin_%d' % idx for idx in range(5)]
NOM_COL = ['nom_%d' % idx for idx in range(10)]
ORD_COL = ['ord_%d' % idx for idx in range(6)]
# The nominal columns split into two halves: nom_0..nom_4 and nom_5..nom_9.
NOM_0_4 = NOM_COL[:5]
NOM_5_9 = NOM_COL[5:]
DATE_COL = ['day', 'month']
import matplotlib.pyplot as plt
import seaborn as sns


# Print the full path of every file found under the competition inputs and
# then under the libffm binaries folder (same order as two separate walks).
for root in ('/kaggle/input', '/kaggle/working/libffm-binaries'):
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            print(os.path.join(dirname, filename))

## Read the data

In [None]:
%%time

def read_csv(data_dir='../input/cat-in-the-dat-ii'):
    """Load the train and test tables and split off their ``id`` columns.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        the competition input folder, so ``read_csv()`` behaves as before.

    Returns
    -------
    tuple
        ``(train, test, train_id, test_id)``: the two frames without their
        ``id`` column, followed by the removed ``id`` Series of each frame.
    """
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    # pop() drops the column in place and returns it as a Series.
    train_id = train.pop('id')
    test_id = test.pop('id')
    return train, test, train_id, test_id

# Initial load of the raw frames and their id columns.
# NOTE(review): the same call is repeated right after `preprocessing` is
# defined below, so this first read looks redundant.
train, test, train_id, test_id = read_csv()
def preprocessing(df):
    """Add missing-value flags and derived columns to ``df`` in place.

    Adds four 0/1 columns (``bin_missing``, ``nom_missing``, ``ord_missing``,
    ``date_missing``) that are 1 when any column of the group is null in the
    row, remaps ``day`` values 3/2/1 to 5/6/7, and splits non-null ``ord_5``
    values into their first and second character (``ord_5_1``, ``ord_5_2``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the ``bin_*``, ``nom_*``, ``ord_*``, ``day`` and ``month``
        columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # NOTE(review): bin_3 is not part of the bin group, as in the original.
    indicator_groups = {
        'bin_missing': ['bin_0', 'bin_1', 'bin_2', 'bin_4'],
        'nom_missing': NOM_COL,
        'ord_missing': ORD_COL,
        'date_missing': DATE_COL,
    }
    for flag_name, cols in indicator_groups.items():
        # An explicit cast yields a well-defined integer dtype; the previous
        # replace({True: 1, False: 0}) relied on pandas downcasting the
        # replaced result, which is deprecated behaviour.
        df[flag_name] = df[cols].isnull().any(axis=1).astype('int64')

    # Remap day values 3, 2, 1 onto 5, 6, 7; all other values are unchanged.
    df['day'] = df['day'].replace({3: 5, 2: 6, 1: 7})

    # Compute the non-null mask once and reuse it for both derived columns.
    has_ord_5 = df['ord_5'].notnull()
    ord_5 = df.loc[has_ord_5, 'ord_5']
    df.loc[has_ord_5, 'ord_5_1'] = ord_5.apply(lambda x: x[0])
    df.loc[has_ord_5, 'ord_5_2'] = ord_5.apply(lambda x: x[1])

    return df

train, test, train_id, test_id = read_csv()
train = preprocessing(train)
test  = preprocessing(test)
print(f'train day unique value:{train.day.unique()}')
print(f'test  day unique value:{test.day.unique()}')

# Blank out category values that occur in only one of the two frames, so that
# train and test end up with the same set of non-null values per column.
for col in test.columns:
    # Build each value set once per column instead of up to three times.
    train_values = set(train[col].dropna().unique().tolist())
    test_values = set(test[col].dropna().unique().tolist())
    if len(train_values ^ test_values) > 0:
        train_only = list(train_values - test_values)
        test_only = list(test_values - train_values)
        print(col, '(train only)', train_only, '(test only)', test_only)
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
        train.loc[train[col].isin(train_only), col] = np.nan
        test.loc[test[col].isin(test_only), col] = np.nan

# Give test a dummy all-zero label column, like train's `target`.
test.insert(1, 'target', 0)

drop_cols = ['bin_3', 'ord_5']
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=drop_cols,  inplace=True)

In [None]:
%%time

# Label Encode to ease creation of libffm format

# Every column of train except the identifier and the label is a feature.
features = [col for col in train.columns if col not in ('id', 'target')]

def factor_encoding(train, test):
    """Integer-encode every column over train and test jointly.

    The two frames are stacked so that a value receives the same code in
    both. Codes start at 1; missing values, which ``pd.factorize`` marks
    with -1, end up as 0 after the shift.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the same set of column names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``, row-aligned with the inputs.

    Raises
    ------
    AssertionError
        If the two frames do not have the same column names.
    """
    assert sorted(train.columns) == sorted(test.columns)

    n_train = len(train)
    stacked = pd.concat([train, test], axis=0, sort=False)
    # Shift by one so the -1 sentinel for missing values becomes 0.
    encoded = pd.DataFrame(
        {name: pd.factorize(stacked[name])[0] + 1 for name in stacked},
        index=stacked.index,
    )
    return encoded.iloc[:n_train], encoded.iloc[n_train:]

train_f, test_f = factor_encoding(train[features], test[features])

# Print a labelled separator before each preview. The original passed the
# DataFrame itself to print(), which dumped the whole frame where a header
# label was evidently intended.
print('train_f', '-'*20)
print(train_f.head(10))

print('test_f', '-'*20)
print(test_f.head(10))

class LibFFMEncoder(object):
    """Stateful converter from label-encoded rows to libffm text lines.

    Feature indices are handed out incrementally, one per distinct
    (column position, value) pair, in order of first appearance. The same
    instance must be reused for all rows that should share an index space.
    """

    def __init__(self):
        # Next unused feature index; numbering starts at 1.
        self.encoder = 1
        # Maps (0-based column position, value) -> assigned feature index.
        self.encoding = {}

    def encode_for_libffm(self, row):
        """Return ``row`` formatted as ``label field:index:1 ...``.

        ``row[0]`` is written as the label. Each following element becomes
        one ``field:index:1`` token, where ``field`` is its 1-based position
        among the features and ``index`` is the feature index of the
        (position, value) pair, allocated on first sight.
        """
        pieces = [f"{row[0]}"]
        for position, value in enumerate(row[1:]):
            key = (position, value)
            if key not in self.encoding:
                # First occurrence of this pair: allocate a new index.
                self.encoding[key] = self.encoder
                self.encoder += 1
            pieces.append(f' {position+1}:{self.encoding[key]}:1')
        return ''.join(pieces)


SPLITS = 5

# Create files for testing and OOF
from sklearn.model_selection import KFold
# shuffle and random_state are passed by keyword: current scikit-learn
# releases reject them as positional arguments (KFold(SPLITS, True, 1) raises
# TypeError there). The keyword form also works on older releases.
fold_ids = [
    [trn_, val_]
    for (trn_, val_) in KFold(n_splits=SPLITS, shuffle=True, random_state=1).split(train)
]
for fold_, (trn_, val_) in enumerate(fold_ids):
    # One encoder per fold, shared by the training and validation parts so
    # both files use the same feature indices.
    encoder = LibFFMEncoder()
    libffm_format_trn = pd.concat([train['target'].iloc[trn_], train_f.iloc[trn_]], axis=1).apply(
        encoder.encode_for_libffm, raw=True, axis=1
    )
    libffm_format_val = pd.concat([train['target'].iloc[val_], train_f.iloc[val_]], axis=1).apply(
        encoder.encode_for_libffm, raw=True, axis=1
    )
    print(train['target'].iloc[trn_].shape, train['target'].iloc[val_].shape, libffm_format_val.shape)

    # File names are 1-based: fold 0 is written as ..._fold_1.txt.
    libffm_format_trn.to_csv(f'libffm_trn_fold_{fold_+1}.txt', index=False, header=False)
    libffm_format_val.to_csv(f'libffm_val_fold_{fold_+1}.txt', index=False, header=False)

# Create files for final model
# A new encoder is shared by the full training set and the test set.
encoder = LibFFMEncoder()
libffm_format_trn = pd.concat([train['target'], train_f], axis=1).apply(
    encoder.encode_for_libffm, raw=True, axis=1
)
libffm_format_tst = pd.concat([test['target'], test_f], axis=1).apply(
    encoder.encode_for_libffm, raw=True, axis=1
)

libffm_format_trn.to_csv('libffm_trn.txt', index=False, header=False)
libffm_format_tst.to_csv('libffm_tst.txt', index=False, header=False)

## Run OOF

In [None]:
%%time

# Out-of-fold run: for each fold, fit an FFM on the fold's training file,
# predict the fold's validation file and print the fold AUC.

# create ffm model
ffm_model = xl.create_ffm() 

# import optuna
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split

# NOTE(review): `outputs` is never appended to in the visible code.
outputs = []

# define params
# These are passed unchanged to ffm_model.fit(); the same dict is reused when
# the final model is trained on the full training set.
param = {'task':'binary', 
         'lr':0.2, 
         'k':4,
         'lambda':0.0002, 
         'metric':'auc',
         'epoch': 15
        }
        
# fold_ is 1-based here to match the *_fold_{n}.txt file names.
for fold_ in range(1, SPLITS+1):
    print(f'fold: {fold_}')
    # NOTE(review): `model` is assigned but not used below; every fold
    # writes its model to the shared 'model.output' path instead.
    model = f"libffm_fold_{fold_}_model"
    trn_fold_txt = f"libffm_trn_fold_{fold_}.txt"
    val_fold_txt = f"libffm_val_fold_{fold_}.txt"
    val_preds_fold_txt = f"val_preds_fold_{fold_}.txt"    

    # set training and validation data
    ffm_model.setTrain(trn_fold_txt)
    
    #xLearn will perform early-stopping by default.     
    ffm_model.setValidate(val_fold_txt)   
    
    print(' fitting...')
    ffm_model.fit(param, 'model.output')

    print(' make predictions...')
    # Predictions are made on the same file that was given to setValidate.
    ffm_model.setTest(val_fold_txt)
    # NOTE(review): presumably makes predict() emit sigmoid-transformed
    # scores; confirm against the xlearn documentation.
    ffm_model.setSigmoid()
    ffm_model.predict("model.output", val_preds_fold_txt)
    gc.collect()
    
    # fold_ids is 0-indexed, hence fold_-1; element [1] holds the validation
    # row positions of the fold.
    print(' auc score:',
        roc_auc_score(
            train['target'].iloc[fold_ids[fold_-1][1]], 
            pd.read_csv(val_preds_fold_txt, header=None).values[:,0]))   


In [None]:
# Run a garbage-collection pass after the cross-validation loop.
gc.collect()

## Compute OOF score

In [None]:
# Stitch the per-fold validation predictions into one out-of-fold vector
# aligned with the rows of train, then print its AUC against the labels.
oof_preds = np.zeros(len(train))
for fold_, (_, val_) in enumerate(fold_ids):
    # Prediction files are numbered from 1, fold_ids from 0.
    fold_frame = pd.read_csv(f'val_preds_fold_{fold_+1}.txt', header=None)
    oof_preds[val_] = fold_frame.iloc[:, 0].to_numpy()
print(roc_auc_score(train['target'], oof_preds))

In [None]:
# Plot the distribution of the out-of-fold predictions.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in
# favour of histplot/displot; confirm the installed version before upgrading.
sns.distplot(pd.Series(oof_preds))

## Train an xlearn model

In [None]:
%%time

# # define params
# param = {'task':'binary', 
#          'lr':0.2, 
#          'k':4,
#          'lambda':0.0002, 
#          'metric':'auc',
#          'epoch': 15,
#         }

# Fit the final model on the full training file with the `param` dict defined
# for the OOF run. No validation file is set here, and 'model.output' (also
# used by the fold models) is overwritten.
ffm_model = xl.create_ffm() 
ffm_model.setTrain("libffm_trn.txt")
ffm_model.fit(param, 'model.output')

In [None]:
%%time

# Score the test file with the model stored in 'model.output' and write the
# predictions to 'tst_preds.txt'.
ffm_model.setTest('libffm_tst.txt')
# NOTE(review): presumably makes predict() emit sigmoid-transformed scores;
# confirm against the xlearn documentation.
ffm_model.setSigmoid()
ffm_model.predict('model.output', 'tst_preds.txt')

In [None]:
# Run a garbage-collection pass after training and prediction.
gc.collect()

## Predict for test set


In [None]:
# Prepare submission

# Pair the saved test ids with the first column of the prediction file and
# write the result as the submission CSV.
test_preds = pd.read_csv('tst_preds.txt', header=None).values[:,0]
submission = pd.DataFrame({'id': test_id, 'target': test_preds})
submission.to_csv('xlearn_prediction.csv', index=False)

In [None]:
# Display the (rows, columns) shape of the processed test frame.
test.shape

In [None]:
# Display the first 50 rows of the submission.
submission[:50]

In [None]:
# Summary statistics of the predicted target values.
submission.target.describe()

In [None]:
# Plot the distribution of the test-set predictions.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in
# favour of histplot/displot; confirm the installed version before upgrading.
sns.distplot(submission.target)