In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os

# Exploratory Data Analysis

In [22]:
# Loading data directly from CatBoost
from catboost.datasets import amazon

# amazon() is unpacked into the train and test frames that every later cell reads.
train, test = amazon()

In [23]:
# Report the (rows, columns) shape of both splits.
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

Train shape: (32769, 10), Test shape: (58921, 10)


In [24]:
# Preview the first five training rows (last expression of the cell, so it is rendered).
train.head(5)

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [25]:
# Preview the first five test rows (last expression of the cell, so it is rendered).
test.head(5)

Unnamed: 0,id,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,78766,72734,118079,118080,117878,117879,118177,19721,117880
1,2,40644,4378,117961,118327,118507,118863,122008,118398,118865
2,3,75443,2395,117961,118300,119488,118172,301534,249618,118175
3,4,43219,19986,117961,118225,118403,120773,136187,118960,120774
4,5,42093,50015,117961,118343,119598,118422,300136,118424,118425


The dataset has 9 feature columns, plus the target (`ACTION`) for train and `id` for test. 
All of these columns are categorical values encoded as integers.

# Feature Engineering

In [26]:
# Number of distinct values per column; dropna=False counts NaN as a value,
# exactly like len(x.unique()) does.
train.nunique(dropna=False)

ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64

`RESOURCE`,`MGR_ID` and `ROLE_FAMILY_DESC`. 
These 3 columns are high-cardinality categorical features.

`ROLE_CODE` and `ROLE_TITLE`. 
These 2 columns have exactly the same amount of unique values.

In [27]:
import itertools
# Label column name and the candidate feature columns (every train column except the label).
target = "ACTION"
col4train = [x for x in train.columns if x!=target]

# Check whether ROLE_CODE and ROLE_TITLE carry the same information: if the number
# of distinct (col1, col2) pairs equals the number of distinct col1 values, each
# col1 value co-occurs with exactly one col2 value.
col1 = 'ROLE_CODE'
col2 = 'ROLE_TITLE'

pair = len(train.groupby([col1,col2]).size())
single = len(train.groupby([col1]).size())

print(col1, col2, pair, single)

ROLE_CODE ROLE_TITLE 343 343


These 2 columns have a 1:1 relationship, so one of them is redundant.
Remove `ROLE_TITLE`.


In [28]:
# Drop the redundant ROLE_TITLE from the feature list and extract the label values.
col4train = [x for x in col4train if x!='ROLE_TITLE']
y = train[target].values

# Encoding

# Unsupervised categorical encodings

Label Encoding, SVD Encoding, Frequency encoding

functions

In [29]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def get_model():
    """Build the classifier used to score every encoding in this notebook.

    Returns:
        An unfitted ExtraTreesClassifier with 300 trees, 3 parallel jobs and a
        fixed random_state, so different encodings are compared on equal terms.
    """
    return ExtraTreesClassifier(n_estimators=300, n_jobs=3, random_state=5436)

def validate_model(model, data):
    """Cross-validate ``model`` with ROC AUC and summarise the per-fold results.

    Args:
        model: estimator handed to ``cross_validate``.
        data: indexable pair; ``data[0]`` is the feature matrix, ``data[1]`` the labels.

    Returns:
        The transposed ``describe()`` table of the cross-validation results
        (one row per reported quantity, e.g. fit_time, test_score, train_score).
    """
    features, labels = data[0], data[1]
    # Fixed seed: every encoding is evaluated on exactly the same 5 stratified folds.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4141)
    cv_results = cross_validate(
        model,
        features,
        labels,
        groups=None,
        scoring='roc_auc',
        cv=folds,
        n_jobs=None,
        return_train_score=True,
    )
    return pd.DataFrame(cv_results).describe().transpose()

def transform_dataset(train, test, func, func_params = None):
    """Apply ``func`` to train and test stacked together, then split the result back.

    Stacking guarantees that the same category receives the same encoding in
    both splits.

    Args:
        train: DataFrame with the training rows.
        test: DataFrame with the test rows (same columns as ``train``).
        func: callable invoked as ``func(stacked, **func_params)``; it must return
            an object with one entry per stacked row, in the same row order.
        func_params: optional dict of keyword arguments for ``func``.
            ``None`` (the default) means "no extra arguments"; a ``None`` default
            is used instead of ``{}`` to avoid a shared mutable default.

    Returns:
        ``(new_train, new_test)``. When ``func`` returns a DataFrame both parts
        are DataFrames with a fresh 0..n-1 index; otherwise they are plain
        slices of whatever ``func`` returned.
    """
    kwargs = {} if func_params is None else func_params
    n_train = train.shape[0]

    dataset = pd.concat([train, test], ignore_index = True)
    dataset = func(dataset, **kwargs)

    if isinstance(dataset, pd.DataFrame):
        new_train = dataset.iloc[:n_train,:].reset_index(drop = True)
        new_test =  dataset.iloc[n_train:,:].reset_index(drop = True)
    else:
        # Array-like result (e.g. numpy array): positional slicing is enough.
        new_train = dataset[:n_train]
        new_test =  dataset[n_train:]
    return new_train, new_test

> ## 1. Label Encoding

In [30]:
#for each column in dataset creates N column with random integers
def assign_rnd_integer(dataset, number_of_times = 5, seed = 23):
    """Label-encode every column ``number_of_times`` times with random labels.

    Each distinct value of a column is mapped to an integer in
    ``0 .. n_unique-1``; the assignment is a random permutation, and a fresh
    permutation is drawn for every repetition.

    Args:
        dataset: DataFrame whose columns are categorical (column names must be
            strings, because new names are built by string concatenation).
        number_of_times: how many independent random labellings to create per column.
        seed: seed of the permutation stream.

    Returns:
        DataFrame with ``number_of_times`` columns per input column, named
        ``<column>_<i>``, and a fresh 0..n-1 index.
    """
    # A private RandomState seeded with `seed` produces the same permutations as
    # np.random.seed(seed) + np.random.shuffle, but leaves the global NumPy RNG
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    encoded = {}
    for c in dataset.columns:
        # The distinct values do not change between repetitions: compute them once.
        unique_vals = dataset[c].unique()
        for i in range(number_of_times):
            col_name = c+"_"+str(i)
            labels = np.arange(len(unique_vals))
            rng.shuffle(labels)
            mapping = pd.DataFrame({c: unique_vals, col_name: labels})
            # A left merge keeps the row order of `dataset`.
            encoded[col_name] = (dataset[[c]]
                                 .merge(mapping, on = c, how = 'left')[col_name]
                                ).values
    # Build the frame once instead of inserting columns one at a time.
    return pd.DataFrame(encoded)

In [31]:
# Label-encode train and test together, drawing 5 random labellings per column.
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    assign_rnd_integer, {"number_of_times":5}
)

print(new_train.shape, new_test.shape)
new_train.head(5)

(32769, 40) (58921, 40)


Unnamed: 0,RESOURCE_0,RESOURCE_1,RESOURCE_2,RESOURCE_3,RESOURCE_4,MGR_ID_0,MGR_ID_1,MGR_ID_2,MGR_ID_3,MGR_ID_4,ROLE_ROLLUP_1_0,ROLE_ROLLUP_1_1,ROLE_ROLLUP_1_2,ROLE_ROLLUP_1_3,ROLE_ROLLUP_1_4,ROLE_ROLLUP_2_0,ROLE_ROLLUP_2_1,ROLE_ROLLUP_2_2,ROLE_ROLLUP_2_3,ROLE_ROLLUP_2_4,ROLE_DEPTNAME_0,ROLE_DEPTNAME_1,ROLE_DEPTNAME_2,ROLE_DEPTNAME_3,ROLE_DEPTNAME_4,ROLE_FAMILY_DESC_0,ROLE_FAMILY_DESC_1,ROLE_FAMILY_DESC_2,ROLE_FAMILY_DESC_3,ROLE_FAMILY_DESC_4,ROLE_FAMILY_0,ROLE_FAMILY_1,ROLE_FAMILY_2,ROLE_FAMILY_3,ROLE_FAMILY_4,ROLE_CODE_0,ROLE_CODE_1,ROLE_CODE_2,ROLE_CODE_3,ROLE_CODE_4
0,3686,1025,4574,5572,7195,1527,461,2267,4280,559,95,79,124,91,117,59,118,176,47,58,318,232,397,43,100,103,2263,612,1645,2642,51,8,14,9,1,185,327,233,193,59
1,2594,6922,6168,5518,305,4513,3008,3296,1000,820,95,79,124,91,117,112,157,17,79,138,23,95,414,232,222,738,359,2242,2830,2164,15,5,38,41,27,295,153,302,201,243
2,166,5602,4741,515,3549,2769,2600,3466,4376,2172,19,128,128,44,69,87,78,70,151,73,48,422,375,394,315,2038,1096,341,1561,947,62,19,53,17,36,198,232,270,64,244
3,5539,4799,4222,3469,5186,1660,1171,3388,2416,1887,95,79,124,91,117,112,157,17,79,138,432,280,363,140,475,1336,491,1805,2395,1685,51,8,14,9,1,303,190,312,280,64
4,580,599,3384,4089,6315,2627,4363,1755,3830,1454,15,116,126,60,29,163,131,34,164,133,117,239,247,314,360,1499,2059,215,2931,1530,27,55,47,28,62,244,31,293,163,119


In [32]:
# Cross-validated ROC AUC of the baseline model on the 5x label-encoded features.
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,10.150363,3.221607,8.345974,8.580536,8.829827,9.104312,15.891164
score_time,5.0,0.35221,0.062833,0.304745,0.305112,0.311137,0.405273,0.434783
test_score,5.0,0.863432,0.010977,0.849568,0.857531,0.864963,0.865975,0.879122
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [33]:
# Same label encoding, now with 10 random labellings per column.
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    assign_rnd_integer, {"number_of_times":10}
)
print(new_train.shape, new_test.shape)
new_train.head(5)

(32769, 80) (58921, 80)


Unnamed: 0,RESOURCE_0,RESOURCE_1,RESOURCE_2,RESOURCE_3,RESOURCE_4,RESOURCE_5,RESOURCE_6,RESOURCE_7,RESOURCE_8,RESOURCE_9,MGR_ID_0,MGR_ID_1,MGR_ID_2,MGR_ID_3,MGR_ID_4,MGR_ID_5,MGR_ID_6,MGR_ID_7,MGR_ID_8,MGR_ID_9,ROLE_ROLLUP_1_0,ROLE_ROLLUP_1_1,ROLE_ROLLUP_1_2,ROLE_ROLLUP_1_3,ROLE_ROLLUP_1_4,ROLE_ROLLUP_1_5,ROLE_ROLLUP_1_6,ROLE_ROLLUP_1_7,ROLE_ROLLUP_1_8,ROLE_ROLLUP_1_9,ROLE_ROLLUP_2_0,ROLE_ROLLUP_2_1,ROLE_ROLLUP_2_2,ROLE_ROLLUP_2_3,ROLE_ROLLUP_2_4,ROLE_ROLLUP_2_5,ROLE_ROLLUP_2_6,ROLE_ROLLUP_2_7,ROLE_ROLLUP_2_8,ROLE_ROLLUP_2_9,ROLE_DEPTNAME_0,ROLE_DEPTNAME_1,ROLE_DEPTNAME_2,ROLE_DEPTNAME_3,ROLE_DEPTNAME_4,ROLE_DEPTNAME_5,ROLE_DEPTNAME_6,ROLE_DEPTNAME_7,ROLE_DEPTNAME_8,ROLE_DEPTNAME_9,ROLE_FAMILY_DESC_0,ROLE_FAMILY_DESC_1,ROLE_FAMILY_DESC_2,ROLE_FAMILY_DESC_3,ROLE_FAMILY_DESC_4,ROLE_FAMILY_DESC_5,ROLE_FAMILY_DESC_6,ROLE_FAMILY_DESC_7,ROLE_FAMILY_DESC_8,ROLE_FAMILY_DESC_9,ROLE_FAMILY_0,ROLE_FAMILY_1,ROLE_FAMILY_2,ROLE_FAMILY_3,ROLE_FAMILY_4,ROLE_FAMILY_5,ROLE_FAMILY_6,ROLE_FAMILY_7,ROLE_FAMILY_8,ROLE_FAMILY_9,ROLE_CODE_0,ROLE_CODE_1,ROLE_CODE_2,ROLE_CODE_3,ROLE_CODE_4,ROLE_CODE_5,ROLE_CODE_6,ROLE_CODE_7,ROLE_CODE_8,ROLE_CODE_9
0,3686,1025,4574,5572,7195,4425,4959,5700,5723,5279,3155,117,3772,4684,1709,974,356,2917,3873,4484,6,7,96,124,78,106,122,81,5,27,152,25,103,59,150,59,89,23,124,123,418,169,192,283,46,8,262,374,332,68,1360,415,449,476,2628,66,1445,1355,2017,1781,6,46,18,66,54,4,53,26,19,67,316,263,64,344,146,290,307,260,109,291
1,2594,6922,6168,5518,305,1524,3817,6292,3954,3966,4247,4304,4750,639,3445,1667,712,1015,2661,3257,6,7,96,124,78,106,122,81,5,27,182,41,93,105,39,168,83,125,74,4,356,62,283,299,421,254,363,98,32,296,826,117,863,1879,966,436,1471,852,2777,1762,29,18,37,24,6,26,56,7,50,4,221,332,217,147,129,266,222,128,271,305
2,166,5602,4741,515,3549,3201,2123,4148,3182,2490,740,296,4233,2658,584,2660,1593,2442,1580,4175,28,11,21,93,4,95,47,6,68,19,109,140,180,124,145,154,131,92,11,158,298,40,148,121,248,2,465,183,405,399,549,2885,1118,296,1680,2201,2900,1114,2108,431,60,49,8,38,10,46,30,0,4,65,173,52,112,267,22,212,15,49,322,58
3,5539,4799,4222,3469,5186,2042,3213,1373,3540,5628,3785,4208,1687,2895,2075,3360,3064,1541,4146,1396,6,7,96,124,78,106,122,81,5,27,182,41,93,105,39,168,83,125,74,4,182,464,292,422,139,439,188,41,104,198,1326,1100,2163,314,350,2329,192,381,1126,2483,6,46,18,66,54,4,53,26,19,67,274,155,340,27,338,213,328,61,230,309
4,580,599,3384,4089,6315,2544,4806,222,6252,2527,3535,2408,4132,3855,1333,2200,2525,176,4327,1195,52,61,88,63,74,11,67,107,83,73,82,52,94,179,23,25,79,160,3,36,355,273,118,380,109,1,265,367,223,276,2239,1490,2459,1287,825,542,1492,2402,295,1560,51,41,14,7,63,57,47,29,21,42,6,137,171,124,107,208,32,250,262,177


In [34]:
# Cross-validated ROC AUC of the baseline model on the 10x label-encoded features.
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,11.055252,0.505146,10.536836,10.652267,11.086418,11.190285,11.810456
score_time,5.0,0.406546,0.101406,0.305169,0.305248,0.406276,0.507668,0.508369
test_score,5.0,0.872831,0.01429,0.85811,0.858482,0.878036,0.878347,0.891181
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


when 5 times : 0.8634

when 10 times : 0.8728

choose 10 times

## 2. SVD encoding

In [35]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def extract_col_interaction(dataset, col1, col2, tfidf = True):
    """Embed each ``col1`` value by the ``col2`` values it co-occurs with.

    For every distinct ``col1`` value, the ``col2`` values of its rows are joined
    into one space-separated "document". The documents are vectorised (TF-IDF
    when ``tfidf`` is truthy, raw counts otherwise) and reduced to a single
    TruncatedSVD component.

    Args:
        dataset: DataFrame containing both columns.
        col1: column whose values receive an embedding.
        col2: column whose values act as the "words" of each document.
        tfidf: choose TfidfVectorizer (True) or CountVectorizer (False).

    Returns:
        DataFrame with one row per distinct ``col1`` value and two columns:
        ``col1`` and ``"<col1>_<col2>_svd"``.
    """
    documents = dataset.groupby([col1])[col2].agg(
        lambda values: " ".join(str(value) for value in values)
    )

    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    # Tokens are already separated by single spaces, so a plain split is the tokenizer.
    vectorizer = vectorizer_cls(tokenizer=lambda doc: doc.split(" "))
    term_matrix = vectorizer.fit_transform(documents)

    svd = TruncatedSVD(n_components=1, random_state = 5115)
    component = svd.fit_transform(term_matrix)

    return pd.DataFrame({
        col1: documents.index.values,
        col1+"_{}_svd".format(col2): component.ravel(),
    })

import itertools
def get_col_interactions_svd(dataset, tfidf = True):
    """Build one SVD interaction feature for every ordered pair of columns.

    Args:
        dataset: DataFrame of categorical columns.
        tfidf: forwarded to ``extract_col_interaction``.

    Returns:
        DataFrame with one ``"<col1>_<col2>_svd"`` column per ordered column
        pair, row-aligned with ``dataset`` (fresh 0..n-1 index).
    """
    new_dataset = pd.DataFrame()
    for left_col, right_col in itertools.permutations(dataset.columns, 2):
        embedding = extract_col_interaction(dataset, left_col, right_col, tfidf)
        svd_name = next(name for name in embedding.columns if "svd" in name)
        # Broadcast the per-category embedding back to every row of that category.
        per_row = dataset[[left_col]].merge(embedding, on = left_col, how = 'left')
        new_dataset[svd_name] = per_row[svd_name]
    return new_dataset

In [36]:
# SVD-encode train and test together (one feature per ordered column pair).
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    get_col_interactions_svd
)
print(new_train.shape, new_test.shape)
new_train.head(5)

(32769, 56) (58921, 56)


Unnamed: 0,RESOURCE_MGR_ID_svd,RESOURCE_ROLE_ROLLUP_1_svd,RESOURCE_ROLE_ROLLUP_2_svd,RESOURCE_ROLE_DEPTNAME_svd,RESOURCE_ROLE_FAMILY_DESC_svd,RESOURCE_ROLE_FAMILY_svd,RESOURCE_ROLE_CODE_svd,MGR_ID_RESOURCE_svd,MGR_ID_ROLE_ROLLUP_1_svd,MGR_ID_ROLE_ROLLUP_2_svd,MGR_ID_ROLE_DEPTNAME_svd,MGR_ID_ROLE_FAMILY_DESC_svd,MGR_ID_ROLE_FAMILY_svd,MGR_ID_ROLE_CODE_svd,ROLE_ROLLUP_1_RESOURCE_svd,ROLE_ROLLUP_1_MGR_ID_svd,ROLE_ROLLUP_1_ROLE_ROLLUP_2_svd,ROLE_ROLLUP_1_ROLE_DEPTNAME_svd,ROLE_ROLLUP_1_ROLE_FAMILY_DESC_svd,ROLE_ROLLUP_1_ROLE_FAMILY_svd,ROLE_ROLLUP_1_ROLE_CODE_svd,ROLE_ROLLUP_2_RESOURCE_svd,ROLE_ROLLUP_2_MGR_ID_svd,ROLE_ROLLUP_2_ROLE_ROLLUP_1_svd,ROLE_ROLLUP_2_ROLE_DEPTNAME_svd,ROLE_ROLLUP_2_ROLE_FAMILY_DESC_svd,ROLE_ROLLUP_2_ROLE_FAMILY_svd,ROLE_ROLLUP_2_ROLE_CODE_svd,ROLE_DEPTNAME_RESOURCE_svd,ROLE_DEPTNAME_MGR_ID_svd,ROLE_DEPTNAME_ROLE_ROLLUP_1_svd,ROLE_DEPTNAME_ROLE_ROLLUP_2_svd,ROLE_DEPTNAME_ROLE_FAMILY_DESC_svd,ROLE_DEPTNAME_ROLE_FAMILY_svd,ROLE_DEPTNAME_ROLE_CODE_svd,ROLE_FAMILY_DESC_RESOURCE_svd,ROLE_FAMILY_DESC_MGR_ID_svd,ROLE_FAMILY_DESC_ROLE_ROLLUP_1_svd,ROLE_FAMILY_DESC_ROLE_ROLLUP_2_svd,ROLE_FAMILY_DESC_ROLE_DEPTNAME_svd,ROLE_FAMILY_DESC_ROLE_FAMILY_svd,ROLE_FAMILY_DESC_ROLE_CODE_svd,ROLE_FAMILY_RESOURCE_svd,ROLE_FAMILY_MGR_ID_svd,ROLE_FAMILY_ROLE_ROLLUP_1_svd,ROLE_FAMILY_ROLE_ROLLUP_2_svd,ROLE_FAMILY_ROLE_DEPTNAME_svd,ROLE_FAMILY_ROLE_FAMILY_DESC_svd,ROLE_FAMILY_ROLE_CODE_svd,ROLE_CODE_RESOURCE_svd,ROLE_CODE_MGR_ID_svd,ROLE_CODE_ROLE_ROLLUP_1_svd,ROLE_CODE_ROLE_ROLLUP_2_svd,ROLE_CODE_ROLE_DEPTNAME_svd,ROLE_CODE_ROLE_FAMILY_DESC_svd,ROLE_CODE_ROLE_FAMILY_svd
0,0.015059,0.999236,0.869578,0.008674,0.695,0.846882,0.713359,0.034007,0.999988,0.964151,3.007338e-08,0.8969273,0.047233,6.9e-05,0.733896,0.02226,0.016183,0.008418,0.040092,0.071463,0.036593,0.686629,0.001099,0.9999168,0.006414,0.096203,0.075999,0.028685,0.151673,-0.009297,0.967065,0.778194,0.873828,0.969019,0.882,0.280971,-1.8e-05,0.989504,0.778027,0.000653,0.004791,0.0006978118,0.644948,3.2e-05,0.996214,0.953954,0.625506,0.790561,0.163381,0.518124,5.6e-05,0.988656,0.933793,0.082077,0.940899,-0.003581
1,0.034197,0.982219,0.95253,0.082501,0.180704,0.223276,0.19672,0.174024,0.999988,0.265149,1.901879e-08,0.0002176456,0.00236,0.000396,0.733896,0.02226,0.016183,0.008418,0.040092,0.071463,0.036593,0.68975,4e-06,0.9999168,0.001687,0.082994,0.050863,0.023754,0.350194,0.000847,0.797737,0.583598,0.439512,0.675192,0.392035,0.043701,-1.1e-05,0.999935,0.159315,0.000103,3.9e-05,1.090086e-07,0.796351,7.1e-05,0.992507,0.895471,0.635695,0.001405,0.027663,0.483528,0.000877,0.996758,0.726202,0.03802,-0.000341,-0.000446
2,0.000674,0.001712,0.006027,0.26188,0.001021,0.010063,0.004702,0.006119,5.562812e-09,-8e-06,0.001715711,0.0002121618,0.963695,0.851832,0.139913,-0.017322,1e-06,0.521839,0.312767,0.894738,0.824359,0.094593,0.000294,4.143361e-07,0.494055,0.278938,0.900952,0.830185,0.039013,0.001736,0.002376,0.006677,0.000317,0.022336,0.00053,0.062278,9.1e-05,0.001937,0.002253,0.167734,0.027143,3.111196e-05,0.15362,2.3e-05,0.030143,0.057715,0.262357,0.016452,-0.201398,0.09315,-0.000294,0.006333,0.010736,0.529856,-0.001061,0.000406
3,0.028655,0.999236,0.934787,0.012435,0.083169,0.994862,0.663811,0.089637,0.999988,0.265149,0.000354878,0.379235,0.013731,1.9e-05,0.733896,0.02226,0.016183,0.008418,0.040092,0.071463,0.036593,0.68975,4e-06,0.9999168,0.001687,0.082994,0.050863,0.023754,0.271008,0.002901,0.90782,0.667436,0.827047,0.970636,0.869949,0.172814,-3e-06,0.997782,0.764704,0.000473,0.002823,1.049378e-05,0.644948,3.2e-05,0.996214,0.953954,0.625506,0.790561,0.163381,0.578121,2.1e-05,0.996215,0.955166,0.079606,0.952335,-0.003581
4,0.000827,0.482659,0.111446,0.114995,0.01028,0.064558,0.052313,0.009358,4.516656e-05,1.5e-05,0.01473593,-4.669595e-08,0.006411,0.001138,0.041462,0.015394,1e-06,0.867528,0.889019,0.98411,0.969121,0.018662,0.000378,2.370849e-05,0.77948,0.9305,0.982451,0.923917,0.009549,-0.000137,0.001671,0.004789,0.000362,0.054207,0.010147,0.008159,2.7e-05,0.000101,0.000299,0.03116,0.007478,9.568153e-07,0.200616,0.000181,0.836703,0.440798,0.0626,0.011422,0.059175,0.053252,-0.000292,0.006631,0.010132,0.506337,0.00028,-0.000696


In [37]:
# Cross-validated ROC AUC of the baseline model on the SVD-encoded features.
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,9.247503,0.833479,8.582263,8.59441,9.117335,9.322273,10.621236
score_time,5.0,0.451189,0.062801,0.405284,0.405651,0.405762,0.510551,0.528697
test_score,5.0,0.862532,0.013752,0.842693,0.854887,0.867282,0.870474,0.877325
train_score,5.0,0.999975,3e-06,0.99997,0.999974,0.999977,0.999977,0.999979


SVD : AUC is 0.8625.

## 3. Frequency encoding

In [38]:
def get_freq_encoding(dataset):
    """Replace every value by the number of rows in which it occurs.

    Args:
        dataset: DataFrame of categorical columns (string column names).

    Returns:
        DataFrame with one ``"<column>_freq"`` column per input column and a
        fresh 0..n-1 index; each cell holds the row count of the original value
        within its column.
    """
    encoded = {}
    for column in dataset.columns:
        occurrences = dataset[column].value_counts()
        encoded[column + "_freq"] = dataset[column].map(occurrences).to_numpy()
    return pd.DataFrame(encoded)

In [39]:
# Frequency-encode train and test together, so counts are taken over both splits.
new_train, new_test = transform_dataset(
    train[col4train], test[col4train], 
    get_freq_encoding
)
print(new_train.shape, new_test.shape)
new_train.head(5)

(32769, 8) (58921, 8)


Unnamed: 0,RESOURCE_freq,MGR_ID_freq,ROLE_ROLLUP_1_freq,ROLE_ROLLUP_2_freq,ROLE_DEPTNAME_freq,ROLE_FAMILY_DESC_freq,ROLE_FAMILY_freq,ROLE_CODE_freq
0,7,145,59065,12155,180,17996,28861,9569
1,93,34,59065,10920,406,29,3506,213
2,8,7,518,518,1645,92,7768,3838
3,2,153,59065,10920,494,3244,28861,12082
4,28,18,815,396,143,41,945,187


In [40]:
# Cross-validated ROC AUC of the baseline model on the frequency-encoded features.
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,4.591639,0.258596,4.313933,4.363932,4.577475,4.810587,4.892268
score_time,5.0,0.405088,0.000622,0.40459,0.404695,0.404964,0.405039,0.406151
test_score,5.0,0.820965,0.012499,0.803485,0.818629,0.821627,0.822482,0.838603
train_score,5.0,0.999884,1.3e-05,0.999865,0.999883,0.999883,0.999895,0.999896


Frequency : AUC is 0.8209

In [41]:
# Recompute the three unsupervised encodings under separate names so they can be
# concatenated column-wise into one feature matrix.
new_train1, new_test1 = transform_dataset(
    train[col4train], test[col4train], get_freq_encoding
)
new_train2, new_test2 = transform_dataset(
    train[col4train], test[col4train], get_col_interactions_svd
)
new_train3, new_test3 = transform_dataset(
    train[col4train], test[col4train], 
    assign_rnd_integer, {"number_of_times":10}
)

# axis = 1 places the frequency, SVD and label features side by side (row-aligned).
new_train = pd.concat([new_train1, new_train2, new_train3], axis = 1)
new_test = pd.concat([new_test1, new_test2, new_test3], axis = 1)
print(new_train.shape, new_test.shape)
validate_model(
    model = get_model(), 
    data = [new_train.values, y]
)

(32769, 144) (58921, 144)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,16.572372,0.743528,15.876566,16.18763,16.442313,16.533676,17.821672
score_time,5.0,0.306046,0.000322,0.305751,0.305822,0.305864,0.306389,0.306404
test_score,5.0,0.879933,0.014201,0.86154,0.872578,0.882537,0.883149,0.899863
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


1. Label + SVD + Frequency :  AUC is 0.8799

# Supervised categorical encodings

Target encoding, target encoding with smoothing, adding noise (CV inside CV), and expanding mean.

function

1. simple target encoding

In [42]:
from sklearn.base import BaseEstimator, TransformerMixin
class TargetEncoding(BaseEstimator, TransformerMixin):
    """Replace each category by the mean target observed for it during ``fit``.

    Args:
        columns_names: list of columns to encode; ``transform`` returns exactly
            these columns, in this order.

    Attributes set by ``fit``:
        learned_values: dict ``column -> DataFrame[column, "__target__"]`` with
            the mean target per category.
        dataset_mean: global target mean, used for categories unseen in ``fit``.
    """
    def __init__(self, columns_names ):
        self.columns_names = columns_names
        self.learned_values = {}
        self.dataset_mean = np.nan

    def fit(self, X, y, **fit_params):
        """Learn the mean target per category for every configured column in ``X``."""
        X_ = X.copy()
        self.learned_values = {}
        # np.asarray drops any index `y` may carry: the target is attached by
        # position, never by index alignment.
        X_["__target__"] = np.asarray(y)
        for c in [x for x in X_.columns if x in self.columns_names]:
            self.learned_values[c] = (X_[[c,"__target__"]]
                                      .groupby(c)["__target__"].mean()
                                      .reset_index())
        self.dataset_mean = np.mean(y)
        return self

    def transform(self, X, **fit_params):
        """Encode ``X[columns_names]``; unseen categories get the global target mean."""
        transformed_X = X[self.columns_names].copy()
        for c in transformed_X.columns:
            # The merge result carries a fresh 0..n-1 index. Assigning `.values`
            # writes it back by position; assigning the Series itself would align
            # on the index and silently scramble rows whenever X is not indexed 0..n-1.
            transformed_X[c] = (transformed_X[[c]]
                                .merge(self.learned_values[c], on = c, how = 'left')
                               )["__target__"].values
        transformed_X = transformed_X.fillna(self.dataset_mean)
        return transformed_X

    def fit_transform(self, X, y, **fit_params):
        """Fit on ``(X, y)`` and encode the same ``X``.

        Every row is encoded with a mean that includes its own target.
        """
        self.fit(X,y)
        return self.transform(X)

In [43]:
skf = StratifiedKFold(n_splits=5, random_state = 5451, shuffle = True)
te = TargetEncoding(columns_names=col4train)
# NOTE: the encoder is fitted on ALL training rows and their labels before the CV
# split below, so each validation row's own target contributes to its encoding.
# The validation AUC printed by this cell is therefore optimistic (target leakage).
X_tr = te.fit_transform(train, y).values

scores = []
tr_scores = []
for train_index, test_index in skf.split(train, y):
    # Both parts are slices of the same pre-encoded matrix.
    train_df, valid_df = X_tr[train_index], X_tr[test_index]
    train_y, valid_y = y[train_index], y[test_index]

    model = get_model()
    model.fit(train_df,train_y)

    # Column 1 of predict_proba is used as the score for ROC AUC.
    predictions = model.predict_proba(valid_df)[:,1]
    scores.append(roc_auc_score(valid_y, predictions))

    train_preds = model.predict_proba(train_df)[:,1]
    tr_scores.append(roc_auc_score(train_y, train_preds))

print("Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}".format(
    np.mean(tr_scores), np.mean(scores), np.std(scores)
))

Train AUC score: 1.0000 Valid AUC score: 0.9749, STD: 0.0017


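Overfitting caused by target leakage: the encoder was fitted on the whole training set before the CV split, so every validation row's own target leaked into its encoding, which is why the validation AUC of 0.9749 is too good to be true. Solution: fit the encoder inside each fold, and regularize it by smoothing or by adding noise.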

### 2. target encoding smoothing

In [44]:
class TargetEncodingSmoothing(BaseEstimator, TransformerMixin):
    """Target encoding whose per-category mean is shrunk towards the global mean.

    The encoding of a category with ``N`` training rows is
    ``alpha * category_mean + (1 - alpha) * dataset_mean`` with
    ``alpha = 1 / (1 + exp(-(N - k) / f))``.

    Args:
        columns_names: list of columns to encode; ``transform`` returns exactly
            these columns, in this order.
        k: category size at which ``alpha`` equals 0.5.
        f: divisor of ``N - k`` inside the sigmoid; it sets how quickly
            ``alpha`` moves from 0 to 1 around ``k``.

    Attributes set by ``fit``:
        learned_values: dict ``column -> DataFrame[column, "__target__"]`` with
            the smoothed encoding per category.
        dataset_mean: global target mean, used for categories unseen in ``fit``.
    """

    def __init__(self, columns_names,k, f ):
        self.columns_names = columns_names
        self.learned_values = {}
        self.dataset_mean = np.nan
        self.k = k
        self.f = f

    def smoothing_func(self, N):
        """Weight in (0, 1) given to the category mean for a category of size ``N``."""
        return 1 / (1 + np.exp(-(N-self.k)/self.f))

    def fit(self, X, y, **fit_params):
        """Learn the smoothed mean target per category for every configured column."""
        X_ = X.copy()
        self.learned_values = {}
        self.dataset_mean = np.mean(y)
        # np.asarray drops any index `y` may carry: the target is attached by
        # position, never by index alignment.
        X_["__target__"] = np.asarray(y)
        for c in [x for x in X_.columns if x in self.columns_names]:
            stats = (X_[[c,"__target__"]]
                     .groupby(c)["__target__"]
                     .agg(['mean', 'size']))
            alpha = self.smoothing_func(stats["size"])
            stats["__target__"] = (alpha*stats["mean"]
                                   + (1-alpha)*self.dataset_mean)
            # Keep only the category (index -> column) and its encoding.
            self.learned_values[c] = stats[["__target__"]].reset_index()

        return self

    def transform(self, X, **fit_params):
        """Encode ``X[columns_names]``; unseen categories get the global target mean."""
        transformed_X = X[self.columns_names].copy()
        for c in transformed_X.columns:
            # The merge result carries a fresh 0..n-1 index. Assigning `.values`
            # writes it back by position; assigning the Series itself would align
            # on the index and silently scramble rows whenever X is not indexed 0..n-1.
            transformed_X[c] = (transformed_X[[c]]
                                .merge(self.learned_values[c], on = c, how = 'left')
                               )["__target__"].values
        transformed_X = transformed_X.fillna(self.dataset_mean)

        return transformed_X

    def fit_transform(self, X, y, **fit_params):
        """Fit on ``(X, y)`` and encode the same ``X``.

        Every row is encoded with a mean that includes its own target.
        """
        self.fit(X,y)
        return self.transform(X)

In [45]:
skf = StratifiedKFold(n_splits=5, random_state = 5451, shuffle = True)
scores = []
tr_scores = []
for train_index, test_index in skf.split(train, y):
    # Reset the index so each fold is a self-contained 0..n-1 frame.
    train_df = train.loc[train_index,col4train].reset_index(drop = True)
    valid_df = train.loc[test_index,col4train].reset_index(drop = True)
    train_y, valid_y = y[train_index], y[test_index]
    te = TargetEncodingSmoothing(
        columns_names= col4train,
        k = 3, f = 1.5
    )

    # Fit the encoder once per fold. The encoded frame is kept under its own name
    # because a later cell displays it; the model consumes the raw values.
    cols_mean_target_smoothing = te.fit_transform(train_df, train_y)
    X_tr = cols_mean_target_smoothing.values
    # The validation fold is encoded with statistics learned on the training fold only.
    X_val = te.transform(valid_df).values

    model = get_model()
    model.fit(X_tr,train_y)

    predictions = model.predict_proba(X_val)[:,1]
    scores.append(roc_auc_score(valid_y, predictions))

    train_preds = model.predict_proba(X_tr)[:,1]
    tr_scores.append(roc_auc_score(train_y, train_preds))

In [46]:
# Smoothed target encoding of the training part of the last CV fold processed above.
cols_mean_target_smoothing.head(5)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,0.971048,1.0,0.949714,0.957131,0.94,0.934343,0.943125,0.969506
1,0.962963,0.993098,0.949714,0.971547,0.883721,0.993098,0.946962,0.953846
2,0.96174,0.96174,0.918367,0.918367,0.916859,0.961538,0.902508,0.883929
3,0.954176,1.0,0.949714,0.971547,0.986928,0.961962,0.943125,0.921095
4,0.996238,0.998006,0.934211,0.885965,0.763158,0.941177,0.824742,0.875


In [47]:
# Mean train / validation AUC over the folds, plus the spread of the validation AUC.
mean_train_auc = np.mean(tr_scores)
mean_valid_auc = np.mean(scores)
valid_auc_std = np.std(scores)
print(f"Train AUC score: {mean_train_auc:.4f} Valid AUC score: {mean_valid_auc:.4f}, STD: {valid_auc_std:.4f}")

Train AUC score: 1.0000 Valid AUC score: 0.7878, STD: 0.0047


AUC score: 0.7878. 

## Adding noise. CV inside CV.

adding noise : make our embedding noisy

We split the train dataset into n folds; for each fold, the target-mean embedding is computed from the other n-1 folds and applied to the held-out fold, so no row is ever encoded with its own target.


Here is the function which does that:

In [48]:
def get_CV_target_encoding(data, y, encoder, cv = 5):
    """Out-of-fold target encoding: each row is encoded by an encoder that never saw it.

    Args:
        data: DataFrame with the columns the encoder expects.
        y: labels, indexable by integer position arrays (the callers pass a NumPy array).
        encoder: object with ``fit(X, y)`` and ``transform(X)``; it is refitted
            once per fold, so it is left fitted on the last fold's training part.
        cv: number of stratified folds.

    Returns:
        DataFrame of encoded features in the original row order of ``data``,
        with a fresh 0..n-1 index.
    """
    splitter = StratifiedKFold(n_splits=cv, random_state = 545167, shuffle = True)
    encoded_folds = []
    for fit_rows, holdout_rows in splitter.split(data, y):
        fit_part = data.iloc[fit_rows,:].reset_index(drop = True)
        holdout_part = data.iloc[holdout_rows,:].reset_index(drop = True)

        encoder.fit(fit_part, y[fit_rows])
        fold_encoded = encoder.transform(holdout_part)
        # Remember the original row positions so the folds can be re-ordered below.
        fold_encoded["index"] = holdout_rows
        encoded_folds.append(fold_encoded)

    stacked = pd.concat(encoded_folds, ignore_index = True)
    ordered = stacked.sort_values('index').reset_index(drop = True)
    return ordered.drop('index', axis = 1)

In [49]:
scores = []
tr_scores = []
# NOTE: `skf` is the splitter created in the smoothing cell above (seed 5451), so
# the outer folds are identical to the previous experiment.
for train_index, test_index in skf.split(train, y):
    train_df = train.loc[train_index,col4train].reset_index(drop = True)
    valid_df = train.loc[test_index,col4train].reset_index(drop = True)
    train_y, valid_y = y[train_index], y[test_index]
    te = TargetEncodingSmoothing(
        columns_names= col4train,
        k = 3, f = 1.5
    )

    # Training rows get out-of-fold encodings (inner CV) ...
    X_tr = get_CV_target_encoding(train_df, train_y, te, cv = 5)

    # ... while validation rows are encoded by an encoder fitted on the whole
    # training part of the outer fold.
    te.fit(train_df, train_y)
    X_val = te.transform(valid_df).values

    # X_tr stays a DataFrame (a later cell displays it), but the model is given
    # plain arrays for both fit and predict, so scikit-learn does not warn about
    # fitting with feature names and predicting without them.
    X_tr_values = X_tr.values

    model = get_model()
    model.fit(X_tr_values,train_y)

    predictions = model.predict_proba(X_val)[:,1]
    scores.append(roc_auc_score(valid_y, predictions))

    train_preds = model.predict_proba(X_tr_values)[:,1]
    tr_scores.append(roc_auc_score(train_y, train_preds))

print("Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}".format(
    np.mean(tr_scores), np.mean(scores), np.std(scores)
))

Train AUC score: 0.9999 Valid AUC score: 0.8542, STD: 0.0060


AUC : From .78 to .85.

In [50]:
# Out-of-fold encoding of the training part of the last outer fold processed above.
X_tr.head(5)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,0.961723,1.0,0.949851,0.955648,0.975,0.936102,0.944191,0.969525
1,0.95,0.987924,0.949308,0.969964,0.888889,0.987924,0.947368,0.941176
2,0.954189,0.954189,0.915966,0.915966,0.917582,0.95,0.901008,0.877238
3,0.942113,1.0,0.949146,0.9723,0.984,0.958647,0.940926,0.918511
4,0.987924,0.987924,0.925287,0.869565,0.757576,0.923101,0.814159,0.851064


## Adding noise. Expanding mean.

Imagine an algorithm rolling through the data: for each new row it uses all previously seen rows of the same category to calculate that row's mean. For the very first row there are no previously seen rows, so its mean is the dataset mean. For the second row you can use the first (and only the first) row, because you have already seen it.

In [51]:
class TargetEncodingExpandingMean(BaseEstimator, TransformerMixin):
    """Target encoding that encodes training rows with an expanding (running) mean.

    ``fit`` / ``transform`` behave like plain target encoding (mean target per
    category, global mean for unseen categories). ``fit_transform`` is different:
    the rows are shuffled and each row is encoded with the mean target of the rows
    of the same category that precede it in the shuffled order, so a row never
    sees its own target.

    Args:
        columns_names: list of columns to encode; outputs contain exactly these
            columns, in this order.
        random_state: ``None`` (default) shuffles with NumPy's global RNG, as
            before; an int seed or a ``numpy.random.RandomState`` makes
            ``fit_transform`` reproducible.

    Attributes set by ``fit``:
        learned_values: dict ``column -> DataFrame[column, "__target__"]`` with
            the mean target per category.
        dataset_mean: global target mean, used as the fallback encoding.
    """
    def __init__(self, columns_names, random_state = None):
        self.columns_names = columns_names
        self.random_state = random_state
        self.learned_values = {}
        self.dataset_mean = np.nan

    def fit(self, X, y, **fit_params):
        """Learn the plain mean target per category for every configured column."""
        X_ = X.copy()
        self.learned_values = {}
        self.dataset_mean = np.mean(y)
        # np.asarray drops any index `y` may carry: the target is attached by
        # position, never by index alignment.
        X_["__target__"] = np.asarray(y)
        for c in [x for x in X_.columns if x in self.columns_names]:
            self.learned_values[c] = (X_[[c,"__target__"]]
                                      .groupby(c)["__target__"].mean()
                                      .reset_index())
        return self

    def transform(self, X, **fit_params):
        """Encode ``X[columns_names]`` with the means learned in ``fit``."""
        transformed_X = X[self.columns_names].copy()
        for c in transformed_X.columns:
            # The merge result carries a fresh 0..n-1 index. Assigning `.values`
            # writes it back by position; assigning the Series itself would align
            # on the index and silently scramble rows whenever X is not indexed 0..n-1.
            transformed_X[c] = (transformed_X[[c]]
                                .merge(self.learned_values[c], on = c, how = 'left')
                               )["__target__"].values
        transformed_X = transformed_X.fillna(self.dataset_mean)
        return transformed_X

    def fit_transform(self, X, y, **fit_params):
        """Fit on ``(X, y)`` and return the expanding-mean encoding of ``X``.

        Returns:
            DataFrame with one column per entry of ``columns_names``, in the
            original row order of ``X``, with a fresh 0..n-1 index.
        """
        self.fit(X,y)

        # One RandomState for the whole call: each column gets its own shuffle,
        # yet the sequence of shuffles is reproducible for a given seed.
        if self.random_state is None or isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        #Expanding mean transform
        X_ = X[self.columns_names].copy().reset_index(drop = True)
        X_["__target__"] = np.asarray(y)
        X_["index"] = X_.index
        encoded_columns = {}
        for c in self.columns_names:
            X_shuffled = X_[[c,"__target__", "index"]].sample(
                n = len(X_), replace = False, random_state = rng
            )
            # Built-in grouped shift / cumsum / cumcount keep the frame's own index.
            # groupby(...).apply(lambda x: x.shift().cumsum()) returns a
            # (group, row) MultiIndex on pandas >= 2.0 and cannot be assigned back.
            previous_target = X_shuffled.groupby(c, sort = False)["__target__"].shift()
            previous_sum = previous_target.groupby(X_shuffled[c], sort = False).cumsum()
            previous_cnt = X_shuffled.groupby(c, sort = False).cumcount()
            # First occurrence of a category has no history -> NaN -> dataset mean.
            X_shuffled["encoded"] = (
                previous_sum / previous_cnt.where(previous_cnt > 0)
            ).fillna(self.dataset_mean)
            # Undo the shuffle so rows line up with X again.
            encoded_columns[c] = X_shuffled.sort_values("index")["encoded"].values
        return pd.DataFrame(encoded_columns)

In [52]:
scores = []
tr_scores = []
# NOTE: `skf` is the splitter created in the smoothing cell above (seed 5451), so
# the folds are identical to the previous experiments.
for train_index, test_index in skf.split(train, y):
    train_df = train.loc[train_index,col4train].reset_index(drop = True)
    valid_df = train.loc[test_index,col4train].reset_index(drop = True)
    train_y, valid_y = y[train_index], y[test_index]
    te = TargetEncodingExpandingMean(columns_names=col4train)

    # Training rows get the expanding-mean encoding; validation rows get the plain
    # per-category means learned on the training part.
    X_tr = te.fit_transform(train_df, train_y)
    X_val = te.transform(valid_df).values

    # X_tr stays a DataFrame (a later cell displays it), but the model is given
    # plain arrays for both fit and predict, so scikit-learn does not warn about
    # fitting with feature names and predicting without them.
    X_tr_values = X_tr.values

    model = get_model()
    model.fit(X_tr_values,train_y)

    predictions = model.predict_proba(X_val)[:,1]
    scores.append(roc_auc_score(valid_y, predictions))

    train_preds = model.predict_proba(X_tr_values)[:,1]
    tr_scores.append(roc_auc_score(train_y, train_preds))

print("Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}".format(
    np.mean(tr_scores), np.mean(scores), np.std(scores)
))

Train AUC score: 1.0000 Valid AUC score: 0.8389, STD: 0.0093


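AUC score: 0.8389 (the shuffle inside the expanding-mean encoder is not seeded, so this value changes slightly from run to run).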

In [53]:
# Expanding-mean encoding of the training part of the last fold processed above.
X_tr.head(5)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1.0,1.0,0.951947,0.96,0.95,0.937268,0.945392,0.969837
1,1.0,1.0,0.949832,0.977738,0.872093,1.0,0.94385,0.9
2,0.942096,0.942096,0.917241,0.878788,0.918429,0.941176,0.901975,0.877108
3,0.942096,1.0,0.949626,0.972653,1.0,0.963636,0.944145,0.919179
4,0.942096,1.0,0.958333,0.882979,0.444444,1.0,0.815574,0.892857


use feature pairs to create a new set of categorical features. 

take pair of existing features and concat them together

In [54]:
# Cast the base categorical features to strings so that pairs can be concatenated.
train[col4train] = train[col4train].values.astype(str)
test[col4train] = test[col4train].values.astype(str)

from itertools import combinations
# Copy the list. `new_col4train = col4train` would only create a second name for
# the same list, so every append below would also grow col4train, and re-running
# this cell would then build pairs of pairs.
new_col4train = list(col4train)
for c1,c2 in combinations(col4train, 2):
    name = "{}_{}".format(c1,c2)
    new_col4train.append(name)
    # New categorical feature: the two values joined by "_".
    train[name] = train[c1] + "_" + train[c2]
    test[name] = test[c1] + "_" + test[c2]

In [55]:
# Shapes of the extended feature set for train and test, then the first rows.
print(*(frame[new_col4train].shape for frame in (train, test)))
train[new_col4train].head()

(32769, 36) (58921, 36)


Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE,RESOURCE_MGR_ID,RESOURCE_ROLE_ROLLUP_1,RESOURCE_ROLE_ROLLUP_2,RESOURCE_ROLE_DEPTNAME,RESOURCE_ROLE_FAMILY_DESC,RESOURCE_ROLE_FAMILY,RESOURCE_ROLE_CODE,MGR_ID_ROLE_ROLLUP_1,MGR_ID_ROLE_ROLLUP_2,MGR_ID_ROLE_DEPTNAME,MGR_ID_ROLE_FAMILY_DESC,MGR_ID_ROLE_FAMILY,MGR_ID_ROLE_CODE,ROLE_ROLLUP_1_ROLE_ROLLUP_2,ROLE_ROLLUP_1_ROLE_DEPTNAME,ROLE_ROLLUP_1_ROLE_FAMILY_DESC,ROLE_ROLLUP_1_ROLE_FAMILY,ROLE_ROLLUP_1_ROLE_CODE,ROLE_ROLLUP_2_ROLE_DEPTNAME,ROLE_ROLLUP_2_ROLE_FAMILY_DESC,ROLE_ROLLUP_2_ROLE_FAMILY,ROLE_ROLLUP_2_ROLE_CODE,ROLE_DEPTNAME_ROLE_FAMILY_DESC,ROLE_DEPTNAME_ROLE_FAMILY,ROLE_DEPTNAME_ROLE_CODE,ROLE_FAMILY_DESC_ROLE_FAMILY,ROLE_FAMILY_DESC_ROLE_CODE,ROLE_FAMILY_ROLE_CODE
0,39353,85475,117961,118300,123472,117906,290919,117908,39353_85475,39353_117961,39353_118300,39353_123472,39353_117906,39353_290919,39353_117908,85475_117961,85475_118300,85475_123472,85475_117906,85475_290919,85475_117908,117961_118300,117961_123472,117961_117906,117961_290919,117961_117908,118300_123472,118300_117906,118300_290919,118300_117908,123472_117906,123472_290919,123472_117908,117906_290919,117906_117908,290919_117908
1,17183,1540,117961,118343,123125,118536,308574,118539,17183_1540,17183_117961,17183_118343,17183_123125,17183_118536,17183_308574,17183_118539,1540_117961,1540_118343,1540_123125,1540_118536,1540_308574,1540_118539,117961_118343,117961_123125,117961_118536,117961_308574,117961_118539,118343_123125,118343_118536,118343_308574,118343_118539,123125_118536,123125_308574,123125_118539,118536_308574,118536_118539,308574_118539
2,36724,14457,118219,118220,117884,267952,19721,117880,36724_14457,36724_118219,36724_118220,36724_117884,36724_267952,36724_19721,36724_117880,14457_118219,14457_118220,14457_117884,14457_267952,14457_19721,14457_117880,118219_118220,118219_117884,118219_267952,118219_19721,118219_117880,118220_117884,118220_267952,118220_19721,118220_117880,117884_267952,117884_19721,117884_117880,267952_19721,267952_117880,19721_117880
3,36135,5396,117961,118343,119993,240983,290919,118322,36135_5396,36135_117961,36135_118343,36135_119993,36135_240983,36135_290919,36135_118322,5396_117961,5396_118343,5396_119993,5396_240983,5396_290919,5396_118322,117961_118343,117961_119993,117961_240983,117961_290919,117961_118322,118343_119993,118343_240983,118343_290919,118343_118322,119993_240983,119993_290919,119993_118322,240983_290919,240983_118322,290919_118322
4,42680,5905,117929,117930,119569,123932,19793,119325,42680_5905,42680_117929,42680_117930,42680_119569,42680_123932,42680_19793,42680_119325,5905_117929,5905_117930,5905_119569,5905_123932,5905_19793,5905_119325,117929_117930,117929_119569,117929_123932,117929_19793,117929_119325,117930_119569,117930_123932,117930_19793,117930_119325,119569_123932,119569_19793,119569_119325,123932_19793,123932_119325,19793_119325


In [56]:
# Number of distinct values per feature; dropna=False counts a missing value
# as its own level, matching len(column.unique()).
train[new_col4train].nunique(dropna=False)

RESOURCE                           7518
MGR_ID                             4243
ROLE_ROLLUP_1                       128
ROLE_ROLLUP_2                       177
ROLE_DEPTNAME                       449
ROLE_FAMILY_DESC                   2358
ROLE_FAMILY                          67
ROLE_CODE                           343
RESOURCE_MGR_ID                   27626
RESOURCE_ROLE_ROLLUP_1            11091
RESOURCE_ROLE_ROLLUP_2            13422
RESOURCE_ROLE_DEPTNAME            17354
RESOURCE_ROLE_FAMILY_DESC         22734
RESOURCE_ROLE_FAMILY              13195
RESOURCE_ROLE_CODE                19043
MGR_ID_ROLE_ROLLUP_1               4461
MGR_ID_ROLE_ROLLUP_2               4623
MGR_ID_ROLE_DEPTNAME               5045
MGR_ID_ROLE_FAMILY_DESC            7973
MGR_ID_ROLE_FAMILY                 5483
MGR_ID_ROLE_CODE                   7484
ROLE_ROLLUP_1_ROLE_ROLLUP_2         187
ROLE_ROLLUP_1_ROLE_DEPTNAME        1185
ROLE_ROLLUP_1_ROLE_FAMILY_DESC     3282
ROLE_ROLLUP_1_ROLE_FAMILY           750


> Use both `TargetEncodingExpandingMean` and `TargetEncodingSmoothing` (the latter with CV on the training part) to create the encoded features.

In [57]:
# Cross-validated check of the extended feature set: for every split the
# expanding-mean and the smoothed target encodings of new_col4train are
# computed and placed side by side as model inputs.
scores, tr_scores = [], []
for train_index, test_index in skf.split(train, y):
    train_y = y[train_index]
    valid_y = y[test_index]
    train_df = train.loc[train_index, new_col4train].reset_index(drop=True)
    valid_df = train.loc[test_index, new_col4train].reset_index(drop=True)

    # Encoder 1: expanding mean, fitted on the training part of the split.
    te = TargetEncodingExpandingMean(columns_names=new_col4train)
    X_tr = te.fit_transform(train_df, train_y)
    X_val = te.transform(valid_df)

    # Encoder 2: smoothing. Training rows are encoded via get_CV_target_encoding;
    # validation rows use the encoder fitted on the whole training part.
    te2 = TargetEncodingSmoothing(columns_names=new_col4train, k=3, f=1.5)
    X_tr2 = get_CV_target_encoding(train_df, train_y, te2, cv=5)
    te2.fit(train_df, train_y)
    X_val2 = te2.transform(valid_df)

    # Join both encodings column-wise.
    X_tr = pd.concat([X_tr, X_tr2], axis=1)
    X_val = pd.concat([X_val, X_val2], axis=1)

    model = get_model()
    model.fit(X_tr, train_y)

    predictions = model.predict_proba(X_val)[:, 1]
    scores.append(roc_auc_score(valid_y, predictions))

    train_preds = model.predict_proba(X_tr)[:, 1]
    tr_scores.append(roc_auc_score(train_y, train_preds))

print("Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}".format(
    np.mean(tr_scores), np.mean(scores), np.std(scores)
))

Train AUC score: 1.0000 Valid AUC score: 0.8772, STD: 0.0059


> AUC score is 0.8772

Let's define everything we need:
1. We use the original features and their 2nd-level interactions (pairs).
2. We need four encodings: Frequency Encoding, Label Encoding, SVD Encoding and Target Encoding.
3. We build 2 datasets and train 2 models. The final submission is the average of the predictions of these 2 models.

In [58]:
# Dataset #1: column subsets fed to each encoder, followed by the encoders
# themselves (the transform helpers are defined earlier in the notebook).
cols_svd = [
    'MGR_ID_ROLE_CODE', 'MGR_ID_ROLE_DEPTNAME', 'MGR_ID_ROLE_FAMILY',
    'RESOURCE_MGR_ID', 'RESOURCE_ROLE_CODE', 'RESOURCE_ROLE_FAMILY',
    'RESOURCE_ROLE_ROLLUP_1', 'RESOURCE_ROLE_ROLLUP_2', 'RESOURCE',
    'ROLE_DEPTNAME_ROLE_CODE', 'ROLE_DEPTNAME_ROLE_FAMILY',
    'ROLE_FAMILY_DESC_ROLE_FAMILY', 'ROLE_FAMILY_ROLE_CODE',
    'ROLE_FAMILY', 'ROLE_ROLLUP_1_ROLE_DEPTNAME',
    'ROLE_ROLLUP_1_ROLE_FAMILY_DESC', 'ROLE_ROLLUP_1_ROLE_FAMILY',
    'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
]

cols_rnd = [
    'MGR_ID_ROLE_DEPTNAME', 'MGR_ID_ROLE_FAMILY', 'MGR_ID_ROLE_ROLLUP_1',
    'MGR_ID_ROLE_ROLLUP_2', 'MGR_ID', 'RESOURCE_MGR_ID', 'RESOURCE_ROLE_CODE',
    'RESOURCE_ROLE_FAMILY_DESC', 'RESOURCE_ROLE_FAMILY', 'RESOURCE_ROLE_ROLLUP_1',
    'RESOURCE_ROLE_ROLLUP_2', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC',
    'ROLE_FAMILY_DESC_ROLE_CODE', 'ROLE_FAMILY_DESC_ROLE_FAMILY', 'ROLE_FAMILY',
    'ROLE_ROLLUP_1_ROLE_CODE', 'ROLE_ROLLUP_1_ROLE_DEPTNAME',
    'ROLE_ROLLUP_1_ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2_ROLE_FAMILY',
]

cols_freq = [
    'MGR_ID_ROLE_DEPTNAME', 'RESOURCE_MGR_ID', 'RESOURCE_ROLE_CODE',
    'RESOURCE_ROLE_DEPTNAME', 'RESOURCE_ROLE_FAMILY_DESC', 'RESOURCE_ROLE_FAMILY',
    'RESOURCE_ROLE_ROLLUP_1', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC',
    'ROLE_DEPTNAME_ROLE_FAMILY', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC_ROLE_CODE',
    'ROLE_FAMILY_DESC_ROLE_FAMILY', 'ROLE_ROLLUP_1_ROLE_CODE',
    'ROLE_ROLLUP_2_ROLE_DEPTNAME',
]

# Each call is indexed as [0] = train part, [1] = test part in the next cell.
data_svd = transform_dataset(train[cols_svd], test[cols_svd], get_col_interactions_svd)
data_rnd = transform_dataset(
    train[cols_rnd], test[cols_rnd], assign_rnd_integer, {"number_of_times":10}
)
data_freq = transform_dataset(train[cols_freq], test[cols_freq], get_freq_encoding)

In [59]:
# Stack the three encodings column-wise: element 0 of each result is the
# train part, element 1 the test part.
data_train = pd.concat([data_svd[0], data_rnd[0], data_freq[0]], axis=1)
data_test = pd.concat([data_svd[1], data_rnd[1], data_freq[1]], axis=1)

In [60]:
# Sanity check: train and test must end up with the same number of columns.
print(
    "Dataset shape, Train: {}, Test: {}".format(
        *(frame.shape for frame in (data_train, data_test))
    )
)

Dataset shape, Train: (32769, 542), Test: (58921, 542)


In [61]:
# Validate a fresh model on dataset #1; the returned summary is the cell output.
validate_model(model=get_model(), data=[data_train.values, y])

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,39.893509,2.494584,37.413507,39.156827,39.218722,39.584074,44.094414
score_time,5.0,0.433319,0.047021,0.410871,0.41126,0.411518,0.415581,0.517363
test_score,5.0,0.89057,0.014317,0.873879,0.882327,0.890487,0.894122,0.912035
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


Dataset 1 (Label + SVD + Frequency): 0.8906

In [62]:
# Drop the per-encoder results, then keep only the raw arrays of the
# assembled frames to reduce memory before training.
del data_svd, data_rnd, data_freq
gc.collect()
data_train = data_train.values
data_test = data_test.values
gc.collect()

0

In [63]:
# Train on all of dataset #1; column 1 of predict_proba is kept as the
# prediction for the test rows.
model = get_model()
model.fit(data_train, y)
predictions_1 = model.predict_proba(data_test)[:, 1]

In [64]:
# Dataset #1 and its model are no longer needed once predictions_1 exists.
del data_train, data_test, model
gc.collect()

12

In [65]:
# Dataset #2: column subsets fed to each encoder (cols_te is consumed by the
# target-encoding cell that follows), then the SVD / random-integer /
# frequency transforms.
cols_svd = [
    'MGR_ID', 'RESOURCE_MGR_ID', 'RESOURCE_ROLE_CODE',
    'RESOURCE_ROLE_DEPTNAME', 'RESOURCE_ROLE_FAMILY_DESC', 'RESOURCE_ROLE_FAMILY',
    'RESOURCE_ROLE_ROLLUP_1', 'RESOURCE', 'ROLE_CODE',
    'ROLE_DEPTNAME_ROLE_CODE', 'ROLE_DEPTNAME_ROLE_FAMILY',
    'ROLE_FAMILY_DESC_ROLE_CODE', 'ROLE_FAMILY_DESC_ROLE_FAMILY',
    'ROLE_FAMILY_DESC', 'ROLE_ROLLUP_1_ROLE_DEPTNAME',
    'ROLE_ROLLUP_1_ROLE_FAMILY', 'ROLE_ROLLUP_1_ROLE_ROLLUP_2',
    'ROLE_ROLLUP_2_ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2_ROLE_FAMILY',
    'ROLE_ROLLUP_2',
]

cols_rnd = [
    'MGR_ID_ROLE_CODE', 'MGR_ID_ROLE_DEPTNAME', 'MGR_ID_ROLE_FAMILY_DESC',
    'MGR_ID_ROLE_ROLLUP_1', 'MGR_ID', 'RESOURCE_ROLE_DEPTNAME',
    'RESOURCE_ROLE_FAMILY', 'RESOURCE_ROLE_ROLLUP_1', 'ROLE_CODE',
    'ROLE_DEPTNAME_ROLE_FAMILY_DESC', 'ROLE_FAMILY_DESC_ROLE_CODE',
    'ROLE_FAMILY_DESC_ROLE_FAMILY', 'ROLE_FAMILY_ROLE_CODE',
    'ROLE_ROLLUP_1_ROLE_CODE', 'ROLE_ROLLUP_1_ROLE_FAMILY_DESC',
    'ROLE_ROLLUP_1_ROLE_ROLLUP_2',
]

cols_freq = [
    'MGR_ID_ROLE_CODE', 'MGR_ID_ROLE_DEPTNAME', 'MGR_ID_ROLE_ROLLUP_1',
    'MGR_ID_ROLE_ROLLUP_2', 'MGR_ID', 'RESOURCE_MGR_ID',
    'RESOURCE_ROLE_DEPTNAME', 'RESOURCE_ROLE_FAMILY_DESC', 'RESOURCE_ROLE_ROLLUP_2',
    'RESOURCE', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC', 'ROLE_DEPTNAME',
    'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_ROLLUP_1_ROLE_FAMILY_DESC',
    'ROLE_ROLLUP_1_ROLE_FAMILY', 'ROLE_ROLLUP_1_ROLE_ROLLUP_2',
    'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2_ROLE_CODE', 'ROLE_ROLLUP_2',
]

cols_te = [
    'MGR_ID', 'RESOURCE_MGR_ID', 'RESOURCE_ROLE_CODE',
    'RESOURCE_ROLE_DEPTNAME', 'RESOURCE_ROLE_ROLLUP_2', 'RESOURCE',
    'ROLE_CODE', 'ROLE_DEPTNAME_ROLE_FAMILY_DESC', 'ROLE_DEPTNAME_ROLE_FAMILY',
    'ROLE_FAMILY_DESC_ROLE_CODE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY_ROLE_CODE',
    'ROLE_ROLLUP_1_ROLE_FAMILY', 'ROLE_ROLLUP_2_ROLE_FAMILY', 'ROLE_ROLLUP_2',
]

data_svd = transform_dataset(train[cols_svd], test[cols_svd], get_col_interactions_svd)
data_rnd = transform_dataset(
    train[cols_rnd], test[cols_rnd], assign_rnd_integer, {"number_of_times":10}
)
data_freq = transform_dataset(train[cols_freq], test[cols_freq], get_freq_encoding)

In [66]:
# Target encodings for dataset #2, built on the full training set.
# Expanding mean: fit_transform on train, transform on test.
te = TargetEncodingExpandingMean(columns_names=cols_te)
X_tr = te.fit_transform(train[cols_te], y)
X_val = te.transform(test[cols_te])

# Smoothing: train rows go through get_CV_target_encoding, test rows through
# the encoder fitted on all of train.
te2 = TargetEncodingSmoothing(columns_names=cols_te, k=3, f=1.5)
X_tr2 = get_CV_target_encoding(train[cols_te], y, te2, cv=5)
te2.fit(train[cols_te], y)
X_val2 = te2.transform(test[cols_te])

# Both encodings side by side, separately for train and test.
data_te_tr = pd.concat([X_tr, X_tr2], axis=1)
data_te_te = pd.concat([X_val, X_val2], axis=1)

In [67]:
# Assemble dataset #2: SVD / random-integer / frequency encodings first,
# then the target-encoded columns appended on the right.
data_train = pd.concat([data_svd[0], data_rnd[0], data_freq[0]], axis=1)
data_test = pd.concat([data_svd[1], data_rnd[1], data_freq[1]], axis=1)
data_train = pd.concat([data_train, data_te_tr], axis=1)
data_test = pd.concat([data_test, data_te_te], axis=1)
print("Dataset shape, Train: {}, Test: {}".format(data_train.shape, data_test.shape))

# Release the intermediate pieces and keep only the raw arrays.
del data_svd, data_rnd, data_freq, data_te_tr, data_te_te
gc.collect()
data_train = data_train.values
data_test = data_test.values
gc.collect()

Dataset shape, Train: (32769, 586), Test: (58921, 586)


0

In [68]:
# Validate a fresh model on dataset #2 (already an ndarray at this point);
# the returned summary is the cell output.
validate_model(model=get_model(), data=[data_train, y])

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fit_time,5.0,58.637237,3.245689,56.439875,56.648908,57.673382,58.116189,64.307831
score_time,5.0,0.47315,0.137032,0.411322,0.411481,0.411835,0.412833,0.718278
test_score,5.0,0.90821,0.010725,0.900879,0.902061,0.903895,0.90732,0.926896
train_score,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [69]:
# Train on all of dataset #2; column 1 of predict_proba is kept as the
# prediction for the test rows.
model = get_model()
model.fit(data_train, y)
predictions_2 = model.predict_proba(data_test)[:, 1]

# Dataset #2 and its model are no longer needed once predictions_2 exists.
del data_train, data_test, model
gc.collect()

378

In [70]:
# Final prediction is the plain average of the two models' probabilities,
# keyed by the test set's id column.
submission = pd.DataFrame({
    "Id": test["id"],
    "ACTION": (predictions_1 + predictions_2) / 2,
})

In [71]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)