In [None]:
def add_noise(series, noise_level):
    """Scale every value of ``series`` by a random factor close to 1.

    One standard-normal draw per element is taken from NumPy's global
    random state; each value is multiplied by ``1 + noise_level * draw``.
    With ``noise_level == 0`` the values are returned unchanged (the
    random numbers are still drawn).
    """
    draws = np.random.randn(len(series))
    factors = 1 + noise_level * draws
    return series * factors

def _lookup_target_encoding(series, lookup, prior, out_name):
    """Map each category in ``series`` to its value in ``lookup``; unknown categories get ``prior``."""
    merged = pd.merge(
        series.to_frame(series.name),
        lookup,
        on=series.name,
        how='left')
    encoded = merged['average'].rename(out_name).fillna(prior)
    # pd.merge does not keep the index so restore it
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    Each category is encoded as
    ``prior * (1 - w) + category_mean * w`` with
    ``w = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))``,
    where ``prior`` is the mean of ``target`` and ``count`` is the number of
    training rows in the category.

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series, same length as trn_series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int or float) : smoothing effect to balance categorical average vs prior.
        ``0`` is treated as a hard threshold: categories with more than
        ``min_samples_leaf`` rows use their own mean, all others use the prior.
    noise_level (float) : scale of the multiplicative gaussian noise applied to
        both outputs by ``add_noise``; 0 leaves the values unchanged

    Returns a tuple ``(encoded_trn, encoded_tst)`` of pd.Series named
    ``<feature name>_mean`` that keep the index of their respective input.
    Categories that have no usable average (e.g. unseen in ``trn_series``)
    are encoded with the prior.
    """
    assert len(trn_series) == len(target), "trn_series and target must have the same length"
    assert trn_series.name == tst_series.name, "trn_series and tst_series must share the same name"
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of rows
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Global target mean, used as the fallback / shrinkage target
    prior = target.mean()

    # Weight given to the category mean; the bigger the count, the less the
    # prior is taken into account.
    if smoothing == 0:
        # Limit of the sigmoid below: a step function. Written out explicitly
        # instead of relying on inf/NaN arithmetic from a division by zero.
        weight = (stats["count"] > min_samples_leaf).astype(float)
    else:
        # exp() may overflow to inf for tiny smoothing values; the resulting
        # weight of 0 is the correct limit, so the overflow warning is noise.
        with np.errstate(over='ignore'):
            weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))

    # Built as a standalone Series so a target named "mean" or "count" cannot
    # collide with the aggregate columns.
    lookup = (prior * (1 - weight) + stats["mean"] * weight).rename('average').reset_index()

    # Apply averages to trn and tst series
    out_name = trn_series.name + '_mean'
    ft_trn_series = _lookup_target_encoding(trn_series, lookup, prior, out_name)
    ft_tst_series = _lookup_target_encoding(tst_series, lookup, prior, out_name)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [None]:
# Bucket the feature columns of trainsub by dtype. The first column is skipped
# (presumably an identifier column -- TODO confirm).
floattypes, inttypes, stringtypes = [], [], []
for c in trainsub.columns[1:]:
    if trainsub[c].dtype == 'object':
        # Object columns are cast to plain strings in both frames so train and
        # test hold the same kind of values.
        trainsub[c] = trainsub[c].astype('str')
        testsub[c] = testsub[c].astype('str')
        stringtypes.append(c)
        continue
    # Only int64 counts as integer; every other non-object dtype is collected
    # in floattypes.
    (inttypes if trainsub[c].dtype == 'int64' else floattypes).append(c)

In [None]:
# Shuffled 5-fold splitter; the fixed seed makes the fold assignment
# reproducible across runs.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Out-of-fold target encoding.
# For every fold, the encoding is fitted on the visible rows only and written
# to the held-out rows of trainsub; the test encoding is the average of the
# per-fold encodings.
# NOTE: the source column is dropped at the end, so this cell cannot be re-run
# without reloading trainsub / testsub.
for col in ['NAME_CONTRACT_TYPE']:
    # NOTE(review): the original carried a "#stringtypes" remark here --
    # presumably the loop is meant to cover all of stringtypes eventually.
    te_col = 'te_' + col
    trainsub[te_col] = 0.
    testsub[te_col] = 0.

    # Share of test rows whose category never occurs in train.
    # NOTE(review): this is 0.0 whenever every test category also occurs in
    # train; confirm that calling target_encode with smoothing=0 is intended.
    SMOOTHING = float((~testsub[col].isin(trainsub[col])).mean())
    print(SMOOTHING)

    # Derive the averaging weight from the splitter instead of hard-coding
    # 0.2, so it stays correct if n_splits changes.
    n_folds = kf.get_n_splits()

    for vis_index, blind_index in kf.split(trainsub):
        # kf.split yields *positional* indices; translate them to index labels
        # before using the label-based .loc accessor.
        vis_labels = trainsub.index[vis_index]
        blind_labels = trainsub.index[blind_index]

        encode_kwargs = dict(target=trainsub.loc[vis_labels, 'TARGET'],
                             min_samples_leaf=100,
                             smoothing=SMOOTHING,
                             noise_level=0.0)

        # Encode the held-out fold with statistics from the visible folds.
        blind_encoded = target_encode(trainsub.loc[vis_labels, col],
                                      trainsub.loc[blind_labels, col],
                                      **encode_kwargs)[1]
        trainsub.loc[blind_labels, te_col] = blind_encoded

        # Encode the test set with the same fold statistics and accumulate the
        # equally weighted average over folds.
        x = target_encode(trainsub.loc[vis_labels, col],
                          testsub[col],
                          **encode_kwargs)[1]
        testsub[te_col] += x / n_folds

    trainsub.drop(col, inplace=True, axis=1)
    testsub.drop(col, inplace=True, axis=1)