In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.simplefilter('ignore')

In [None]:
import pickle

def to_pickle(filename, obj):
    """Serialize ``obj`` to the file at ``filename`` using pickle.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)
        
def unpickle(filename):
    """Load and return the object pickled in the file at ``filename``.

    The file is opened in binary read mode. Only use this on files produced
    by a trusted source: unpickling can execute arbitrary code.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [None]:
# Number of cross-validation folds: passed to StratifiedKFold as n_splits in update_seed
# and used as the bound of the per-fold loops in the training cell.
FOLD_NUM = 4

In [None]:
# Load the previous-competition pet table and add 0/1 flags marking membership of
# Breed1 in a few hand-picked groups of breed codes.
pre_pet_df = pd.read_csv('../input/pp-stacking-avg-v019-nlp-feature/prepetfinder_add_emoji.csv')
for flag_name, breed_codes in [
    ('Breed_266_265_299', [266, 265, 299]),
    ('Breed_292_179', [292, 179]),
    ('Breed_285_103', [285, 103]),
]:
    pre_pet_df[flag_name] = pre_pet_df['Breed1'].isin(breed_codes).astype(int)
pre_pet_df.head()

In [None]:
# Disabled feature engineering, kept as a bare string literal (so this cell only echoes the text).
# It shows how 'Breed_179', 'Breed_103', 'RescuerID_8b6c', 'Name_len', 'Name_word_len',
# 'Description_len' and 'Description_photo' were originally derived from the raw train/test CSVs.
# NOTE(review): presumably these columns now arrive precomputed in the CSV loaded above,
# since a later merge selects them from pre_pet_df — confirm before deleting this cell.
"""pre_pet_df = pd.concat([pd.read_csv('../input/prepetfinder/pre_petfinder/train/train.csv'), pd.read_csv('../input/prepetfinder/pre_petfinder_test/test/test.csv')])
pre_pet_df['Breed_179'] = (pre_pet_df['Breed1'] == 179).astype(int)
pre_pet_df['Breed_103'] = (pre_pet_df['Breed1'] == 103).astype(int)
pre_pet_df['RescuerID_8b6c'] = (pre_pet_df['RescuerID'] == '8b6c5cd067ada5f54ca5ffc7f7b5d896').astype(int)
pre_pet_df['Name_len'] = pre_pet_df['Name'].fillna('').apply(lambda x : len(x))
pre_pet_df['Name_word_len'] = pre_pet_df['Name'].fillna('').apply(lambda x : len(x.split(' ')))
pre_pet_df['Description_len'] = pre_pet_df['Description'].fillna('').apply(lambda x : len(x.split(' ')))
pre_pet_df['Description_photo'] = pre_pet_df['Description_len'] * pre_pet_df['PhotoAmt']
"""

In [None]:
from tqdm import tqdm
# Registers tqdm's progress_* methods on pandas objects. No visible cell calls them,
# so this looks like a leftover from the disabled merge_emoji step below.
tqdm.pandas()


# Disabled: merge_emoji is not defined in any visible cell.
# NOTE(review): presumably it produced the 'is_emoji' column that now comes from the CSV — confirm.
#pre_pet_df = merge_emoji(pre_pet_df)

# Sanity check of the table after the breed flags were added.
print(pre_pet_df.shape)
pre_pet_df.head()

In [None]:
# List the columns currently available in pre_pet_df.
print(pre_pet_df.columns)
#pre_pet_df['is_emoji'].value_counts()

In [None]:
# Write the augmented table (including the breed flags added above) to the working directory,
# without the index. The file name equals the base name of the input CSV.
# NOTE(review): presumably this output feeds the next dataset version — confirm.
pre_pet_df.to_csv('prepetfinder_add_emoji.csv', index=False)

In [None]:
# Number of '&' characters in each pet name (missing names count as 0).
pre_pet_df['num_name_&'] = pre_pet_df['Name'].fillna('').apply(lambda name: name.count('&'))

# Flag pets whose rescuer appears at most 6 times in this table.
# The counts are filtered on the value_counts() Series itself rather than on
# value_counts().reset_index(): the column names of the latter changed in pandas 2.0
# ('index'/'RescuerID' became 'RescuerID'/'count'), which broke the old column-name lookup.
rescuer_counts = pre_pet_df['RescuerID'].value_counts()
rare_rescuer_ids = rescuer_counts[rescuer_counts <= 6].index
pre_pet_df['RescuerID_new_entry'] = pre_pet_df['RescuerID'].isin(rare_rescuer_ids).astype(int)

In [None]:
# DataFrame that already contains the out-of-fold (OOF) predictions.
train_df_org = pd.read_csv('../input/pet-oofs/ensemble_pp_0128_oofs.csv')

# Additional OOF columns: target column name -> CSV providing 'Id' and 'oof' columns.
# Each file is sorted by Id and re-indexed from 0 so that the values line up with
# train_df_org by row position.
extra_oof_files = {
    'oof_0149': '../input/pet-oofs/oof_DESKTOP-M3SEAIN_exp0149.csv',
    'oof_0156': '../input/pet-oofs/oof_DESKTOP-M3SEAIN_exp0156.csv',
    'oof_0158': '../input/pet-oofs/oof_DESKTOP-M3SEAIN_exp0158.csv',
    'k_oof_0084': '../input/petfinder2-oof-dup-regenerate/oof084.csv',
    'k_oof_0085': '../input/petfinder2-oof-dup-regenerate/oof085.csv',
}
for oof_column, oof_path in extra_oof_files.items():
    sorted_oof_df = pd.read_csv(oof_path).sort_values('Id').reset_index(drop=True)
    train_df_org[oof_column] = sorted_oof_df['oof']


## When adding another OOF, attach it to train_df_org.
## train_df_org is sorted by Id, so the frame being added must always be sorted by Id as well.
# Example:
# train_df_org['new_oof'] = pd.read_csv('new_oof.csv').sort_values('Id').reset_index(drop=True)['oof_column']

In [None]:
# Attach the 'sentiment_name_label_roberta' column by PetID; the left join keeps every training row.
sentiment_columns = ['PetID', 'sentiment_name_label_roberta']
sentiment_df = pd.read_csv('../input/pp-stacking-avg-v021-bert-sentiment/prepetfinder_add_bert_sentiment.csv')[sentiment_columns]
train_df_org = train_df_org.merge(sentiment_df, how='left', on='PetID')

In [None]:
# Columns copied from pre_pet_df into the training frame, joined on PetID.
pre_pet_columns = [
    'PetID',
    'Breed_179', 'Breed_103', 'RescuerID_8b6c',
    'Name_len', 'Description_len', 'Description_photo',
    'Quantity', 'Type', 'Name_word_len',
    'is_emoji',
    'Breed_266_265_299', 'Breed_292_179', 'Breed_285_103',
    'num_name_&',
    'RescuerID_new_entry',
]
# The left join keeps every training row. fillna(0) then replaces every remaining NaN in the
# whole merged frame (not only in the newly joined columns) with 0.
pre_pet_features = pre_pet_df[pre_pet_columns]
train_df_org = train_df_org.merge(pre_pet_features, how='left', on='PetID').fillna(0)

In [None]:
# Inputs of the base adjustment model fitted on all rows with sim >= 0.65.
base_features = [
    'oof',
    'Breed_307',
    'Breed_266_265_299',
    'Breed_292_179',
    'Breed_285_103',
    'Age',
    'RescuerID_3',
    'RescuerID_5',
    'Description_len',
    'PhotoAmt',
    'Name_word_len',
    'is_emoji',
    #'sentiment_name_label_roberta',
    #'num_name_&',
    'RescuerID_new_entry',
]

# Inputs of the per-adoptionSpeed adjustment models (one list per adoptionSpeed value 0-4).
ad0_features = ['oof', 'Age']
ad1_features = ['oof', 'Age']
ad2_features = ['oof', 'Breed_205', 'Breed_307', 'Breed_266_265_299', 'Age']
ad3_features = ['oof', 'Breed_307', 'Breed_266_265_299', 'Age']
ad4_features = ['oof', 'Breed_307', 'Breed_266_265_299', 'Age']


In [None]:
# Snapshot of train_df_org taken before the seed loop starts reassigning it.
# It is not read again in any visible cell.
before = train_df_org.copy()

## Fold creation

In [None]:
def update_seed(seed=42):
    """Assign a stratified fold number to the rows of the global ``train_df_org``.

    The rows are partitioned into six groups, and every group is split
    independently so that each fold receives a share of each group:

    1. ``sim < 0.65`` or ``adoptionSpeed < 0`` (no duplicate in the previous competition);
    2-6. ``sim >= 0.65`` and ``adoptionSpeed`` equal to 0, 1, 2, 3 and 4 respectively.

    Each group is split with a shuffled ``StratifiedKFold`` (``FOLD_NUM`` splits,
    stratified on the existing 'Pawpularity_bin' column), and the fold index of
    the held-out part is written to 'fold_num'.

    Side effect: sets 'fold_num' to -1 on the global ``train_df_org``.
    Rows that fall in none of the six groups are not part of the result.

    Args:
        seed: ``random_state`` of the ``StratifiedKFold`` shuffle.

    Returns:
        A new DataFrame holding the six groups, sorted by 'Id' with a fresh 0..n-1 index.
    """
    # 'Pawpularity_bin' has to be present already; the line that used to derive it is disabled:
    #train_df_org['Pawpularity_bin'] = pd.cut(train_df_org['Pawpularity'], 10, labels=False)
    train_df_org['fold_num'] = -1

    # Group 1: no duplicate in the previous competition.
    group_masks = [(train_df_org['sim'] < 0.65) | (train_df_org['adoptionSpeed'] < 0)]
    # Groups 2-6: duplicate in the previous competition with AdoptionSpeed 0..4.
    group_masks += [
        (train_df_org['sim'] >= 0.65) & (train_df_org['adoptionSpeed'] == speed)
        for speed in range(5)
    ]
    groups = [train_df_org[mask].copy().reset_index(drop=True) for mask in group_masks]

    splitter = StratifiedKFold(n_splits=FOLD_NUM, random_state=seed, shuffle=True)
    for group in groups:
        fold_splits = splitter.split(group, group['Pawpularity_bin'])
        for fold, (_, holdout_positions) in enumerate(fold_splits):
            # The index was reset above, so positions returned by split() are also index labels.
            group.loc[holdout_positions, 'fold_num'] = fold

    return pd.concat(groups).sort_values('Id').reset_index(drop=True)


In [None]:
def pp_stacking(phase='train'):
    """Stack the level-1 OOF columns, then adjust rows that duplicate the previous competition.

    Works on the global ``train_pp_df`` (which gains the columns 'oof' and
    'stacking_oof'):

    1. A BayesianRidge stacker maps the level-1 OOF columns to 'Pawpularity';
       its prediction is stored in 'oof' and copied to 'stacking_oof'.
    2. Rows with ``sim >= 0.65`` get 'oof' replaced by a base BayesianRidge on
       ``base_features``.
    3. Those rows are split by ``adoptionSpeed`` (0-4), and each part gets 'oof'
       replaced once more by its own BayesianRidge on ``ad0_features`` ..
       ``ad4_features``. Rows with any other adoptionSpeed keep the step-2 value;
       rows with ``sim < 0.65`` keep the step-1 value.

    Args:
        phase: ``'train'`` fits all models on ``train_pp_df``. Any other value
            reuses the most recently appended entry of ``model_folds_stacking``
            and only predicts.

    Returns:
        ``(train_adjust_df, model_as)``: the re-assembled rows sorted by their
        'index' column, and the model list
        ``[stacker, lr0, lr1, lr2, lr3, lr4, lr_base]``.
    """
    global train_pp_df

    # Single definition of the stacker inputs, shared by fit and predict.
    # The original code spelled the list out twice and the predict copy had drifted
    # ('oof_0149' missing, 'oof_0158' listed twice), so the learned coefficients were
    # applied to the wrong columns.
    stacking_features = ['oof_0158', 'oof_0149', 'oof_0080', 'oof_c0007', 'oof_johannyjm_1790651', 'oof_0042',
                         'oof_0044',
                         'oof_0055', 'oof_fastai_0004',
                         'k_oof_0084',
                         'oof_0062', 'oof_0126', 'oof_0074']
    adjust_features = [ad0_features, ad1_features, ad2_features, ad3_features, ad4_features]
    is_train = phase == 'train'

    if is_train:
        stacking_model = BayesianRidge()
        stacking_model.fit(train_pp_df[stacking_features], train_pp_df['Pawpularity'])
        model_as = [stacking_model]
    else:
        model_as = model_folds_stacking[-1]

    train_pp_df['oof'] = model_as[0].predict(train_pp_df[stacking_features])
    train_pp_df['stacking_oof'] = train_pp_df['oof']

    under_threshold_df = train_pp_df[train_pp_df['sim'] < 0.65].copy()
    over_threshold_df = train_pp_df[train_pp_df['sim'] >= 0.65].copy()

    if is_train:
        lr_base = BayesianRidge()
        lr_base.fit(over_threshold_df[base_features], over_threshold_df['Pawpularity'])
    else:
        lr_base = model_as[6]
    # The per-speed models below consume this base-adjusted 'oof', so it must be written
    # before the frame is split.
    over_threshold_df['oof'] = lr_base.predict(over_threshold_df[base_features])

    known_speeds = list(range(5))
    speed_dfs = [over_threshold_df[over_threshold_df['adoptionSpeed'] == speed].copy() for speed in known_speeds]
    over_threshold_df_other = over_threshold_df[~over_threshold_df['adoptionSpeed'].isin(known_speeds)].copy()

    if is_train:
        for speed_df, features in zip(speed_dfs, adjust_features):
            model_as.append(BayesianRidge().fit(speed_df[features], speed_df['Pawpularity']))
        # lr_base goes last so the list layout stays [stacker, lr0..lr4, lr_base].
        model_as.append(lr_base)

    for speed_model, speed_df, features in zip(model_as[1:6], speed_dfs, adjust_features):
        # An adoptionSpeed value may be absent from the frame being scored; predict() rejects
        # empty input, so such a group is left as is.
        if len(speed_df) > 0:
            speed_df['oof'] = speed_model.predict(speed_df[features])

    train_adjust_df = pd.concat([under_threshold_df, *speed_dfs, over_threshold_df_other]).sort_values('index')
    return train_adjust_df, model_as

In [None]:
def pp_average(phase='train'):
    """Average the level-1 OOF columns, then adjust rows that duplicate the previous competition.

    Works on the global ``train_pp_df`` (which gains the columns 'oof' and
    'average_oof'):

    1. 'oof' is the plain row-wise mean of the level-1 OOF columns; it is
       copied to 'average_oof'.
    2. Rows with ``sim >= 0.65`` get 'oof' replaced by a base BayesianRidge on
       ``base_features``.
    3. Those rows are split by ``adoptionSpeed`` (0-4), and each part gets 'oof'
       replaced once more by its own BayesianRidge on ``ad0_features`` ..
       ``ad4_features``. Rows with any other adoptionSpeed keep the step-2 value;
       rows with ``sim < 0.65`` keep the step-1 value.

    Args:
        phase: ``'train'`` fits the adjustment models on ``train_pp_df``. Any
            other value reuses the most recently appended entry of
            ``model_folds_average`` and only predicts.

    Returns:
        ``(train_adjust_df, model_as)``: the re-assembled rows sorted by their
        'index' column, and the model list
        ``[placeholder, lr0, lr1, lr2, lr3, lr4, lr_base]``.
    """
    global train_pp_df

    average_features = ['oof_0158', 'oof_0149', 'oof_0080', 'oof_c0007', 'oof_johannyjm_1790651', 'oof_0042',
                        'k_oof_0085',
                        'oof_0055', 'oof_fastai_0004',
                        'k_oof_0084',
                        'oof_0062', 'oof_0126', 'oof_0074']
    adjust_features = [ad0_features, ad1_features, ad2_features, ad3_features, ad4_features]
    is_train = phase == 'train'

    if is_train:
        # Unfitted placeholder in slot 0, so the adjustment models sit at indices 1-6
        # (the same positions pp_stacking uses for them).
        model_as = [BayesianRidge()]
    else:
        model_as = model_folds_average[-1]

    # Average the columns of the frame being processed. The original read them from the
    # global train_df_org and relied on index alignment during assignment, which is only
    # correct while train_pp_df is an index-preserving subset of train_df_org.
    train_pp_df['oof'] = train_pp_df[average_features].mean(axis=1)
    train_pp_df['average_oof'] = train_pp_df['oof']

    under_threshold_df = train_pp_df[train_pp_df['sim'] < 0.65].copy()
    over_threshold_df = train_pp_df[train_pp_df['sim'] >= 0.65].copy()

    if is_train:
        lr_base = BayesianRidge()
        lr_base.fit(over_threshold_df[base_features], over_threshold_df['Pawpularity'])
    else:
        lr_base = model_as[6]
    # The per-speed models below consume this base-adjusted 'oof', so it must be written
    # before the frame is split.
    over_threshold_df['oof'] = lr_base.predict(over_threshold_df[base_features])

    known_speeds = list(range(5))
    speed_dfs = [over_threshold_df[over_threshold_df['adoptionSpeed'] == speed].copy() for speed in known_speeds]
    over_threshold_df_other = over_threshold_df[~over_threshold_df['adoptionSpeed'].isin(known_speeds)].copy()

    if is_train:
        for speed_df, features in zip(speed_dfs, adjust_features):
            model_as.append(BayesianRidge().fit(speed_df[features], speed_df['Pawpularity']))
        # lr_base goes last so the list layout stays [placeholder, lr0..lr4, lr_base].
        model_as.append(lr_base)

    for speed_model, speed_df, features in zip(model_as[1:6], speed_dfs, adjust_features):
        # An adoptionSpeed value may be absent from the frame being scored; predict() rejects
        # empty input, so such a group is left as is.
        if len(speed_df) > 0:
            speed_df['oof'] = speed_model.predict(speed_df[features])

    train_adjust_df = pd.concat([under_threshold_df, *speed_dfs, over_threshold_df_other]).sort_values('index')

    return train_adjust_df, model_as

In [None]:
%%time

# Repeated cross-validation driver: for 20 fold-assignment seeds, run the stacking and the
# averaging pipelines fold by fold and record the RMSE of their blend.
#
# Accumulators over all seeds:
#   rmses                    - RMSE of the 0.75 * stacking + 0.25 * average blend of the adjusted 'oof'
#   rmses_wo_pp              - RMSE of the mean of the unadjusted 'stacking_oof' and 'average_oof'
#   result_dfs_all           - validation frames of BOTH pipelines (two entries per seed)
#   result_dfs_all_stacking  - validation frames of pp_stacking only
#   result_dfs_all_average   - validation frames of pp_average only
#   model_folds_*            - one model list per (seed, fold); pickled at the end of the notebook
rmses = []
rmses_wo_pp = []
result_dfs_all = []
result_dfs_all_stacking = []
result_dfs_all_average = []

model_folds_stacking = []
model_folds_average = []

for i, seed in enumerate(range(42, 62)):
    #print(f'====={i}=====')
    
    # Re-draw the fold assignment for this seed (also re-sorts train_df_org by Id).
    train_df_org = update_seed(seed=seed)

    
    result_dfs = []
    # NOTE(review): params_fold is assigned but never read in the visible code.
    params_fold = []
    for fold in range(FOLD_NUM):

        #print(f'====={fold}=====')
        # pp_stacking reads the global train_pp_df, so it is pointed at the training part first.
        train_pp_df = train_df_org[train_df_org['fold_num'] != fold].copy()
        cv_valid_df = train_df_org[train_df_org['fold_num'] == fold].copy()

        # Training
        result_df, model_result = pp_stacking()
        # Must be appended before the 'valid' call below: that call uses model_folds_stacking[-1].
        model_folds_stacking.append(model_result)
        #print('train score:{}'.format(np.sqrt(mean_squared_error(result_df['Pawpularity'], result_df['oof']))))

        # Inference on the held-out fold
        train_pp_df = cv_valid_df
        result_df, _ = pp_stacking(phase='valid')
        #print('valid score:{}'.format(np.sqrt(mean_squared_error(result_df['Pawpularity'], result_df['oof']))))
        result_dfs.append(result_df)

    ## OOF score of the stacking pipeline
    # Both names hold the same concatenation of this seed's validation frames.
    result_all_df = pd.concat(result_dfs)
    result_all_df_stacking = pd.concat(result_dfs)
    #print('#' * 35)
    #print('stacking score:{}'.format(np.sqrt(mean_squared_error(result_all_df['Pawpularity'], result_all_df['oof']))))
    #print('#' * 35)
    
    result_dfs_all.append(result_all_df.copy())
    result_dfs_all_stacking.append(result_all_df.copy())
    
    
    # Same fold loop again for the averaging pipeline.

    
    result_dfs_average = []
    params_fold = []
    for fold in range(FOLD_NUM):

        #print(f'====={fold}=====')
        train_pp_df = train_df_org[train_df_org['fold_num'] != fold].copy()
        cv_valid_df = train_df_org[train_df_org['fold_num'] == fold].copy()

        result_df_average, model_result_average = pp_average()
        # Must be appended before the 'valid' call below: that call uses model_folds_average[-1].
        model_folds_average.append(model_result_average)
        #print('train score:{}'.format(np.sqrt(mean_squared_error(result_df_average['Pawpularity'], result_df_average['oof']))))

        train_pp_df = cv_valid_df

        result_df_average, _ = pp_average(phase='valid')
        #print('valid score:{}'.format(np.sqrt(mean_squared_error(result_df_average['Pawpularity'], result_df_average['oof']))))
        result_dfs_average.append(result_df_average)

    result_all_df_average = pd.concat(result_dfs_average)
    #print('#' * 35)
    #print(np.sqrt(mean_squared_error(result_all_df_average['Pawpularity'], result_all_df_average['oof'])))
    #print('#' * 35)
    
    
    # Per-Id adjusted predictions of each pipeline, ordered by Id.
    stacking_oof = result_all_df_stacking.groupby('Id')['oof'].mean().reset_index().sort_values('Id')['oof'].to_numpy()
    average_oof = result_all_df_average.groupby('Id')['oof'].mean().reset_index().sort_values('Id')['oof'].to_numpy()
    
    
    # Blend with weights 0.75 (stacking) / 0.25 (average) and score against the targets ordered by Id.
    stacking_average_rmse = np.sqrt(mean_squared_error(result_all_df.sort_values('Id')['Pawpularity'], np.average([stacking_oof, average_oof], weights=[0.75, 0.25], axis=0)))
    #stacking_average_rmse = np.sqrt(mean_squared_error(result_all_df['Pawpularity'], (result_all_df.reset_index(drop=True)['oof'] + result_all_df_average.reset_index(drop=True)['oof']) / 2))
    # Score of the unadjusted predictions. The two frames are combined by row position, which assumes
    # both pipelines return the validation rows in the same order.
    # NOTE(review): this uses equal weights, whereas the adjusted score above uses 0.75/0.25 — confirm intended.
    stacking_average_rmse_wo_pp = np.sqrt(mean_squared_error(result_all_df['Pawpularity'], (result_all_df.reset_index(drop=True)['stacking_oof'] + result_all_df_average.reset_index(drop=True)['average_oof']) / 2))
    
    
    print( f'### {i} ' + '#' * 25)
    print(stacking_average_rmse)
    #print('#' * 35)
    
    rmses.append(stacking_average_rmse)
    rmses_wo_pp.append(stacking_average_rmse_wo_pp)
    
    result_dfs_all.append(result_all_df_average.copy())
    result_dfs_all_average.append(result_all_df_average.copy())

In [None]:
import matplotlib.pyplot as plt

# Distribution of the blended CV RMSE over the fold-assignment seeds.
# Uses the explicit Axes interface with a title and axis labels so the figure can be read
# on its own; plt.show() keeps the (counts, bins, patches) repr out of the cell output.
fig, ax = plt.subplots()
ax.hist(rmses, bins=20)
ax.set(title='Blended CV RMSE across seeds', xlabel='RMSE', ylabel='Number of seeds')
plt.show()

## score

In [None]:
# RMSE of the per-Id mean 'oof' over every collected validation frame.
# result_dfs_all holds the frames of both pipelines for every seed, so this is an
# equal-weight blend of stacking and averaging across all seeds.
all_runs_df = pd.concat(result_dfs_all)
mean_oof_by_id = all_runs_df.groupby('Id')['oof'].mean().to_frame().reset_index().sort_values('Id')
targets_by_id = result_all_df.sort_values('Id')['Pawpularity']
np.sqrt(mean_squared_error(targets_by_id, mean_oof_by_id['oof']))

In [None]:
# Per-Id mean of the adjusted predictions over all seeds, separately for each pipeline.
stacking_oof = (
    pd.concat(result_dfs_all_stacking)
    .groupby('Id')['oof'].mean()
    .reset_index()
    .sort_values('Id')['oof']
    .to_numpy()
)
average_oof = (
    pd.concat(result_dfs_all_average)
    .groupby('Id')['oof'].mean()
    .reset_index()
    .sort_values('Id')['oof']
    .to_numpy()
)
# RMSE of the 0.75 (stacking) / 0.25 (average) blend against the targets ordered by Id.
weighted_blend = np.average([stacking_oof, average_oof], weights=[0.75, 0.25], axis=0)
np.sqrt(mean_squared_error(result_all_df.sort_values('Id')['Pawpularity'], weighted_blend))

## score without pp

In [None]:
# RMSE of the per-Id mean 'stacking_oof' (the stacker output before any adjustment).
# NOTE(review): presumably only the pp_stacking frames in result_dfs_all carry this column,
# so the pp_average frames contribute nothing here — confirm.
all_runs_df = pd.concat(result_dfs_all)
mean_stacking_oof_by_id = all_runs_df.groupby('Id')['stacking_oof'].mean().to_frame().reset_index().sort_values('Id')
targets_by_id = result_all_df.sort_values('Id')['Pawpularity']
np.sqrt(mean_squared_error(targets_by_id, mean_stacking_oof_by_id['stacking_oof']))

In [None]:
# Persist the model lists collected for every (seed, fold) so they can be reused outside this notebook.
for pickle_path, fold_models in (
    ('./pp_models_avg.pkl', model_folds_average),
    ('./pp_models_stacking.pkl', model_folds_stacking),
):
    to_pickle(pickle_path, fold_models)