In [1]:
%%time
import pandas as pd
import numpy as np
from tqdm import tqdm 
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Load the labelled (train) and unlabelled (test) splits.
train = pd.read_csv('train_data.csv')
# Rounded salary; used further down as the label handed to the fold splitter.
train['salary_round'] = np.round(train['salary'])
test = pd.read_csv('test_data.csv')

# Stack both splits so encoders and group statistics see the same rows.
# sort=False keeps the incoming column order and silences the pandas
# FutureWarning about the changing default; ignore_index=True gives every row
# a unique label instead of repeating each file's own 0..n-1 labels.
df = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

# Integer-encode `area` over train + test together.
lbl = LabelEncoder()
df['area'] = lbl.fit_transform(df['area'])

# Bucket age into six ordinal bins with edges 0/23/30/40/50/60/100.
# NOTE(review): the int cast fails if any age is missing or falls outside
# the bin edges (pd.cut yields NaN there) -- confirm the data never does.
df['age_bin'] = pd.cut(df['age'], [0, 23, 30, 40, 50, 60, 100], labels=[1, 2, 3, 4, 5, 6])
df['age_bin'] = df['age_bin'].astype('int')

    
## aggregation features
def _group_mode(values):
    """Return the first mode of `values`, or NaN when the group has no mode."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else float('nan')


def agg(df, agg_cols, verbose=True):
    """Add one group-level feature column to `df` per spec in `agg_cols`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich. It is modified in place and also returned.
    agg_cols : iterable of dict
        Each spec has the keys ``'groupby'`` (list of column names),
        ``'target'`` (column name) and ``'agg'``. The new column is named
        ``'<groupby joined by _>_<agg>_<target>'``.
        Special ``'agg'`` values:
          * ``'mode'``     - first mode of the target in the group, broadcast
                             to every row of that group
          * ``'diff'``     - within-group first difference of the target
          * ``'cumcount'`` - running row number inside the group (target is
                             only used for the column name)
          * ``'shift'``    - within-group shift of the target by one row
        Anything else is passed straight to ``GroupBy.transform``.
    verbose : bool, default True
        Print each spec and its ``'agg'`` value as it is processed.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the new columns appended.
    """
    for c in tqdm(agg_cols):
        if verbose:
            print(c)
            print(c['agg'])
        new_feature = '{}_{}_{}'.format('_'.join(c['groupby']), c['agg'], c['target'])
        grouped = df.groupby(c['groupby'])
        if c['agg'] == 'mode':
            # transform broadcasts the per-group scalar back onto the rows of
            # that group. The previous apply(...).reset_index(drop=True)
            # produced one value per group and aligned it by position, so rows
            # received the wrong group's mode or NaN.
            df[new_feature] = grouped[c['target']].transform(_group_mode)
        elif c['agg'] == 'diff':
            df[new_feature] = grouped[c['target']].transform(lambda x: x.diff())
        elif c['agg'] == 'cumcount':
            df[new_feature] = grouped.cumcount()
        elif c['agg'] == 'shift':
            df[new_feature] = grouped[c['target']].shift()
        else:
            df[new_feature] = grouped[c['target']].transform(c['agg'])
    return df

# Aggregation plan: (group-by keys, target column, statistics to compute).
# It is expanded below into one spec dict per statistic, in exactly this order.
_SPREAD_STATS = ['mean', 'max', 'min', 'std']
_FULL_STATS = _SPREAD_STATS + ['median', 'var', 'skew']
_RANGE_STATS = ['mean', 'max', 'min']

_agg_plan = [
    # position-level profile of area / education / age
    (['position'], 'area', ['nunique']),
    (['position'], 'education', ['mean']),
    (['position'], 'age', ['max', 'min', 'mean', 'std', 'nunique']),
    # commute statistics per position
    (['position'], 'commute', _SPREAD_STATS),
    # commute statistics per age and per area (extended set)
    (['age'], 'commute', _FULL_STATS),
    (['area'], 'commute', _FULL_STATS),
]
# commute statistics per remaining single key
_agg_plan += [
    ([_key], 'commute', _SPREAD_STATS)
    for _key in ['education', 'sex', 'service_length', 'num_child', 'partner']
]
# commute statistics per (position, other key)
_agg_plan += [
    (['position', _key], 'commute', _RANGE_STATS)
    for _key in ['sex', 'age', 'area', 'education', 'num_child', 'partner']
]
# commute statistics per (position, num_child, partner)
_agg_plan += [(['position', 'num_child', 'partner'], 'commute', _RANGE_STATS)]
# commute statistics per (position, other key, num_child, partner)
_agg_plan += [
    (['position', _key, 'num_child', 'partner'], 'commute', _RANGE_STATS)
    for _key in ['age', 'area', 'education', 'sex']
]

agg_cols = [
    {'groupby': list(_keys), 'target': _target, 'agg': _stat}
    for _keys, _target, _stats in _agg_plan
    for _stat in _stats
]

# Materialise every group-level aggregate declared in agg_cols.
df = agg(df, agg_cols)

# ---------------------------------------------------------------------------
# Row value vs. single-key group statistic of commute.
# For each statistic: all diff_* columns first, then all ratio_* columns.
# ---------------------------------------------------------------------------
_single_keys = ['position', 'age', 'area', 'education', 'sex',
                'service_length', 'num_child', 'partner']
for _stat in ('mean', 'max', 'min'):
    _stat_cols = ['{}_{}_commute'.format(_key, _stat) for _key in _single_keys]
    for _col in _stat_cols:
        df['diff_' + _col] = df[_col] - df['commute']
    for _col in _stat_cols:
        df['ratio_' + _col] = df[_col] / df['commute']

# ---------------------------------------------------------------------------
# Row age / education vs. the statistic of the row's position group.
# ---------------------------------------------------------------------------
_age_cols = ['position_{}_age'.format(_stat) for _stat in ('mean', 'max', 'min')]
for _col in _age_cols:
    df['diff_' + _col] = df[_col] - df['age']
for _col in _age_cols:
    df['ratio_' + _col] = df[_col] / df['age']

df['diff_position_mean_education'] = df['position_mean_education'] - df['education']
df['ratio_position_mean_education'] = df['position_mean_education'] / df['education']

# ---------------------------------------------------------------------------
# Row commute vs. (position, other key) group statistic.
# ---------------------------------------------------------------------------
_pair_cols = [
    'position_{}_{}_commute'.format(_key, _stat)
    for _key in ('sex', 'age', 'area', 'education', 'num_child', 'partner')
    for _stat in ('max', 'min', 'mean')
]
for _col in _pair_cols:
    df['diff_' + _col] = df[_col] - df['commute']
for _col in _pair_cols:
    df['ratio_' + _col] = df[_col] / df['commute']

# ---------------------------------------------------------------------------
# Row commute vs. three- and four-key group statistics.
# The flag says whether ratio_* columns are produced for that grouping; the
# area and education groupings only get diff_* columns.
# ---------------------------------------------------------------------------
_multi_groups = [
    ('position_num_child_partner', True),
    ('position_age_num_child_partner', True),
    ('position_area_num_child_partner', False),
    ('position_education_num_child_partner', False),
    ('position_sex_num_child_partner', True),
]
for _group, _with_ratio in _multi_groups:
    _group_cols = ['{}_{}_commute'.format(_group, _stat) for _stat in ('max', 'min', 'mean')]
    for _col in _group_cols:
        df['diff_' + _col] = df[_col] - df['commute']
    if _with_ratio:
        for _col in _group_cols:
            df['ratio_' + _col] = df[_col] / df['commute']

# Overtime / commute interaction ratios (the +1 keeps the denominators non-zero
# when the scaled term is 0).
df['overtime_commute_month_ratio'] = df['overtime'] / (df['commute'] * 30 + 1)
df['commute_overtime_day_ratio'] = df['commute'] / (df['overtime'] / 30 + 1)

df['sum_diff_partner_num_child_mean_commute'] = (
    df['diff_partner_mean_commute'] + df['diff_num_child_mean_commute']
)

# Hand-weighted linear combinations of position, age and education.
df['point1'] = df['position'] * 10 + df['age'] + df['education'] * 10
df['point2'] = df['position'] * 5 + df['education'] * 3
df['point3'] = df['age'] + df['education'] * 3

df.columns.values

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.
 21%|██        | 16/78 [00:00<00:00, 156.84it/s]

{'groupby': ['position'], 'target': 'area', 'agg': 'nunique'}
nunique
{'groupby': ['position'], 'target': 'education', 'agg': 'mean'}
mean
{'groupby': ['position'], 'target': 'age', 'agg': 'max'}
max
{'groupby': ['position'], 'target': 'age', 'agg': 'min'}
min
{'groupby': ['position'], 'target': 'age', 'agg': 'mean'}
mean
{'groupby': ['position'], 'target': 'age', 'agg': 'std'}
std
{'groupby': ['position'], 'target': 'age', 'agg': 'nunique'}
nunique
{'groupby': ['position'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['age'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['age'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['age'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['age'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['age'], 'target': 'commute', 

 42%|████▏     | 33/78 [00:00<00:00, 100.39it/s]

{'groupby': ['education'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['education'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['education'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['education'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['sex'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['sex'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['sex'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['sex'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['service_length'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['service_length'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['service_length'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['service_length'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['num_child'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['num_child'], 'target': 'commute', 'agg': 'max'}
max


 60%|██████    | 47/78 [00:00<00:00, 81.70it/s] 

{'groupby': ['num_child'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['num_child'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['partner'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['partner'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['partner'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['partner'], 'target': 'commute', 'agg': 'std'}
std
{'groupby': ['position', 'sex'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'sex'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'sex'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'age'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'age'], 'target': 'commute', 'agg': 'max'}
max


 68%|██████▊   | 53/78 [00:00<00:00, 74.91it/s]

{'groupby': ['position', 'age'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'area'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'area'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'area'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'education'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'education'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'education'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'num_child'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'num_child'], 'target': 'commute', 'agg': 'max'}
max


 83%|████████▎ | 65/78 [00:01<00:00, 64.15it/s]

{'groupby': ['position', 'num_child'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'partner'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'partner'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'partner'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'num_child', 'partner'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'num_child', 'partner'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'num_child', 'partner'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'age', 'num_child', 'partner'], 'target': 'commute', 'agg': 'mean'}
mean


 90%|████████▉ | 70/78 [00:01<00:00, 60.21it/s]

{'groupby': ['position', 'age', 'num_child', 'partner'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'age', 'num_child', 'partner'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'area', 'num_child', 'partner'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'area', 'num_child', 'partner'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'area', 'num_child', 'partner'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'education', 'num_child', 'partner'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'education', 'num_child', 'partner'], 'target': 'commute', 'agg': 'max'}
max


100%|██████████| 78/78 [00:01<00:00, 54.71it/s]

{'groupby': ['position', 'education', 'num_child', 'partner'], 'target': 'commute', 'agg': 'min'}
min
{'groupby': ['position', 'sex', 'num_child', 'partner'], 'target': 'commute', 'agg': 'mean'}
mean
{'groupby': ['position', 'sex', 'num_child', 'partner'], 'target': 'commute', 'agg': 'max'}
max
{'groupby': ['position', 'sex', 'num_child', 'partner'], 'target': 'commute', 'agg': 'min'}
min





CPU times: user 3.31 s, sys: 6.3 s, total: 9.61 s
Wall time: 12 s


In [2]:
%%time
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.metrics import mean_absolute_error


def lgb_kfold(train_df,test_df,features,target,cat_features,folds,params,sampling=False,
              sample_col='salary_residual_abs',sample_threshold=129,
              pred_col='salary_residual_y',stratify_col='salary_round'):
    """Train one LightGBM model per fold and store OOF / averaged test predictions.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Labelled and unlabelled frames. Both are modified in place: a column
        named `pred_col` is added to each.
    features : list of str
        Feature column names fed to the model.
    target : str
        Name of the regression target column in `train_df`.
    cat_features : list
        Accepted for interface compatibility; it is not passed to LightGBM.
    folds : splitter
        Object exposing ``split(X, y)`` and ``n_splits``.
    params : dict
        LightGBM training parameters.
    sampling : bool, default False
        When truthy, rows of each fold's training part are kept only if
        ``train_df[sample_col] < sample_threshold``. Validation rows are
        never filtered.
    sample_col : str, default 'salary_residual_abs'
        Column used for the down-sampling filter.
    sample_threshold : number, default 129
        Exclusive upper bound applied to `sample_col`.
    pred_col : str, default 'salary_residual_y'
        Name of the prediction column written to both frames.
    stratify_col : str, default 'salary_round'
        Column of `train_df` passed as ``y`` to ``folds.split``.

    Returns
    -------
    (train_df, test_df)
        The same objects that were passed in, with `pred_col` added.
    """
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    cv_list = []
    splits = folds.split(train_df[features], train_df[stratify_col])
    for n_fold, (train_idx, valid_idx) in enumerate(splits):
        print('FOLD:' + str(n_fold))

        fold_train = train_df.iloc[train_idx]
        if sampling:
            # Down-sampling: drop training rows at or above the threshold.
            fold_train = fold_train[fold_train[sample_col] < sample_threshold]
        train_x, train_y = fold_train[features], fold_train[target]
        valid_x, valid_y = train_df[features].iloc[valid_idx], train_df[target].iloc[valid_idx]

        print('train_x shape:', train_x.shape)
        print('valid_x shape:', valid_x.shape)

        dtrain = lgb.Dataset(train_x, label=train_y)
        dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases -- confirm the installed version.
        bst = lgb.train(params, dtrain, num_boost_round=50000,
            valid_sets=[dval, dtrain], verbose_eval=200, early_stopping_rounds=200)

        # Gain-based feature importance, largest first.
        ranked = sorted(zip(features, bst.feature_importance('gain')),
                        key=lambda pair: pair[1], reverse=True)
        for item in ranked:
            print(item)

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)
        oof_cv = mean_absolute_error(valid_y, oof_preds[valid_idx])
        cv_list.append(oof_cv)
        print(cv_list)
        # Test prediction is the plain average over the folds.
        sub_preds += bst.predict(test_df[features], num_iteration=bst.best_iteration) / folds.n_splits

    cv = mean_absolute_error(train_df[target], oof_preds)
    print('Full OOF MAE %.6f' % cv)

    train_df[pred_col] = oof_preds
    test_df[pred_col] = sub_preds

    return train_df, test_df


# Stage 1: fit LightGBM on the residual left by the neural-net predictions.
target = 'salary_residual'

# Rows with a salary are the training set; rows without one are the test set.
train_nn = df[df['salary'].notnull()].reset_index(drop=True)
test_nn = df[df['salary'].isnull()].reset_index(drop=True)

# Neural-net predictions are attached by row position.
# NOTE(review): this presumes the CSV rows are in the same order as
# train_nn / test_nn -- confirm against the files.
_nn_train = pd.read_csv('ensemble/train_nn1_oof_down_stratified_1981.csv')
_nn_test = pd.read_csv('ensemble/test_nn1_oof_down_stratified_1981.csv')
train_nn['nn_y'] = _nn_train['nn_y']
test_nn['nn_y'] = _nn_test['nn_y']

# Signed and absolute residual of the neural net on the training rows.
_nn_residual = train_nn['salary'] - train_nn['nn_y']
train_nn['salary_residual'] = _nn_residual
train_nn['salary_residual_abs'] = np.abs(_nn_residual)

cat_features = []
features_new = [
    # area-level commute statistics
    'area_mean_commute',
    'area_std_commute',
    'area_max_commute',
    'area_var_commute',
    'diff_position_area_mean_commute',
    'ratio_position_area_mean_commute',
    # raw columns
    'service_length',
    'age',
    # age-level commute statistics
    'age_mean_commute',
    'age_max_commute',
    # deviations from multi-key group statistics
    'diff_position_num_child_max_commute',
    'diff_position_age_num_child_partner_mean_commute',
    'diff_position_sex_num_child_partner_mean_commute',
    'ratio_position_age_max_commute',
    # sex / service-length commute statistics
    'sex_mean_commute',
    'sex_std_commute',
    'diff_service_length_mean_commute',
]

lgb_params = dict(
    objective="l2_root",
    boosting="gbdt",
    metric="mae",
    max_depth=-1,
    min_data_in_leaf=25,
    min_gain_to_split=0.001,
    min_child_weight=0.001,
    reg_alpha=0.1,
    reg_lambda=0.8,
    num_leaves=15,
    max_bin=200,
    learning_rate=0.01,
    bagging_fraction=1,
    bagging_freq=1,
    feature_fraction=0.4,
    verbosity=-1,
    boost_from_average=False,
)

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=8868)
train_new, test_new = lgb_kfold(
    train_nn, test_nn, features_new, target, cat_features, folds, lgb_params,
    sampling=True,
)

# What is still unexplained after stage 1; this is the target of the next stage.
_stage1_residual = train_new['salary_residual'] - train_new['salary_residual_y']
train_new['salary_residual2'] = _stage1_residual
train_new['salary_residual2_abs'] = np.abs(_stage1_residual)




FOLD:0
train_x shape: (18603, 17)
valid_x shape: (2235, 17)
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 17.7828	valid_0's l1: 19.624
[400]	training's l1: 17.6281	valid_0's l1: 19.6037
Early stopping, best iteration is:
[374]	training's l1: 17.6453	valid_0's l1: 19.5989
('area_mean_commute', 8250187.594604492)
('area_std_commute', 6635168.822875977)
('area_var_commute', 4543569.785888672)
('diff_position_sex_num_child_partner_mean_commute', 3149038.322631836)
('diff_position_age_num_child_partner_mean_commute', 2984782.8149414062)
('age', 2742113.1130371094)
('diff_position_num_child_max_commute', 2503935.3709716797)
('service_length', 2499356.7225341797)
('area_max_commute', 2124143.2255859375)
('age_mean_commute', 2104313.9294433594)
('diff_position_area_mean_commute', 1495520.8410644531)
('diff_service_length_mean_commute', 1396871.0695800781)
('ratio_position_area_mean_commute', 1218964.8579101562)
('ratio_position_age_max_commute', 1041166.542

[200]	training's l1: 17.8397	valid_0's l1: 19.5923
[400]	training's l1: 17.6854	valid_0's l1: 19.561
[600]	training's l1: 17.5706	valid_0's l1: 19.5611
Early stopping, best iteration is:
[456]	training's l1: 17.6513	valid_0's l1: 19.5555
('area_mean_commute', 9894839.141662598)
('area_std_commute', 6106607.610839844)
('area_var_commute', 4211130.076904297)
('diff_position_sex_num_child_partner_mean_commute', 3710418.2127075195)
('age', 3327905.934326172)
('diff_position_age_num_child_partner_mean_commute', 3323385.6364746094)
('diff_position_num_child_max_commute', 2882154.044555664)
('service_length', 2827088.0338134766)
('age_mean_commute', 2703874.0263061523)
('area_max_commute', 1821802.2390136719)
('ratio_position_area_mean_commute', 1688701.4369506836)
('diff_service_length_mean_commute', 1687710.978149414)
('diff_position_area_mean_commute', 1619805.6925048828)
('ratio_position_age_max_commute', 1429459.7885742188)
('age_max_commute', 964389.3706665039)
('sex_mean_commute', 1528

In [3]:
%%time
def lgb_kfold(train_df,test_df,features,target,cat_features,folds,params,sampling=False,
              sample_col='salary_residual2_abs',sample_threshold=129,
              pred_col='salary_residual_y2',stratify_col='salary_round'):
    """Train one LightGBM model per fold and store OOF / averaged test predictions.

    This definition replaces any earlier `lgb_kfold` in the kernel; its
    defaults are the ones used for the second residual stage.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Labelled and unlabelled frames. Both are modified in place: a column
        named `pred_col` is added to each.
    features : list of str
        Feature column names fed to the model.
    target : str
        Name of the regression target column in `train_df`.
    cat_features : list
        Accepted for interface compatibility; it is not passed to LightGBM.
    folds : splitter
        Object exposing ``split(X, y)`` and ``n_splits``.
    params : dict
        LightGBM training parameters.
    sampling : bool, default False
        When truthy, rows of each fold's training part are kept only if
        ``train_df[sample_col] < sample_threshold``. Validation rows are
        never filtered.
    sample_col : str, default 'salary_residual2_abs'
        Column used for the down-sampling filter.
    sample_threshold : number, default 129
        Exclusive upper bound applied to `sample_col`.
    pred_col : str, default 'salary_residual_y2'
        Name of the prediction column written to both frames.
    stratify_col : str, default 'salary_round'
        Column of `train_df` passed as ``y`` to ``folds.split``.

    Returns
    -------
    (train_df, test_df)
        The same objects that were passed in, with `pred_col` added.
    """
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    cv_list = []
    splits = folds.split(train_df[features], train_df[stratify_col])
    for n_fold, (train_idx, valid_idx) in enumerate(splits):
        print('FOLD:' + str(n_fold))

        fold_train = train_df.iloc[train_idx]
        if sampling:
            # Down-sampling: drop training rows at or above the threshold.
            fold_train = fold_train[fold_train[sample_col] < sample_threshold]
        train_x, train_y = fold_train[features], fold_train[target]
        valid_x, valid_y = train_df[features].iloc[valid_idx], train_df[target].iloc[valid_idx]

        print('train_x shape:', train_x.shape)
        print('valid_x shape:', valid_x.shape)

        dtrain = lgb.Dataset(train_x, label=train_y)
        dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases -- confirm the installed version.
        bst = lgb.train(params, dtrain, num_boost_round=50000,
            valid_sets=[dval, dtrain], verbose_eval=200, early_stopping_rounds=200)

        # Gain-based feature importance, largest first.
        ranked = sorted(zip(features, bst.feature_importance('gain')),
                        key=lambda pair: pair[1], reverse=True)
        for item in ranked:
            print(item)

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)
        oof_cv = mean_absolute_error(valid_y, oof_preds[valid_idx])
        cv_list.append(oof_cv)
        print(cv_list)
        # Test prediction is the plain average over the folds.
        sub_preds += bst.predict(test_df[features], num_iteration=bst.best_iteration) / folds.n_splits

    cv = mean_absolute_error(train_df[target], oof_preds)
    print('Full OOF MAE %.6f' % cv)

    train_df[pred_col] = oof_preds
    test_df[pred_col] = sub_preds

    return train_df, test_df

# Stage 2: a small model on the two earlier predictions, fitted to the
# residual that is left after stage 1.
features_new = ['nn_y', 'salary_residual_y']

lgb_params = dict(
    objective="regression",
    boosting="gbdt",
    metric="mae",
    max_depth=3,
    min_data_in_leaf=30,
    min_gain_to_split=0.001,
    min_child_weight=0.001,
    reg_alpha=0.1,
    reg_lambda=0.8,
    num_leaves=11,
    max_bin=200,
    learning_rate=0.05,
    bagging_fraction=1,
    bagging_freq=1,
    feature_fraction=0.8,
    verbosity=-1,
    boost_from_average=False,
)

target = 'salary_residual2'
# cat_features and folds are the objects created for the previous stage.
train_lgb, test_lgb = lgb_kfold(
    train_new, test_new, features_new, target, cat_features, folds, lgb_params,
    sampling=True,
)




FOLD:0
train_x shape: (18604, 2)
valid_x shape: (2235, 2)
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 17.7552	valid_0's l1: 19.5727
Early stopping, best iteration is:
[120]	training's l1: 17.7929	valid_0's l1: 19.562
('salary_residual_y', 1083526.4801635742)
('nn_y', 780754.0317230225)
[19.562017269448692]
FOLD:1
train_x shape: (18530, 2)
valid_x shape: (2312, 2)
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 17.6959	valid_0's l1: 20.3115
Early stopping, best iteration is:
[10]	training's l1: 17.8256	valid_0's l1: 20.2219
('nn_y', 228104.2247314453)
('salary_residual_y', 219920.38061523438)
[19.562017269448692, 20.221862700583543]
FOLD:2
train_x shape: (18759, 2)
valid_x shape: (2072, 2)
Training until validation scores don't improve for 200 rounds
[200]	training's l1: 17.8669	valid_0's l1: 17.913
Early stopping, best iteration is:
[134]	training's l1: 17.8958	valid_0's l1: 17.9098
('salary_residual_y', 1000545.

In [4]:
# Final prediction = neural-net output + stage-1 residual + stage-2 residual.
for _frame in (train_lgb, test_lgb):
    _frame['y'] = _frame['nn_y'] + _frame['salary_residual_y'] + _frame['salary_residual_y2']

# MAE of the stacked prediction on the training rows.
display(mean_absolute_error(train_lgb['salary'],  train_lgb['y']))

# Persist id + prediction for both splits.
_exports = [
    (train_lgb, 'ensemble/train_lgb_downsample_residual_oof_19355_10fold.csv'),
    (test_lgb, 'ensemble/test_lgb_downsample_residual_oof_19355_10fold.csv'),
]
for _frame, _path in _exports:
    _frame[['id', 'y']].to_csv(_path, index=False)

19.355342209035886