In [4]:
%load_ext autoreload
%autoreload 2

from __future__ import division
import pandas as pd
import numpy as np
import copy
import pickle
import sys
import gc
from lightgbm import LGBMClassifier
from custom_estimator import Estimator
from sklearn.externals import joblib
pd.options.display.max_columns=200
pd.options.display.max_rows=100



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
def create_feats(df):
    df.degree_source=df.degree_source.astype('float32')
    df.degree_target=df.degree_target.astype('float32')
    
    df['degree_ratio']=df.degree_source/(1+df.degree_target)
    df['degree_delta']=df.degree_source-df.degree_target
    
    df['directed_degree_ratio']=df.directed_degree_source/(1+df.directed_degree_target)
    df['directed_degree_delta']=df.directed_degree_source-df.directed_degree_target

    df['directed_degree_ratio_in']=df.directed_degree_source_in/(1+df.directed_degree_target_in)
    df['directed_degree_delta_in']=df.directed_degree_source_in-df.directed_degree_target_in

    df['directed_degree_ratio_out']=df.directed_degree_source_out/(1+df.directed_degree_target_out)
    df['directed_degree_delta_out']=df.directed_degree_source_out-df.directed_degree_target_out

    df['node_sum']=df['node1_id']+df['node2_id']
    df['node_ratio']=(df['node1_id']/df['node2_id']).astype('float16')
    
    df['is_chat_diff']=df['source_is_chat_count']-df['target_is_chat_count']
    df['is_chat_ratio']=(df['source_is_chat_count']/(df['target_is_chat_count']+1).astype('float16'))
    
    df['mutual_chat_diff']=df['source_mutual_is_chat']-df['target_mutual_is_chat']
    df['mutual_chat_ratio']=(df['source_mutual_is_chat']/(df['target_mutual_is_chat']+1).astype('float16'))

    
    df['delta_triangle']=df['node1_triangles']- df['node2_triangles']
    df['ratio_triangle']=df['node1_triangles']/(1+df['node2_triangles'])
    df['triangle_degree_delta_source']=(df.degree_source*8264276).astype('int16')-df['node1_triangles']
    df['triangle_degree_delta_target']=(df.degree_target*8264276).astype('int16')-df['node1_triangles']

    df['clust_prod']=df['node1_cluster']* df['node2_cluster']
    df['clust_diff']=df['node1_cluster']- df['node2_cluster']

    gc.collect()
    
    df['source_net_act']=df[['f1_source_target', 'f2_source_target', 'f3_source_target', 'f4_source_target', 'f5_source_target',
     'f6_source_target', 'f7_source_target', 'f8_source_target', 'f9_source_target', 'f10_source_target',
     'f11_source_target', 'f12_source_target']].sum(axis=1)

    df['target_net_act']=df[['f1_target', 'f2_target', 'f3_target',
     'f4_target', 'f5_target', 'f6_target', 'f7_target', 'f8_target', 'f9_target', 'f10_target', 'f11_target',
     'f12_target']].sum(axis=1)

    df['net_act_diff']=df['source_net_act']- df['target_net_act']

    gc.collect()
    

    df['f14_source']=df['f1_source_target']+df['f4_source_target']+df['f7_source_target']+df['f10_source_target']
    df['f15_source']=df['f5_source_target']+df['f8_source_target']+df['f11_source_target']
    df['f16_source']=df['f6_source_target']+df['f9_source_target']+df['f12_source_target']


    df['f14_target']=df['f1_target']+df['f4_target']+df['f7_target']+df['f10_target']
    df['f15_target']=df['f5_target']+df['f8_target']+df['f11_target']
    df['f16_target']=df['f6_target']+df['f9_target']+df['f12_target']

    gc.collect()
    
    df['fdiff_1']=df['f1_source_target']-df['f1_target']
    df['fdiff_2']=df['f2_source_target']-df['f2_target']
    df['fdiff_3']=df['f3_source_target']-df['f3_target']
    df['fdiff_4']=df['f4_source_target']-df['f4_target']
    df['fdiff_5']=df['f5_source_target']-df['f5_target']
    df['fdiff_6']=df['f6_source_target']-df['f6_target']
    df['fdiff_7']=df['f7_source_target']-df['f7_target']
    df['fdiff_8']=df['f8_source_target']-df['f8_target']
    df['fdiff_9']=df['f9_source_target']-df['f9_target']
    df['fdiff_10']=df['f10_source_target']-df['f10_target']
    df['fdiff_11']=df['f11_source_target']-df['f11_target']
    df['fdiff_12']=df['f12_source_target']-df['f12_target']
    df['fdiff_13']=df['f13_source_target']-df['f13_target']
    df['fdiff_14']=df['f14_target']-df['f14_source']
    df['fdiff_15']=df['f15_target']-df['f15_source']
    df['fdiff_16']=df['f16_target']-df['f16_source']

    gc.collect()
    
    
    df['fmult_1']=df['f1_source_target']/(df['f1_target']+1).astype('float16')
    df['fmult_2']=df['f2_source_target']/(1+df['f2_target']).astype('float16')
    df['fmult_3']=df['f3_source_target']/(1+df['f3_target']).astype('float16')
    df['fmult_4']=df['f4_source_target']/(1+df['f4_target']).astype('float16')
    df['fmult_5']=df['f5_source_target']/(1+df['f5_target']).astype('float16')
    df['fmult_6']=df['f6_source_target']/(1+df['f6_target']).astype('float16')
    df['fmult_7']=df['f7_source_target']/(1+df['f7_target']).astype('float16')
    df['fmult_8']=df['f8_source_target']/(1+df['f8_target']).astype('float16')
    df['fmult_9']=df['f9_source_target']/(1+df['f9_target']).astype('float16')
    df['fmult_10']=df['f10_source_target']/(1+df['f10_target']).astype('float16')
    df['fmult_11']=df['f11_source_target']/(1+df['f11_target']).astype('float16')
    df['fmult_12']=df['f12_source_target']/(1+df['f12_target']).astype('float16')
    df['fmult_13']=df['f13_source_target']/(1+df['f13_target']).astype('float16')
    df['fmult_14']=df['f14_source']/(1+df['f14_target']).astype('float16')
    df['fmult_15']=df['f15_source']/(1+df['f15_target']).astype('float16')
    df['fmult_16']=df['f16_source']/(1+df['f16_target']).astype('float16')

    gc.collect()
    

    df['norm_user_diff']=np.sqrt(np.square(df[df.columns[df.columns.str.contains('diff')]].astype('int16')).sum(axis=1))
    
    df['norm_user_diff_1']=np.sqrt(np.square(train[['fdiff_1','fdiff_4','fdiff_7','fdiff_10']].astype('int16')).sum(axis=1))
    df['norm_user_diff_2']=np.sqrt(np.square(train[['fdiff_2','fdiff_5','fdiff_8','fdiff_11']].astype('int16')).sum(axis=1))
    df['norm_user_diff_3']=np.sqrt(np.square(train[['fdiff_3','fdiff_6','fdiff_9','fdiff_12']].astype('int16')).sum(axis=1))

    df['source_net_act']=df['source_net_act'].astype('int16')
    df['target_net_act']=df['target_net_act'].astype('int16')
    df['net_act_diff']=df['net_act_diff'].astype('int16')
    df['norm_user_diff']=df['norm_user_diff'].astype('float16')

    gc.collect()
    
    
    return df

In [12]:
%%time
train=pd.read_pickle('freq_new_train.pkl')
clust=pd.read_pickle('cluster_coeffs.pkl')
triangles=pd.read_pickle('triangles.pkl')
train['node1_cluster']=clust.clust_source.iloc[:train.shape[0]].values
train['node2_cluster']=clust.clust_target.iloc[:train.shape[0]].values
train['node1_triangles']=triangles.triangles_source.iloc[:train.shape[0]].values
train['node2_triangles']=triangles.triangles_target.iloc[:train.shape[0]].values

CPU times: user 1.1 s, sys: 3.63 s, total: 4.73 s
Wall time: 4.73 s


In [13]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5


In [15]:
%%time
temp_df=pd.read_pickle('neigbours_vars_pat_leftover_2.pkl')
temp_df.columns=['deg2_feat1','deg2_feat2','deg2_feat3','deg2_feat4']
train=pd.concat((train,temp_df.iloc[:train.shape[0],:]),axis=1)
train.head()

CPU times: user 4.27 ms, sys: 192 µs, total: 4.46 ms
Wall time: 4.07 ms


In [16]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles,deg2_feat1,deg2_feat2,deg2_feat3,deg2_feat4
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1,0,,0,
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2,0,,0,
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40,0,,23,4.6
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193,0,,0,
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5,0,,5,2.5


In [17]:
%%time
deg_2_neigh=pd.read_pickle('degree_2_neighbour_feats.pkl')
train=pd.concat((train,deg_2_neigh.iloc[:train.shape[0],:]),axis=1)

CPU times: user 5.04 ms, sys: 1.06 ms, total: 6.1 ms
Wall time: 12.8 ms


In [18]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles,deg2_feat1,deg2_feat2,deg2_feat3,deg2_feat4,degree_2_neighs_chat_sum_source,degree_2_neighs_chat_avg_source,degree_2_neighs_chat_sum_target,degree_2_neighs_chat_avg_target,mutual_neighs_avg_chat_sum,mutual_neighs_avg_chat_avg,union_neighs_avg_chat_sum,union_neighs_avg_chat_avg
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1,0,,0,,26,0.702703,11,1.571429,1,1.0,36,0.837209
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2,0,,0,,37,0.787234,10,0.909091,0,,47,0.810345
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40,0,,23,4.6,68,1.478261,44,2.095238,7,2.333333,105,1.640625
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193,0,,0,,101,1.77193,122,1.794118,20,2.857143,203,1.720339
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5,0,,5,2.5,47,0.691176,7,0.7,0,0.0,54,0.701299


In [20]:
%%time
dir_degrees=pd.read_pickle('directed_degrees.pkl')
train['directed_degree_source']= dir_degrees['directed_degree_source'].iloc[:train.shape[0]].values
train['directed_degree_target']= dir_degrees['directed_degree_target'].iloc[:train.shape[0]].values
train['directed_degree_source_in']= dir_degrees['directed_degree_source_in'].iloc[:train.shape[0]].values
train['directed_degree_target_in']= dir_degrees['directed_degree_target_in'].iloc[:train.shape[0]].values
train['directed_degree_source_out']= dir_degrees['directed_degree_source_out'].iloc[:train.shape[0]].values
train['directed_degree_target_out']= dir_degrees['directed_degree_target_out'].iloc[:train.shape[0]].values


CPU times: user 1.25 s, sys: 5.62 s, total: 6.87 s
Wall time: 6.87 s


In [22]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles,deg2_feat1,deg2_feat2,deg2_feat3,deg2_feat4,degree_2_neighs_chat_sum_source,degree_2_neighs_chat_avg_source,degree_2_neighs_chat_sum_target,degree_2_neighs_chat_avg_target,mutual_neighs_avg_chat_sum,mutual_neighs_avg_chat_avg,union_neighs_avg_chat_sum,union_neighs_avg_chat_avg,directed_degree_source,directed_degree_target,directed_degree_source_in,directed_degree_target_in,directed_degree_source_out,directed_degree_target_out
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1,0,,0,,26,0.702703,11,1.571429,1,1.0,36,0.837209,5e-06,9.680221e-07,1e-06,9.680221e-07,4e-06,0.0
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2,0,,0,,37,0.787234,10,0.909091,0,,47,0.810345,7e-06,1.452033e-06,2e-06,1.452033e-06,5e-06,0.0
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40,0,,23,4.6,68,1.478261,44,2.095238,7,2.333333,105,1.640625,7e-06,3.388077e-06,4e-06,2.057047e-06,3e-06,1.33103e-06
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193,0,,0,,101,1.77193,122,1.794118,20,2.857143,203,1.720339,9e-06,1.197927e-05,4e-06,5.203118e-06,6e-06,6.776154e-06
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5,0,,5,2.5,47,0.691176,7,0.7,0,0.0,54,0.701299,1e-05,1.573036e-06,3e-06,6.050137e-07,7e-06,9.680221e-07


In [23]:
%%time
neighbours=pd.read_csv('neigbours_vars_sahil_1.csv')
train['source_mutual_is_chat']=neighbours.iloc[:train.shape[0],1].values
train['target_mutual_is_chat']=neighbours.iloc[:train.shape[0],2].values
train['mutual_neighbours']=neighbours.iloc[:train.shape[0],0].values

CPU times: user 8.45 ms, sys: 414 µs, total: 8.87 ms
Wall time: 21.2 ms


In [25]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles,deg2_feat1,deg2_feat2,deg2_feat3,deg2_feat4,degree_2_neighs_chat_sum_source,degree_2_neighs_chat_avg_source,degree_2_neighs_chat_sum_target,degree_2_neighs_chat_avg_target,mutual_neighs_avg_chat_sum,mutual_neighs_avg_chat_avg,union_neighs_avg_chat_sum,union_neighs_avg_chat_avg,directed_degree_source,directed_degree_target,directed_degree_source_in,directed_degree_target_in,directed_degree_source_out,directed_degree_target_out,source_mutual_is_chat,target_mutual_is_chat,mutual_neighbours
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1,0,,0,,26,0.702703,11,1.571429,1,1.0,36,0.837209,5e-06,9.680221e-07,1e-06,9.680221e-07,4e-06,0.0,0,0,1
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2,0,,0,,37,0.787234,10,0.909091,0,,47,0.810345,7e-06,1.452033e-06,2e-06,1.452033e-06,5e-06,0.0,0,0,0
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40,0,,23,4.6,68,1.478261,44,2.095238,7,2.333333,105,1.640625,7e-06,3.388077e-06,4e-06,2.057047e-06,3e-06,1.33103e-06,0,0,3
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193,0,,0,,101,1.77193,122,1.794118,20,2.857143,203,1.720339,9e-06,1.197927e-05,4e-06,5.203118e-06,6e-06,6.776154e-06,0,0,7
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5,0,,5,2.5,47,0.691176,7,0.7,0,0.0,54,0.701299,1e-05,1.573036e-06,3e-06,6.050137e-07,7e-06,9.680221e-07,0,0,1


In [26]:
%%time
neighbours2=pd.read_csv('neigbours_vars_sahil_2.csv')
train['source_is_chat_count']=neighbours2.iloc[:train.shape[0],0].values
train['target_is_chat_count']=neighbours2.iloc[:train.shape[0],1].values

CPU times: user 4.98 ms, sys: 1.25 ms, total: 6.23 ms
Wall time: 13.6 ms


In [28]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles,deg2_feat1,deg2_feat2,deg2_feat3,deg2_feat4,degree_2_neighs_chat_sum_source,degree_2_neighs_chat_avg_source,degree_2_neighs_chat_sum_target,degree_2_neighs_chat_avg_target,mutual_neighs_avg_chat_sum,mutual_neighs_avg_chat_avg,union_neighs_avg_chat_sum,union_neighs_avg_chat_avg,directed_degree_source,directed_degree_target,directed_degree_source_in,directed_degree_target_in,directed_degree_source_out,directed_degree_target_out,source_mutual_is_chat,target_mutual_is_chat,mutual_neighbours,source_is_chat_count,target_is_chat_count
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1,0,,0,,26,0.702703,11,1.571429,1,1.0,36,0.837209,5e-06,9.680221e-07,1e-06,9.680221e-07,4e-06,0.0,0,0,1,0,0
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2,0,,0,,37,0.787234,10,0.909091,0,,47,0.810345,7e-06,1.452033e-06,2e-06,1.452033e-06,5e-06,0.0,0,0,0,0,0
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40,0,,23,4.6,68,1.478261,44,2.095238,7,2.333333,105,1.640625,7e-06,3.388077e-06,4e-06,2.057047e-06,3e-06,1.33103e-06,0,0,3,1,4
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193,0,,0,,101,1.77193,122,1.794118,20,2.857143,203,1.720339,9e-06,1.197927e-05,4e-06,5.203118e-06,6e-06,6.776154e-06,0,0,7,0,0
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5,0,,5,2.5,47,0.691176,7,0.7,0,0.0,54,0.701299,1e-05,1.573036e-06,3e-06,6.050137e-07,7e-06,9.680221e-07,0,0,1,2,0


In [29]:
%%time
def change_dtype(a,dt):
    return a.astype(dt)
for col in train.columns[train.columns.str.contains('f[0-9]')].tolist():
    train[col]=change_dtype(train[col],'int16')

CPU times: user 9.81 ms, sys: 6.34 ms, total: 16.1 ms
Wall time: 12.4 ms


In [30]:
gc.collect()

263

In [43]:
# train.shape

In [54]:
%%time
extra_feats=pd.read_csv('jc_rsa_pa_aai.csv')
extra_feats.rsa = extra_feats.rsa.shift(1).fillna(0)
train['jc']= extra_feats.jc.iloc[:train.shape[0]].values
train['rsa']= extra_feats.rsa.iloc[:train.shape[0]].values
train['pa']= extra_feats.pa.iloc[:train.shape[0]].values
train['adamic_adar']= extra_feats.aa.iloc[:train.shape[0]].values

CPU times: user 6.58 ms, sys: 0 ns, total: 6.58 ms
Wall time: 5.86 ms


In [15]:
gc.collect()

23

In [34]:
leak_feature=pd.read_pickle('leak_feature.pkl')
train['leak_feature']=leak_feature.leak_feature.iloc[:train.shape[0]].values
#train['is_duplicated']=leak_feature.b.iloc[:train.shape[0]].values

In [36]:
gc.collect()

35

In [18]:
train.shape

(70661802, 63)

In [55]:
train.head()

Unnamed: 0,node1_id,node2_id,degree_source,degree_target,f1_source_target,f2_source_target,f3_source_target,f4_source_target,f5_source_target,f6_source_target,f7_source_target,f8_source_target,f9_source_target,f10_source_target,f11_source_target,f12_source_target,f13_source_target,f1_target,f2_target,f3_target,f4_target,f5_target,f6_target,f7_target,f8_target,f9_target,f10_target,f11_target,f12_target,f13_target,is_chat,node1_cluster,node2_cluster,node1_triangles,node2_triangles,deg2_feat1,deg2_feat2,deg2_feat3,deg2_feat4,degree_2_neighs_chat_sum_source,degree_2_neighs_chat_avg_source,degree_2_neighs_chat_sum_target,degree_2_neighs_chat_avg_target,mutual_neighs_avg_chat_sum,mutual_neighs_avg_chat_avg,union_neighs_avg_chat_sum,union_neighs_avg_chat_avg,directed_degree_source,directed_degree_target,directed_degree_source_in,directed_degree_target_in,directed_degree_source_out,directed_degree_target_out,source_mutual_is_chat,target_mutual_is_chat,mutual_neighbours,source_is_chat_count,target_is_chat_count,jc,rsa,pa,adamic_adar,leak_feature,degree_ratio,degree_delta,directed_degree_ratio,directed_degree_delta,directed_degree_ratio_in,directed_degree_delta_in,directed_degree_ratio_out,directed_degree_delta_out,node_sum,node_ratio,is_chat_diff,is_chat_ratio,mutual_chat_diff,mutual_chat_ratio,delta_triangle,ratio_triangle,triangle_degree_delta_source,triangle_degree_delta_target,clust_prod,clust_diff,source_net_act,target_net_act,net_act_diff,f14_source,f15_source,f16_source,f14_target,f15_target,f16_target,fdiff_1,fdiff_2,fdiff_3,fdiff_4,fdiff_5,fdiff_6,fdiff_7,fdiff_8,fdiff_9,fdiff_10,fdiff_11,fdiff_12,fdiff_13,fdiff_14,fdiff_15,fdiff_16,fmult_1,fmult_2,fmult_3,fmult_4,fmult_5,fmult_6,fmult_7,fmult_8,fmult_9,fmult_10,fmult_11,fmult_12,fmult_13,fmult_14,fmult_15,fmult_16,norm_user_diff,norm_user_diff_1,norm_user_diff_2,norm_user_diff_3
0,31,8,5e-06,9.680221e-07,22,0,0,24,0,0,24,0,0,26,0,0,15,22,15,5,15,11,5,19,12,5,16,10,4,10,0,0.025604,0.035706,18,1,0,,0,,26,0.702703,11,1.571429,1,1.0,36,0.837209,5e-06,9.680221e-07,1e-06,9.680221e-07,4e-06,0.0,0,0,1,0,0,0.022222,0.0,320,0.225692,-1,5e-06,4e-06,5e-06,4e-06,1e-06,2.420054e-07,4e-06,4e-06,39,3.875,0,0.0,0,0.0,17,9.0,22,-10,0.000914,-0.010101,96,139,-43,96,0,0,72,33,14,0,-15,-5,9,-11,-5,5,-12,-5,10,-10,-4,5,-24,33,14,0.956522,0.0,0.0,1.5,0.0,0.0,1.2,0.0,0.0,1.529412,0.0,0.0,1.363636,1.315068,0.0,0.0,68.0,14.3527,24.289916,9.539392
1,41,12,6e-06,1.452033e-06,0,0,0,0,0,0,0,0,0,0,0,0,7,9,8,6,16,13,6,20,16,7,25,21,6,15,0,0.055847,0.030304,63,2,0,,0,,37,0.787234,10,0.909091,0,,47,0.810345,7e-06,1.452033e-06,2e-06,1.452033e-06,5e-06,0.0,0,0,0,0,0,0.0,0.011905,600,0.0,-1,6e-06,5e-06,7e-06,5e-06,2e-06,4.840111e-07,5e-06,5e-06,53,3.416016,0,0.0,0,0.0,61,21.0,-13,-51,0.001693,0.025543,0,153,-153,0,0,0,70,50,19,-9,-8,-6,-16,-13,-6,-20,-16,-7,-25,-21,-6,-8,70,50,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4375,0.0,0.0,0.0,183.5,36.905284,30.495901,12.529964
2,28,17,6e-06,2.662061e-06,9,2,0,14,7,1,14,7,1,14,7,1,15,1,1,0,1,1,0,1,1,0,0,0,0,8,0,0.029602,0.173218,32,40,0,,23,4.6,68,1.478261,44,2.095238,7,2.333333,105,1.640625,7e-06,3.388077e-06,4e-06,2.057047e-06,3e-06,1.33103e-06,0,0,3,1,4,0.045455,0.0,1078,0.840082,-1,6e-06,3e-06,7e-06,4e-06,4e-06,1.815042e-06,3e-06,2e-06,45,1.647461,-3,0.2,0,0.0,-8,0.780488,17,-10,0.005127,-0.143555,77,6,71,51,21,3,3,2,0,8,1,0,13,6,1,13,6,1,14,7,1,7,-48,-19,-3,4.5,1.0,0.0,7.0,3.5,1.0,7.0,3.5,1.0,14.0,7.0,1.0,1.666667,12.75,7.0,3.0,92.1875,24.454039,11.045361,1.732051
3,47,43,7e-06,8.591195e-06,11,11,10,11,10,9,10,9,8,7,6,6,7,0,0,0,1,1,1,1,1,1,1,1,1,15,0,0.037506,0.082275,62,193,0,,0,,101,1.77193,122,1.794118,20,2.857143,203,1.720339,9e-06,1.197927e-05,4e-06,5.203118e-06,6e-06,6.776154e-06,0,0,7,0,0,0.058333,0.086736,4118,1.772571,-1,7e-06,-2e-06,9e-06,-3e-06,4e-06,-1.694038e-06,6e-06,-1e-06,90,1.092773,0,0.0,0,0.0,-131,0.319588,-4,9,0.003086,-0.044769,108,9,99,39,25,23,3,3,3,11,11,10,10,9,8,9,8,7,6,5,5,-8,-36,-22,-20,11.0,11.0,10.0,5.5,5.0,4.5,5.0,4.5,4.0,3.5,3.0,3.0,0.4375,9.75,6.25,5.75,113.625,18.384776,17.058722,15.427249
4,58,5,8e-06,1.33103e-06,31,31,31,31,31,31,31,31,31,31,31,30,7,3,1,0,7,1,0,9,1,0,16,1,0,7,0,0.01918,0.090881,45,5,0,,5,2.5,47,0.691176,7,0.7,0,0.0,54,0.701299,1e-05,1.573036e-06,3e-06,6.050137e-07,7e-06,9.680221e-07,0,0,1,2,0,0.012658,0.137,759,0.276938,-1,8e-06,7e-06,1e-05,8e-06,3e-06,2.17805e-06,7e-06,6e-06,63,11.601562,2,2.0,0,0.0,40,7.5,24,-34,0.001743,-0.071716,371,39,332,124,93,92,35,3,0,28,30,31,24,30,31,22,30,31,15,30,30,0,-89,-90,-92,7.75,15.5,31.0,3.875,15.5,31.0,3.1,15.5,31.0,1.823529,15.5,30.0,0.875,3.444444,23.25,92.0,114.4375,45.486262,60.0,61.506097


In [39]:
def neg_sample_estimator(train_df,est,seed,frac=0.05):
    temp_train=train_df[train_df.is_chat==0].sample(frac=frac,random_state=seed)
    df=pd.concat((temp_train,train_df[train_df.is_chat==1]),axis=0)
    df=df.sample(frac=1,random_state=seed)
    y=df.is_chat
    df.drop('is_chat',inplace=True,axis=1)
    gc.collect()
    est.get_repeated_out_of_folds(df.values,y.values)
    return est

In [40]:
def neg_sample_data(train_df,seed,frac=0.05):
    temp_train=train_df[train_df.is_chat==0].sample(frac=frac,random_state=seed)
    df=pd.concat((temp_train,train_df[train_df.is_chat==1]),axis=0)
    df=df.sample(frac=1,random_state=seed)
    y=df.is_chat
    df.drop('is_chat',inplace=True,axis=1)
    gc.collect()
    return df,y

In [41]:
%%time
train=create_feats(train)

CPU times: user 592 ms, sys: 0 ns, total: 592 ms
Wall time: 702 ms




In [23]:
train.shape

(70661802, 128)

#### Model building

In [24]:
params={'num_leaves': 256, 'n_jobs': -1, 'colsample_bytree': 0.8, 
        'learning_rate': 0.1, 'min_child_weight': 200.0, 'n_estimators': 1000,
        'subsample': 1.0, 'objective': 'binary', 'bagging_freq': 5, 'boosting_type': 'gbdt'}

mod=Estimator(model=LGBMClassifier(**params),n_jobs=-1,n_splits=5,random_state=100,shuffle=True,early_stopping_rounds=100)

In [25]:
del clust,extra_feats,leak_feature,dir_degrees,triangles,neighbours,neighbours2,deg_2_neigh,temp_df

In [26]:
gc.collect()


802

In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=100,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2.pkl')

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

In [27]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=200,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_200.pkl')

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.938757	valid_0's binary_logloss: 0.302289	valid_1's auc: 0.940718	valid_1's binary_logloss: 0.297629
[200]	valid_0's auc: 0.939858	valid_0's binary_logloss: 0.29932	valid_1's auc: 0.943662	valid_1's binary_logloss: 0.290086
[300]	valid_0's auc: 0.940091	valid_0's binary_logloss: 0.298691	valid_1's auc: 0.945636	valid_1's binary_logloss: 0.285223
[400]	valid_0's auc: 0.940161	valid_0's binary_logloss: 0.298487	valid_1's auc: 0.947364	valid_1's binary_logloss: 0.280999
[500]	valid_0's auc: 0.940193	valid_0's binary_logloss: 0.298378	valid_1's auc: 0.949003	valid_1's binary_logloss: 0.276993
Early stopping, best iteration is:
[498]	valid_0's auc: 0.940194	valid_0's binary_logloss: 0.298376	valid_1's auc: 0.948967	valid_1's binary_logloss: 0.277079
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.938349	valid_0's binary_logloss: 0.303061	valid_1's auc: 0.940791	valid_1

In [28]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

mod2 [0.9401937317733958, 0.9398295974413856, 0.9402266619931113, 0.9406578657752643, 0.9402328778769024] 0.940228146972012


In [29]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=300,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_300.pkl')

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.938295	valid_0's binary_logloss: 0.303274	valid_1's auc: 0.940799	valid_1's binary_logloss: 0.297475
[200]	valid_0's auc: 0.939522	valid_0's binary_logloss: 0.300028	valid_1's auc: 0.943794	valid_1's binary_logloss: 0.289833
[300]	valid_0's auc: 0.939718	valid_0's binary_logloss: 0.299495	valid_1's auc: 0.945745	valid_1's binary_logloss: 0.285054
[400]	valid_0's auc: 0.939771	valid_0's binary_logloss: 0.299344	valid_1's auc: 0.947441	valid_1's binary_logloss: 0.280852
[500]	valid_0's auc: 0.939808	valid_0's binary_logloss: 0.299236	valid_1's auc: 0.949055	valid_1's binary_logloss: 0.276886
[600]	valid_0's auc: 0.939807	valid_0's binary_logloss: 0.299227	valid_1's auc: 0.950573	valid_1's binary_logloss: 0.273117
Early stopping, best iteration is:
[547]	valid_0's auc: 0.939827	valid_0's binary_logloss: 0.299183	valid_1's auc: 0.949808	valid_1's binary_logloss: 0.275024
Training until validation scores d

In [30]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

mod2 [0.9398269590461181, 0.9404481181971962, 0.9405796764435808, 0.9402091424270741, 0.9401698169609554] 0.9402467426149848


In [31]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=400,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_400.pkl')

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.938727	valid_0's binary_logloss: 0.302271	valid_1's auc: 0.940692	valid_1's binary_logloss: 0.297796
[200]	valid_0's auc: 0.939872	valid_0's binary_logloss: 0.299215	valid_1's auc: 0.94367	valid_1's binary_logloss: 0.290231
[300]	valid_0's auc: 0.940111	valid_0's binary_logloss: 0.298567	valid_1's auc: 0.945617	valid_1's binary_logloss: 0.285397
[400]	valid_0's auc: 0.940196	valid_0's binary_logloss: 0.298317	valid_1's auc: 0.947371	valid_1's binary_logloss: 0.281145
[500]	valid_0's auc: 0.940231	valid_0's binary_logloss: 0.298209	valid_1's auc: 0.949005	valid_1's binary_logloss: 0.277119
Early stopping, best iteration is:
[492]	valid_0's auc: 0.940233	valid_0's binary_logloss: 0.298205	valid_1's auc: 0.948883	valid_1's binary_logloss: 0.277418
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.939158	valid_0's binary_logloss: 0.301403	valid_1's auc: 0.940594	valid_1

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

mod2 [0.9402334102242799, 0.94057945273079, 0.9398875814444789, 0.9401557522279543, 0.9403453260778156] 0.9402403045410637


In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=500,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_500.pkl')

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.938481	valid_0's binary_logloss: 0.30291	valid_1's auc: 0.94068	valid_1's binary_logloss: 0.29784
[200]	valid_0's auc: 0.939639	valid_0's binary_logloss: 0.29981	valid_1's auc: 0.943652	valid_1's binary_logloss: 0.290225
[300]	valid_0's auc: 0.939858	valid_0's binary_logloss: 0.299237	valid_1's auc: 0.945603	valid_1's binary_logloss: 0.285394
[400]	valid_0's auc: 0.939931	valid_0's binary_logloss: 0.299015	valid_1's auc: 0.947342	valid_1's binary_logloss: 0.28113
[500]	valid_0's auc: 0.939959	valid_0's binary_logloss: 0.298926	valid_1's auc: 0.948977	valid_1's binary_logloss: 0.27712
[600]	valid_0's auc: 0.939954	valid_0's binary_logloss: 0.298922	valid_1's auc: 0.950523	valid_1's binary_logloss: 0.273352
Early stopping, best iteration is:
[547]	valid_0's auc: 0.939968	valid_0's binary_logloss: 0.298893	valid_1's auc: 0.949717	valid_1's binary_logloss: 0.275305
Training until validation scores don't i

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

mod2 [0.939968206676806, 0.9406600446099929, 0.9402283883290528, 0.9401602258962751, 0.9398803930793797] 0.9401794517183013


In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=600,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_600.pkl')

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=700,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_700.pkl')

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=800,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_800.pkl')

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=900,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_900.pkl')

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score

In [None]:
%%time
mod2=neg_sample_estimator(est=mod,frac=0.05,seed=1000,train_df=train)
joblib.dump(mod2,'lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_1000.pkl')

In [None]:
print 'mod2',mod2.cv_scores, mod2.avg_cv_score