In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import division
import pandas as pd
import numpy as np
import copy
import pickle
import sys
import gc
from encoding import FreqeuncyEncoding
from lightgbm import LGBMClassifier
from custom_estimator import Estimator
from sklearn.externals import joblib

pd.options.display.max_columns=100

In [3]:
def create_feats(df):
    """Append engineered pairwise (source/target) features to `df` in place.

    The frame is mutated and the same object is returned. New columns are
    appended in a fixed order; callers in this notebook pass ``test.values``
    to pre-trained models, so that order must not change.

    Expects these columns to exist: degree_*/directed_degree_* (source and
    target, plus the _in/_out variants), node1_id/node2_id, the *_is_chat
    counts, node{1,2}_triangles, node{1,2}_cluster and f1..f13 with the
    ``_source_target`` and ``_target`` suffixes.

    NOTE(review): several quirks are reproduced on purpose because the
    pickled models were presumably fit on exactly these features --
    confirm before "fixing" any of them:
      * ``triangle_degree_delta_target`` subtracts node1_triangles,
        not node2_triangles.
      * fdiff_14..16 are target - source, while fdiff_1..13 are
        source - target.
      * In the *_ratio / fmult_* columns only the denominator is cast to
        float16; the quotient itself is not.
      * ``norm_user_diff`` squares every column whose name contains
        'diff', so calling this function twice on the same frame would
        also pick up the norm_user_diff* columns from the first call.
    """
    # Degrees are stored as float32 before any arithmetic on them.
    for name in ('degree_source', 'degree_target'):
        df[name] = df[name].astype('float32')

    # (ratio column, delta column, source column, target column)
    degree_specs = [
        ('degree_ratio', 'degree_delta',
         'degree_source', 'degree_target'),
        ('directed_degree_ratio', 'directed_degree_delta',
         'directed_degree_source', 'directed_degree_target'),
        ('directed_degree_ratio_in', 'directed_degree_delta_in',
         'directed_degree_source_in', 'directed_degree_target_in'),
        ('directed_degree_ratio_out', 'directed_degree_delta_out',
         'directed_degree_source_out', 'directed_degree_target_out'),
    ]
    for ratio_col, delta_col, src, tgt in degree_specs:
        df[ratio_col] = df[src] / (1 + df[tgt])  # +1 avoids division by zero
        df[delta_col] = df[src] - df[tgt]

    df['node_sum'] = df['node1_id'] + df['node2_id']
    df['node_ratio'] = (df['node1_id'] / df['node2_id']).astype('float16')

    # (diff column, ratio column, source column, target column)
    chat_specs = [
        ('is_chat_diff', 'is_chat_ratio',
         'source_is_chat_count', 'target_is_chat_count'),
        ('mutual_chat_diff', 'mutual_chat_ratio',
         'source_mutual_is_chat', 'target_mutual_is_chat'),
    ]
    for diff_col, ratio_col, src, tgt in chat_specs:
        df[diff_col] = df[src] - df[tgt]
        # float16 cast applies to the denominator only
        df[ratio_col] = df[src] / (df[tgt] + 1).astype('float16')

    df['delta_triangle'] = df['node1_triangles'] - df['node2_triangles']
    df['ratio_triangle'] = df['node1_triangles'] / (1 + df['node2_triangles'])
    # presumably rescales a normalised degree by the node count -- TODO confirm
    degree_scale = 8264276
    for side in ('source', 'target'):
        scaled_degree = (df['degree_' + side] * degree_scale).astype('int16')
        # both variants subtract node1_triangles (see NOTE in the docstring)
        df['triangle_degree_delta_' + side] = scaled_degree - df['node1_triangles']

    df['clust_prod'] = df['node1_cluster'] * df['node2_cluster']
    df['clust_diff'] = df['node1_cluster'] - df['node2_cluster']

    gc.collect()

    # Row-wise totals over f1..f12 for each side.
    df['source_net_act'] = df[['f%d_source_target' % k for k in range(1, 13)]].sum(axis=1)
    df['target_net_act'] = df[['f%d_target' % k for k in range(1, 13)]].sum(axis=1)
    df['net_act_diff'] = df['source_net_act'] - df['target_net_act']

    gc.collect()

    # f14/f15/f16 are sums of fixed subsets of f1..f12; built with `+` (not
    # .sum) so the operand dtype is kept. Source columns first, then target.
    grouped = (('f14', (1, 4, 7, 10)), ('f15', (5, 8, 11)), ('f16', (6, 9, 12)))
    for out_suffix, in_pattern in (('_source', 'f%d_source_target'),
                                   ('_target', 'f%d_target')):
        for prefix, members in grouped:
            running = df[in_pattern % members[0]]
            for k in members[1:]:
                running = running + df[in_pattern % k]
            df[prefix + out_suffix] = running

    gc.collect()

    for k in range(1, 14):
        df['fdiff_%d' % k] = df['f%d_source_target' % k] - df['f%d_target' % k]
    for k in (14, 15, 16):
        # operand order is reversed here (target - source)
        df['fdiff_%d' % k] = df['f%d_target' % k] - df['f%d_source' % k]

    gc.collect()

    for k in range(1, 14):
        df['fmult_%d' % k] = (df['f%d_source_target' % k]
                              / (1 + df['f%d_target' % k]).astype('float16'))
    for k in (14, 15, 16):
        df['fmult_%d' % k] = (df['f%d_source' % k]
                              / (1 + df['f%d_target' % k]).astype('float16'))

    gc.collect()

    # Euclidean norm over every column whose name currently contains 'diff'.
    diff_like = df.columns[df.columns.str.contains('diff')]
    df['norm_user_diff'] = np.sqrt(np.square(df[diff_like].astype('int16')).sum(axis=1))

    # Same norm restricted to three fixed groups of fdiff columns.
    for idx, members in enumerate(((1, 4, 7, 10), (2, 5, 8, 11), (3, 6, 9, 12)), 1):
        subset = ['fdiff_%d' % k for k in members]
        df['norm_user_diff_%d' % idx] = np.sqrt(
            np.square(df[subset].astype('int16')).sum(axis=1))

    # Downcast the aggregate columns to save memory.
    for name in ('source_net_act', 'target_net_act', 'net_act_diff'):
        df[name] = df[name].astype('int16')
    df['norm_user_diff'] = df['norm_user_diff'].astype('float16')

    gc.collect()

    return df

In [4]:
%%time
# Base test frame plus two side tables of per-pair graph statistics.
test = pd.read_pickle('freq_new_test.pkl')
clust = pd.read_pickle('cluster_coeffs.pkl')
triangles = pd.read_pickle('triangles.pkl')

# The side tables are aligned positionally: only their last len(test) rows
# are used (presumably train rows come first -- TODO confirm).
test['node1_cluster'] = clust['clust_source'].iloc[-len(test):].values
test['node2_cluster'] = clust['clust_target'].iloc[-len(test):].values
test['node1_triangles'] = triangles['triangles_source'].iloc[-len(test):].values
test['node2_triangles'] = triangles['triangles_target'].iloc[-len(test):].values

CPU times: user 632 ms, sys: 7.28 s, total: 7.91 s
Wall time: 7.46 s


In [5]:
%%time
# Four extra neighbour features, renamed to deg2_feat1..4.
temp_df = pd.read_pickle('neigbours_vars_pat_leftover_2.pkl')
temp_df.columns = ['deg2_feat1', 'deg2_feat2', 'deg2_feat3', 'deg2_feat4']
# Keep the last len(test) rows and renumber from 0 so the column-wise concat
# lines up with `test` (assumes `test` has a default 0..n-1 index -- TODO confirm).
temp_df = temp_df.iloc[-len(test):].reset_index(drop=True)
test = pd.concat([test, temp_df], axis=1)

CPU times: user 7.06 s, sys: 3.37 s, total: 10.4 s
Wall time: 3.46 s


In [7]:
%%time
# Degree-2 neighbour features: keep the last len(test) rows, renumber from 0
# and append them column-wise (assumes `test` has a default 0..n-1 index --
# TODO confirm).
deg_2_neigh = pd.read_pickle('degree_2_neighbour_feats.pkl')
deg_2_neigh = deg_2_neigh.iloc[-len(test):].reset_index(drop=True)
test = pd.concat([test, deg_2_neigh], axis=1)


CPU times: user 8.7 s, sys: 6.32 s, total: 15 s
Wall time: 4.1 s


In [1]:
# test.head()

In [9]:
%%time
dir_degrees = pd.read_pickle('directed_degrees.pkl')
# Copy the six directed-degree columns positionally from the last len(test)
# rows. The list order is the order the columns are appended to `test`.
for col in ['directed_degree_source', 'directed_degree_target',
            'directed_degree_source_in', 'directed_degree_target_in',
            'directed_degree_source_out', 'directed_degree_target_out']:
    test[col] = dir_degrees[col].iloc[-len(test):].values


CPU times: user 1.18 s, sys: 10.8 s, total: 12 s
Wall time: 12 s


In [10]:
%%time
neighbours = pd.read_csv('neigbours_vars_sahil_1.csv')
# Columns are picked by position (1, 2, then 0) from the last len(test) rows;
# the assignment order fixes the column order in `test`.
test['source_mutual_is_chat'] = neighbours.iloc[-len(test):, 1].values
test['target_mutual_is_chat'] = neighbours.iloc[-len(test):, 2].values
test['mutual_neighbours'] = neighbours.iloc[-len(test):, 0].values

CPU times: user 15.2 s, sys: 7.68 s, total: 22.9 s
Wall time: 13.9 s


In [11]:
%%time
neighbours2 = pd.read_csv('neigbours_vars_sahil_2.csv')
# First two CSV columns, last len(test) rows, copied positionally.
test['source_is_chat_count'] = neighbours2.iloc[-len(test):, 0].values
test['target_is_chat_count'] = neighbours2.iloc[-len(test):, 1].values

CPU times: user 12.7 s, sys: 5.46 s, total: 18.2 s
Wall time: 10.1 s


In [12]:
%%time
def change_dtype(a,dt):
    """Return `a` converted to dtype `dt` by calling its ``astype`` method."""
    converted = a.astype(dt)
    return converted

# Downcast every column whose name contains an 'f' followed by a digit
# (regex 'f[0-9]') to int16.
for col in list(test.columns[test.columns.str.contains('f[0-9]')]):
    test[col] = change_dtype(test[col], 'int16')

CPU times: user 7.94 s, sys: 4.68 s, total: 12.6 s
Wall time: 1.05 s


In [13]:
# Force a garbage-collection pass after the dtype downcast; the displayed
# number is gc.collect()'s return value.
gc.collect()

88

In [None]:
%%time
# NOTE(review): this cell has no execution count and references `train`,
# which is not defined anywhere in this notebook; it would raise NameError on
# Restart & Run All. It looks like a leftover from a training notebook --
# confirm and remove, or guard it.
# It takes the FIRST train.shape[0] rows of the same CSV the test cell reads,
# after shifting `rsa` down by one row (first value filled with 0).
extra_feats=pd.read_csv('jc_rsa_pa_aai.csv')
extra_feats.rsa = extra_feats.rsa.shift(1).fillna(0)
train['jc']= extra_feats.jc.iloc[:train.shape[0]].values
train['rsa']= extra_feats.rsa.iloc[:train.shape[0]].values
train['pa']= extra_feats.pa.iloc[:train.shape[0]].values
train['adamic_adar']= extra_feats.aa.iloc[:train.shape[0]].values

In [14]:
%%time
extra_feats = pd.read_csv('jc_rsa_pa_aai.csv')
# Last len(test) rows of each CSV column, copied positionally.
# NOTE(review): the unexecuted train cell in this notebook applies
# rsa.shift(1).fillna(0) before slicing; no shift is applied here. Confirm
# that the models were trained on features consistent with this.
test['jc'] = extra_feats['jc'].iloc[-len(test):].values
test['rsa'] = extra_feats['rsa'].iloc[-len(test):].values
test['pa'] = extra_feats['pa'].iloc[-len(test):].values
test['adamic_adar'] = extra_feats['aa'].iloc[-len(test):].values

CPU times: user 1.32 s, sys: 2.28 s, total: 3.6 s
Wall time: 2.6 s


In [15]:
# Force a garbage-collection pass after loading the extra features; the
# displayed number is gc.collect()'s return value.
gc.collect()

23

In [16]:
leak_feature = pd.read_pickle('leak_feature.pkl')
# Last len(test) rows of the `leak_feature` column, copied positionally.
test['leak_feature'] = leak_feature['leak_feature'].iloc[-len(test):].values

In [17]:
# Force a garbage-collection pass before feature engineering; the displayed
# number is gc.collect()'s return value.
gc.collect()

7

In [18]:
# Append the engineered columns. create_feats mutates its argument and returns
# it, so `test` is the same object afterwards.
# NOTE(review): not idempotent -- on a second run the 'diff' name match inside
# create_feats would also include the norm_user_diff* columns added by the
# first run. Re-run from the load cells instead.
test=create_feats(test)



In [19]:
# Sanity check on the final feature matrix size.
# NOTE(review): the recorded output shows 127 columns while the model file
# names below say "128_feats" -- confirm the expected feature count.
test.shape

(11776968, 127)

### Predictions

In [20]:
# Load the first pickled estimator and print its stored CV scores and overall
# CV score. joblib.load unpickles arbitrary objects: only load trusted files.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2.pkl')
print mod2.cv_scores, mod2.overall_cv_score

[0.9399932460538108, 0.9403921499754164, 0.9400259100006944, 0.9405107722873594, 0.9402462167031546] 0.9402331492956435


In [21]:
%%time
# Score the test matrix with the model currently bound to `mod2`.
# test.values drops column names, so features are matched by position --
# presumably the column order must equal the training order (TODO confirm).
pred1=mod2.transform(test.values)

CPU times: user 2h 48min 54s, sys: 1min 38s, total: 2h 50min 32s
Wall time: 9min 2s


In [22]:
# Rebind `mod2` to the seed_200 model and print its stored CV scores.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_200.pkl')
print mod2.cv_scores, mod2.overall_cv_score

[0.9401937317733958, 0.9398295974413856, 0.9402266619931113, 0.9406578657752643, 0.9402328778769024] 0.9402278033925633


In [23]:
%%time
# Predictions from whichever model `mod2` currently holds (seed_200 if the
# cells ran in order); features are passed positionally.
pred2=mod2.transform(test.values)

CPU times: user 2h 59min 41s, sys: 1min 5s, total: 3h 46s
Wall time: 8min 57s


In [24]:
# Rebind `mod2` to the seed_300 model and print its stored CV scores.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_300.pkl')
print mod2.cv_scores, mod2.overall_cv_score

[0.9398269590461181, 0.9404481181971962, 0.9405796764435808, 0.9402091424270741, 0.9401698169609554] 0.9402457920841976


In [25]:
%%time
# Predictions from whichever model `mod2` currently holds (seed_300 if the
# cells ran in order); features are passed positionally.
pred3=mod2.transform(test.values)

CPU times: user 3h 7min 42s, sys: 1min 7s, total: 3h 8min 49s
Wall time: 9min 18s


In [26]:
# Rebind `mod2` to the seed_400 model and print its stored CV scores.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_400.pkl')
print mod2.cv_scores, mod2.overall_cv_score

[0.9402334102242799, 0.94057945273079, 0.9398875814444789, 0.9401557522279543, 0.9403453260778156] 0.9402398234781657


In [27]:
%%time
# Predictions from whichever model `mod2` currently holds (seed_400 if the
# cells ran in order); features are passed positionally.
pred4=mod2.transform(test.values)

CPU times: user 2h 55min 48s, sys: 1min 7s, total: 2h 56min 56s
Wall time: 8min 44s


In [28]:
# Rebind `mod2` to the seed_500 model and print its stored CV scores.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_500.pkl')
print mod2.cv_scores, mod2.overall_cv_score

[0.939968206676806, 0.9406600446099929, 0.9402283883290528, 0.9401602258962751, 0.9398803930793797] 0.940179158418961


In [29]:
%%time
# Predictions from whichever model `mod2` currently holds (seed_500 if the
# cells ran in order); features are passed positionally.
pred5=mod2.transform(test.values)

CPU times: user 3h 17min 41s, sys: 1min 7s, total: 3h 18min 48s
Wall time: 9min 43s


In [None]:
# Rebind `mod2` to the seed_600 model and print its stored CV scores.
# NOTE(review): this cell has no execution count or output in the saved
# notebook -- confirm it was actually run before the bagging step.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_600.pkl')
print mod2.cv_scores, mod2.overall_cv_score

In [None]:
%%time
# Predictions from whichever model `mod2` currently holds; features are
# passed positionally.
# NOTE(review): no execution count recorded, yet `pred6` is consumed by the
# bagging cell -- confirm it was produced by the seed_600 model.
pred6=mod2.transform(test.values)

In [None]:
# Rebind `mod2` to the seed_700 model and print its stored CV scores.
# NOTE(review): no execution count or output recorded for this cell.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_700.pkl')
print mod2.cv_scores, mod2.overall_cv_score

In [None]:
%%time
# Predictions from whichever model `mod2` currently holds; features are
# passed positionally.
# NOTE(review): no execution count recorded, yet `pred7` is consumed by the
# bagging cell -- confirm it was produced by the seed_700 model.
pred7=mod2.transform(test.values)

In [None]:
# Rebind `mod2` to the seed_800 model and print its stored CV scores.
# NOTE(review): no execution count or output recorded for this cell.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_800.pkl')
print mod2.cv_scores, mod2.overall_cv_score

In [None]:
%%time
# Predictions from whichever model `mod2` currently holds; features are
# passed positionally.
# NOTE(review): no execution count recorded, yet `pred8` is consumed by the
# bagging cell -- confirm it was produced by the seed_800 model.
pred8=mod2.transform(test.values)

In [None]:
# Rebind `mod2` to the seed_900 model and print its stored CV scores.
# NOTE(review): no execution count or output recorded for this cell.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_900.pkl')
print mod2.cv_scores, mod2.overall_cv_score

In [None]:
%%time
# Predictions from whichever model `mod2` currently holds; features are
# passed positionally.
# NOTE(review): no execution count recorded, yet `pred9` is consumed by the
# bagging cell -- confirm it was produced by the seed_900 model.
pred9=mod2.transform(test.values)

In [None]:
# Rebind `mod2` to the seed_1000 model and print its stored CV scores.
# NOTE(review): no execution count or output recorded for this cell.
mod2=joblib.load('lgb_128_feats_with_leak_and_mutual_friends_with_is_chat_target_depth_2_feats_rsa_2_more_deg2_seed_1000.pkl')
print mod2.cv_scores, mod2.overall_cv_score

In [None]:
%%time
# Predictions from whichever model `mod2` currently holds; features are
# passed positionally.
# NOTE(review): no execution count recorded, yet `pred10` is consumed by the
# bagging cell -- confirm it was produced by the seed_1000 model.
pred10=mod2.transform(test.values)

### Bagging all 10 models

In [31]:
# Collect the ten prediction vectors side by side, with a 0-based `id`.
# NOTE(review): pred6..pred10 come from cells that show no execution count in
# the saved notebook; this cell fails with NameError on a fresh kernel unless
# they are run first.
# The frame is built from a dict, so column order depends on the Python/pandas
# version (sorted keys vs insertion order); `id` comes first in both cases.
sol1=pd.DataFrame({'id':range(len(pred1)),'var1':pred1,'var2':pred2,'var3':pred3,'var4':pred4,'var5':pred5,
                   'var6':pred6,'var7':pred7,'var8':pred8,'var9':pred9,'var10':pred10})


In [32]:
# Use 1-based ids. They are recomputed from the row count instead of
# incrementing in place, so re-running this cell cannot shift them again.
sol1['id']=np.arange(1,len(sol1)+1)

In [39]:
# Bagged prediction: row-wise mean of the per-model columns var1..var10.
# Columns are selected by name, not position, so `id` is never averaged and
# re-running the cell does not fold an existing `is_chat` into the mean.
sol1['is_chat']=sol1.filter(regex='^var[0-9]+$').mean(axis=1)

In [2]:
# Preview the first rows of the bagged predictions.
sol1.head()

In [43]:
# Write the submission with the id and the bagged score only. Columns are
# selected by name so the output does not depend on sol1's column order.
sol1[['id','is_chat']].to_csv('final_sol.csv',index=False)