In [None]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import KFold,train_test_split
import xgboost as xgb
import lightgbm as lgb
import time
import datetime
import dask.dataframe as dd
import math
import _pickle as cPickle
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
def log(info):
    """Print ``info`` prefixed with the current local time (``YYYY-MM-DD HH:MM:SS``).

    Args:
        info: Any object; ``print`` renders it with ``str``.
    """
    # strftime uses the current local time when no time tuple is supplied.
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S")
    print(timestamp,info)
    
def log_shape(train,test):
    """Log the ``shape`` of the training set and then of the test set.

    Args:
        train: Object exposing a ``shape`` attribute (the training frame).
        test: Object exposing a ``shape`` attribute (the test frame).
    """
    for label,frame in (('Train',train),('Test',test)):
        log('%s data shape: %s' %(label,str(frame.shape)))
    
    
# Compact dtypes for the CSV columns so the large click logs need less memory.
dtypes={
    'click_id':'uint32',
    'ip':'uint32',
    'app':'uint16',
    'device':'uint16',
    'os':'uint16',
    'channel':'uint16',
    'is_attributed':'uint8'
}

# Randomly load a subset of the training data: every data row (i>0, the header is
# always kept) is skipped with probability 0.9, and at most 18,000,000 rows are read.
# A dedicated, seeded generator makes the sampled subset identical on every run;
# the global NumPy random state is left untouched.
sample_rng=np.random.RandomState(2018)
train=pd.read_csv('train.csv',skiprows=lambda i:i>0 and sample_rng.random_sample()>0.1,nrows=18000000,
                 header=0,sep=',',dtype=dtypes,usecols=['ip','app','device','os','channel','click_time','is_attributed'])

# The test file is read in full; it is loaded without an `is_attributed` column.
test=pd.read_csv('test.csv',header=0,sep=',',dtype=dtypes,
                 usecols=['ip','app','device','os','channel','click_time'])
log('Read data done.')
# Log the number of rows/columns of both frames.
log_shape(train,test)

In [None]:
# Inspect the class balance of the target column.
Y_train=train['is_attributed']
# Pass the series explicitly as `x`: newer seaborn versions interpret the first
# positional argument as `data`, which would not draw one bar per class.
sns.countplot(x=Y_train)
# Absolute counts per class, then the same counts as fractions of all rows.
print(Y_train.value_counts())
print(Y_train.value_counts()/Y_train.value_counts().sum())
# Drop the temporary reference and reclaim memory.
del Y_train
gc.collect()

In [None]:
def process_date(df):
    """Parse ``click_time`` and add its day/hour/minute/second as ``uint8`` columns.

    The frame is modified in place and also returned; ``click_time`` itself is
    kept (as a datetime column).

    Args:
        df: DataFrame with a ``click_time`` column parseable by ``pd.to_datetime``.

    Returns:
        The same DataFrame object with the four extra columns.
    """
    df['click_time']=pd.to_datetime(df['click_time'])
    # Each name is both the `.dt` accessor attribute and the new column name.
    for part in ('day','hour','minute','second'):
        df[part]=getattr(df['click_time'].dt,part).astype('uint8')
    return df
# Parse click_time and add the day/hour/minute/second columns on both frames.
train=process_date(train)
test=process_date(test)
# Reclaim memory freed by the conversion.
gc.collect()

In [None]:
# Stack train and test into one frame (train rows first) so group statistics are
# computed over both. pd.concat replaces DataFrame.append, which is deprecated and
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..n-1 index as
# the previous reset_index(drop=True). Test rows get NaN in `is_attributed`.
df=pd.concat([train,test],ignore_index=True,sort=False)
# The separate frames are no longer needed; free their memory.
del train
del test
gc.collect()

In [None]:
# Names of the engineered feature columns; each feature helper appends the column it creates.
predictors=[]
def cal_next_time_delta(df,suffix='next_time_delta',type='float32'):
    """Add, for several key combinations, the seconds until the group's next click.

    For every key combination the ``click_time`` of the following row of the same
    group (in the frame's current row order) is subtracted from by the row's own
    ``click_time``. The last row of each group gets NaN.
    NOTE(review): this is only a "time to next click" if rows are in
    chronological order -- confirm against the input files.

    The full duration is used (``.dt.total_seconds()``). The previous
    ``.dt.seconds`` only returned the sub-day component, so a gap of
    1 day + 5 s was reported as 5 s.

    Args:
        df: DataFrame with the key columns and a datetime ``click_time`` column.
            It is modified in place.
        suffix: Suffix of the new column names (``<keys joined by _>_<suffix>``).
        type: dtype the new columns are cast to.

    Returns:
        The same DataFrame with one new column per key combination; each new
        column name is also appended to the module-level ``predictors`` list.
    """
    groupby_columns=[
        {'columns':['ip','app','channel','device','os']},
        {'columns':['ip','os','device']},
        {'columns':['ip','os','device','app']}
    ]
    for spec in groupby_columns:
        new_feature_name='{}_{}'.format('_'.join(spec['columns']),suffix)
        # Group directly on the frame; no intermediate copy of the key columns is needed.
        next_click=df.groupby(spec['columns'])['click_time'].shift(-1)
        df[new_feature_name]=(next_click-df['click_time']).dt.total_seconds().astype(type)
        predictors.append(new_feature_name)
        gc.collect()
    return df
# Add the per-group "time until next click" columns; their names are appended to `predictors`.
df=cal_next_time_delta(df)

def cal_prev_time_delta(df,suffix='prev_time_delta',type='float32'):
    """Add, for several key combinations, the seconds since the group's previous click.

    For every key combination the ``click_time`` of the preceding row of the same
    group (in the frame's current row order) is subtracted from the row's own
    ``click_time``. The first row of each group gets NaN.
    NOTE(review): this is only a "time since previous click" if rows are in
    chronological order -- confirm against the input files.

    The full duration is used (``.dt.total_seconds()``). The previous
    ``.dt.seconds`` only returned the sub-day component, so gaps of a day or
    more wrapped around.

    Args:
        df: DataFrame with the key columns and a datetime ``click_time`` column.
            It is modified in place.
        suffix: Suffix of the new column names (``<keys joined by _>_<suffix>``).
        type: dtype the new columns are cast to.

    Returns:
        The same DataFrame with one new column per key combination; each new
        column name is also appended to the module-level ``predictors`` list.
    """
    groupby_columns=[
        {'columns':['ip','channel']},
        {'columns':['ip','os']},
    ]
    for spec in groupby_columns:
        new_feature_name='{}_{}'.format('_'.join(spec['columns']),suffix)
        # Group directly on the frame; no intermediate copy of the key columns is needed.
        previous_click=df.groupby(spec['columns'])['click_time'].shift(+1)
        df[new_feature_name]=(df['click_time']-previous_click).dt.total_seconds().astype(type)
        predictors.append(new_feature_name)
        gc.collect()
    return df
# Add the per-group "time since previous click" columns; their names are appended to `predictors`.
df=cal_prev_time_delta(df)

def merge_nunique(df,columns_groupby,column,new_column_name,type='uint32'):
    """Attach the number of distinct ``column`` values per group to every row.

    Args:
        df: Source DataFrame.
        columns_groupby: List of column names that define a group.
        column: Column whose distinct values are counted within each group.
        new_column_name: Name of the resulting feature column.
        type: dtype the feature column is cast to.

    Returns:
        A new DataFrame (left merge of ``df`` with the per-group counts); the
        feature name is also appended to the module-level ``predictors`` list.
    """
    distinct_per_group=df.groupby(columns_groupby)[column].nunique().reset_index()
    distinct_per_group.columns=columns_groupby+[new_column_name]
    merged=df.merge(distinct_per_group,how='left',on=columns_groupby)
    merged[new_column_name]=merged[new_column_name].astype(type)
    predictors.append(new_column_name)
    return merged
# Distinct-value count features. Naming scheme: nunique_<counted column>_gb_<group keys>.
df=merge_nunique(df,['ip'],'channel','nunique_channel_gb_ip','uint32')
gc.collect()
df=merge_nunique(df,['ip'],'app','nunique_app_gb_ip','uint32')
df=merge_nunique(df,['ip','day'],'hour','nunique_hour_gb_ip_day','uint32')
gc.collect()
df=merge_nunique(df,['ip'],'device','nunique_device_gb_ip','uint32')
gc.collect()
df=merge_nunique(df,['app'],'channel','nunique_channel_gb_app','uint32')
gc.collect()
# This feature counts distinct `os` values per (ip, app); it was previously
# mislabeled 'nunique_channel_ip_app', which misreports what the column holds.
df=merge_nunique(df,['ip','app'],'os','nunique_os_gb_ip_app','uint32')
gc.collect()
df=merge_nunique(df,['ip','device','os'],'app','nunique_app_gb_ip_device_os','uint32')


def merge_cumcount(df,columns_groupby,column,new_column_name,type='uint32'):
    """Add each row's 0-based position within its group as a new column.

    Args:
        df: DataFrame to extend; it is modified in place.
        columns_groupby: List of column names that define a group.
        column: Column selected from the groups before numbering.
        new_column_name: Name of the resulting feature column.
        type: dtype the feature column is cast to.

    Returns:
        The same DataFrame; the feature name is also appended to the
        module-level ``predictors`` list.
    """
    position_in_group=df.groupby(columns_groupby)[column].cumcount()
    # Assign the raw array so the values are written positionally.
    df[new_column_name]=position_in_group.values.astype(type)
    predictors.append(new_column_name)
    return df
# Running position of each click within its ip group.
df=merge_cumcount(df,['ip'],'os','cumcount_os_gb_ip','uint32')
gc.collect()
# Running position of each click within its (ip, device, os) group.
df=merge_cumcount(df,['ip','device','os'],'app'
                  ,'cumcount_app_gb_ip_device_os','uint32')

def merge_count(df,columns_groupby,new_column_name,type='uint32'):
    """Attach the size of each row's group as a new column.

    Args:
        df: Source DataFrame.
        columns_groupby: List of column names that define a group.
        new_column_name: Name of the resulting feature column.
        type: dtype the feature column is cast to.

    Returns:
        A new DataFrame (left merge of ``df`` with the group sizes); the
        feature name is also appended to the module-level ``predictors`` list.
    """
    group_sizes=df.groupby(columns_groupby).size().reset_index()
    group_sizes.columns=columns_groupby+[new_column_name]
    merged=df.merge(group_sizes,how='left',on=columns_groupby)
    merged[new_column_name]=merged[new_column_name].astype(type)
    predictors.append(new_column_name)
    return merged
# Group-size features: number of rows sharing the same key combination.
df=merge_count(df,['ip','day','hour'],'count_gb_ip_day_hour','uint32')
df=merge_count(df,['ip','app'],'count_gb_ip_app','uint32')
gc.collect()
df=merge_count(df,['ip','app','os'],'count_gb_ip_app_os','uint32')

def merge_var(df,columns_groupby,column,new_column_name,type='float32'):
    """Attach the per-group variance of ``column`` to every row.

    Uses pandas' ``GroupBy.var`` with its default settings, so a group with a
    single row yields NaN.

    Args:
        df: Source DataFrame.
        columns_groupby: List of column names that define a group.
        column: Column whose variance is computed within each group.
        new_column_name: Name of the resulting feature column.
        type: dtype the feature column is cast to.

    Returns:
        A new DataFrame (left merge of ``df`` with the variances); the feature
        name is also appended to the module-level ``predictors`` list.
    """
    variance_per_group=df.groupby(columns_groupby)[column].var().reset_index()
    variance_per_group.columns=columns_groupby+[new_column_name]
    merged=df.merge(variance_per_group,how='left',on=columns_groupby)
    merged[new_column_name]=merged[new_column_name].astype(type)
    predictors.append(new_column_name)
    return merged

# Variance of the click day within each (ip, app, os) group.
df=merge_var(df,['ip','app','os'],'day','var_day_gb_ip_app_os')

def merge_mean(df,columns_groupby,column,new_column_name,type='float32'):
    """Attach the per-group mean of ``column`` to every row.

    Args:
        df: Source DataFrame.
        columns_groupby: List of column names that define a group.
        column: Column that is averaged within each group.
        new_column_name: Name of the resulting feature column.
        type: dtype the feature column is cast to.

    Returns:
        A new DataFrame (left merge of ``df`` with the means); the feature
        name is also appended to the module-level ``predictors`` list.
    """
    mean_per_group=df.groupby(columns_groupby)[column].mean().reset_index()
    mean_per_group.columns=columns_groupby+[new_column_name]
    merged=df.merge(mean_per_group,how='left',on=columns_groupby)
    merged[new_column_name]=merged[new_column_name].astype(type)
    predictors.append(new_column_name)
    return merged

# Mean click hour within each (ip, app, channel) group. The column is named
# "mean_...", so it must be built with merge_mean; calling merge_var here stored
# a variance under a "mean" name and left merge_mean unused.
df=merge_mean(df,['ip','app','channel'],'hour','mean_hour_gb_ip_app_channel')

In [None]:
# Cache the engineered frame so later cells can be re-run without recomputing the features.
# The context manager closes the file handle even if pickling fails.
with open('all_v4.p','wb') as cache_file:
    cPickle.dump(df,cache_file)
# To resume from the cache instead (only unpickle files you created yourself):
# with open('all_v4.p','rb') as cache_file:
#     df=cPickle.load(cache_file)

In [None]:
# Split the combined frame back into its train and test parts by label presence.
# Test rows were loaded without `is_attributed`, so they hold NaN there after the
# two frames were stacked. The previous hard-coded cut at row 1,800,000 did not
# match the 18,000,000-row cap used when loading train.csv, which pushed training
# rows into `test` and made the prediction length differ from the click_id list.
is_train_row=df['is_attributed'].notnull()
train=df[is_train_row]
test=df[~is_train_row]
log_shape(train,test)
# Labels are complete for training rows, so they can go back to a compact integer dtype.
train_label=train['is_attributed'].astype('uint8')
def get_model_input_data(train,test):
    """Select the model input columns from the train and test frames.

    The raw columns ip/app/device/os/channel/hour are appended to the
    module-level ``predictors`` list when not already present; both frames are
    then restricted to exactly the columns in ``predictors``, in that order.

    Args:
        train: Training DataFrame containing every predictor column.
        test: Test DataFrame containing every predictor column.

    Returns:
        Tuple ``(train_x, test_x)`` of the column-restricted frames.
    """
    raw_features=('ip','app','device','os','channel','hour')
    predictors.extend([name for name in raw_features if name not in predictors])
    return train[predictors],test[predictors]

# Restrict both frames to the predictor columns (note: `train` and `test` are rebound here).
train,test=get_model_input_data(train,test)
# Hold out 20% of the training rows for validation, stratified on the label; fixed seed for repeatability.
X_train,X_test,y_train,y_test=train_test_split(train,train_label,test_size=0.2,random_state=42,stratify=train_label)

In [None]:
# Upper bound on boosting rounds handed to lgb_train.
config_lgb=dict(rounds=10000)

# LightGBM hyper-parameters. `scale_pos_weight` is only a placeholder here:
# lgb_train replaces it with the negative/positive ratio of the training labels.
params_lgb=dict(
    boosting_type='gbdt',
    objective='xentropy',
    learning_rate=0.05,
    scale_pos_weight=200,
    max_depth=-1,
    num_leaves=31,
    min_child_samples=100,
    max_bin=128,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.9,
    min_child_weight=0,
    subsample_for_bin=200000,
    min_split_gain=0,
    reg_alpha=0.99,
    reg_lambda=0.9,
    nthread=24,
    verbose=-1,
    metric='auc',
    seed=8,
)
def lgb_train(train_feature,train_label,test_feature,test_label,params,rounds):
    """Train a LightGBM model with early stopping on a validation set.

    ``scale_pos_weight`` is set to the negative/positive ratio of
    ``train_label``. The override is applied to a copy, so the caller's
    ``params`` dict is left unchanged.

    Args:
        train_feature: Training features (DataFrame; its columns are printed).
        train_label: Training labels, 0/1 valued and comparable with ``==``.
        test_feature: Validation features.
        test_label: Validation labels.
        params: LightGBM parameter dict (not modified).
        rounds: Maximum number of boosting rounds.

    Returns:
        Tuple ``(model, best_iteration, best_score)`` taken from the object
        returned by ``lgb.train``.

    Raises:
        ValueError: If ``train_label`` contains no positive (==1) samples, so
            the class ratio cannot be computed.
    """
    start=time.perf_counter()
    print(train_feature.columns)
    positives=int((train_label==1).sum())
    negatives=int((train_label==0).sum())
    if positives==0:
        raise ValueError('train_label contains no positive samples; cannot derive scale_pos_weight')
    # Copy instead of writing into the caller's dict (often a shared module-level config).
    params=dict(params,scale_pos_weight=float(negatives)/positives)
    categorical=['app','device','os','channel','hour']
    dtrain=lgb.Dataset(train_feature,label=train_label,categorical_feature=categorical)
    dtest=lgb.Dataset(test_feature,label=test_label,categorical_feature=categorical)
    print('LightGBM run :'+'round'+str(rounds))
    # Report the validation metric every 10 rounds; stop after 20 rounds without improvement.
    res=lgb.train(params,dtrain,rounds,valid_sets=[dtest],valid_names=['test'],verbose_eval=10,early_stopping_rounds=20)
    elapsed=(time.perf_counter()-start)
    # The elapsed time used to be computed and silently discarded.
    print('LightGBM training time: %.1f s' %elapsed)
    return res,res.best_iteration,res.best_score
def lgb_predict(model,test_feature):
    """Return the predictions of ``model`` for ``test_feature``.

    Args:
        model: Object exposing a ``predict`` method (the trained model).
        test_feature: Features to score; passed to ``predict`` unchanged.

    Returns:
        Whatever ``model.predict(test_feature)`` returns.
    """
    return model.predict(test_feature)

# Train on the 80% split and validate on the held-out 20%; keep the model, its best iteration and best score.
lgb_model,iteration_lgb,best_score_lgb=lgb_train(X_train,y_train,X_test,y_test,params_lgb,config_lgb['rounds'])
print('iteration_times:{}\nbest_score:{}'.format(iteration_lgb,best_score_lgb))

In [None]:
# Score the test features with the trained model.
pred_lgb=lgb_predict(lgb_model,test)
# Re-read only the click_id column, which was not loaded with the test features.
test_click_id=pd.read_csv('test.csv',dtype=dtypes,usecols=['click_id'])
# Pair each click_id with its prediction; this requires one prediction per test.csv row, in file order.
result=pd.DataFrame({'click_id':test_click_id['click_id'],'is_attributed':pred_lgb})
# NOTE(review): assumes the ./Submission directory already exists -- confirm or create it beforehand.
result.to_csv('./Submission/submission_v4.csv',index=False)