In [None]:
import pandas as pd
import numpy as np
import pandas_profiling as pd_pro
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import gc
import lightgbm as lgbm

In [None]:
# Load the scoring set. ClickDate is parsed as a datetime and the listed numeric
# columns are pinned to 32-bit dtypes. 'publisherId' gets no explicit dtype here;
# it is converted with pd.to_numeric further down the notebook.
test = pd.read_csv(
    'input/test.csv',
    encoding='UTF-8',
    parse_dates=['ClickDate'],
    dtype={
        'ID': np.int32,
        'Carrier': np.float32,
        'advertiserCampaignId': np.float32,
        'Fraud': np.float32,
    },
)
                   

In [None]:
# Show (rows, columns) of the scoring set that was just loaded.
test.shape

In [None]:
# List the column names available in the scoring set.
test.columns

In [None]:
# Keep the scoring-set IDs aside: the 'ID' column is dropped from the feature
# frame later, but the submission file needs it.
test_Id = test.loc[:, 'ID'].values

In [None]:
# Disabled alternative (chunked read), kept for reference:
#chunksize = 10 ** 7
#chunks = pd.read_csv('input/train.csv', chunksize=chunksize)

# Load at most the first 10 million training rows. ClickDate is parsed as a
# datetime and the listed numeric columns are pinned to 32-bit dtypes.
# 'publisherId' gets no explicit dtype here; it is converted with pd.to_numeric
# further down the notebook.
train = pd.read_csv(
    'input/train.csv',
    encoding='UTF-8',
    parse_dates=['ClickDate'],
    nrows=10_000_000,
    dtype={
        'ID': np.int32,
        'Carrier': np.float32,
        'ConversionPayOut': np.float32,
        'advertiserCampaignId': np.float32,
        'Fraud': np.float32,
    },
)

In [None]:
# List the column names available in the training set.
train.columns

In [None]:
# Show (rows, columns) of the training set that was just loaded.
train.shape

In [None]:
# Remember how many rows belong to the training set so the combined frame can
# be split back apart after feature engineering.
len_train = len(train)

# Regression target, taken out before the column is removed from the features.
y_train = train['ConversionPayOut'].values

# Drop the target and the other conversion columns from the training features.
train = train.drop(columns=['ConversionStatus', 'ConversionDate', 'ConversionPayOut'])

In [None]:
# Stack train on top of test so every feature below is computed over both sets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of `train` and `test` would both start at 0 and collide, which makes
# any label-based alignment on `data` ambiguous. The later positional split at
# `len_train` is unaffected.
data = pd.concat([train, test], ignore_index=True)

In [None]:
# `data` now holds the rows of both frames, so the separate names can be released.
del train, test
gc.collect()

In [None]:
# Replace missing categorical values with an explicit 'other' level.
# The result is assigned back to the column instead of calling
# `data.<col>.fillna(..., inplace=True)`: that form is chained assignment, which
# raises a warning on recent pandas and does not modify `data` at all once
# copy-on-write is active.
for col in ['Country', 'TrafficType', 'Device', 'Browser', 'OS', 'RefererUrl']:
    data[col] = data[col].fillna('other')

# Coerce the publisher identifiers to numbers; any non-numeric value becomes NaN.
# subPublisherId is intentionally not filled with 'other' beforehand, because
# errors='coerce' would turn that placeholder straight back into NaN.
data['publisherId'] = pd.to_numeric(data['publisherId'], errors='coerce')
data['subPublisherId'] = pd.to_numeric(data['subPublisherId'], errors='coerce')

In [None]:
# Columns that are replaced by integer codes (one code per distinct value).
cols_to_encode = [
    'Country', 'TrafficType', 'Device', 'Browser', 'OS',
    'RefererUrl', 'UserIp', 'subPublisherId', 'publisherId',
]

# A single encoder is re-fitted for every column; the mapping is not kept,
# only the encoded values are written back into `data`.
le = LabelEncoder()
for column_name in cols_to_encode:
    le.fit(data[column_name])
    data[column_name] = le.transform(data[column_name])
    

In [None]:
# Derive calendar features from the click timestamp: weekday, hour and minute.
for feature_name, dt_part in (
    ('tweekday', 'weekday'),
    ('thour', 'hour'),
    ('tminute', 'minute'),
):
    data[feature_name] = getattr(data['ClickDate'].dt, dt_part)

In [None]:
#data.head(5)

In [None]:
# Number of distinct referer URLs seen per user IP, as a two-column lookup table
# (UserIp, ref_url_cnt).
ref_url_cnt = (
    data.groupby('UserIp')['RefererUrl']
        .nunique()
        .rename('ref_url_cnt')
        .reset_index()
)

In [None]:
# Number of distinct publishers seen per user IP, as a two-column lookup table
# (UserIp, pubId_cnt).
pubId_cnt = (
    data.groupby('UserIp')['publisherId']
        .nunique()
        .rename('pubId_cnt')
        .reset_index()
)

In [None]:
# Number of distinct advertiser campaigns seen per user IP, as a two-column
# lookup table (UserIp, compId_cnt).
adv_comp_cnt = (
    data.groupby('UserIp')['advertiserCampaignId']
        .nunique()
        .rename('compId_cnt')
        .reset_index()
)

In [None]:
# Attach the three per-IP distinct-count tables to every click row.
# Left joins keep every row of `data`, in its original order.
data = (
    data.merge(ref_url_cnt, how='left', on='UserIp')
        .merge(pubId_cnt, how='left', on='UserIp')
        .merge(adv_comp_cnt, how='left', on='UserIp')
)

In [None]:
# The lookup tables have been merged into `data`; release them.
del ref_url_cnt, pubId_cnt, adv_comp_cnt
gc.collect()

In [None]:
# Number of click rows that share this row's referer URL.
clicks_by_referer = data.groupby('RefererUrl')['ID']
data['refUrl_user_cnt'] = clicks_by_referer.transform('count')

In [None]:
# Number of click rows that share this row's user IP.
clicks_by_user = data.groupby('UserIp')['ID']
data['user_cnt'] = clicks_by_user.transform('count')

In [None]:
# Click counts per user IP at two time granularities:
#   user_week_cnt - same IP and same weekday
#   user_hr_cnt   - same IP, same weekday and same hour
for count_col, group_keys in (
    ('user_week_cnt', ['UserIp', 'tweekday']),
    ('user_hr_cnt', ['UserIp', 'tweekday', 'thour']),
):
    data[count_col] = data.groupby(group_keys)['ID'].transform('count')

In [None]:
# Ratio features: clicks per distinct referer URL / publisher / campaign for the
# row's user IP.
# NOTE(review): nunique() skips NaN, so compId_cnt may be 0 for an IP whose
# advertiserCampaignId values are all missing, which would make the ratio inf.
# Confirm against the data before feeding these columns to a model.
for share_col, distinct_col in (
    ('user_refurl_share', 'ref_url_cnt'),
    ('user_pubid_share', 'pubId_cnt'),
    ('user_compid_share', 'compId_cnt'),
):
    data[share_col] = data['user_cnt'].div(data[distinct_col])

In [None]:
# 'ID' and the raw timestamp are not used as model features.
data = data.drop(columns=['ID', 'ClickDate'])

# Split the combined frame back by position: the first `len_train` rows are the
# training set, the remainder is the scoring set.
train = data.iloc[:len_train]
test = data.iloc[len_train:]
test.shape

In [None]:
# Release the combined frame; `train` and `test` were sliced from it above and
# remain available.
del data
gc.collect()

# Sanity check on what goes into the model. The original cell printed
# train_X / train_y / valid_X / valid_y, which are never created in this
# notebook (NameError), and then deleted `train` and `y_train` even though the
# following cells cast columns on `train` and fit on (train, y_train). Both
# variables are therefore kept alive here.
assert train.shape[0] == y_train.shape[0], "features and target row counts differ"
print(train.shape, y_train.shape, test.shape)

In [None]:
# Inspect the column dtypes of the training features.
train.dtypes

In [None]:
# Downcast the ratio features to float32.
# `train` and `test` are slices of the former combined frame, so assigning
# columns on them in place triggers SettingWithCopyWarning; astype() with a
# mapping returns a new frame instead. The same cast is applied to `test` so the
# features used for prediction have the same precision as those used for fitting.
share_dtypes = {
    'user_refurl_share': np.float32,
    'user_pubid_share': np.float32,
    'user_compid_share': np.float32,
}
train = train.astype(share_dtypes)
test = test.astype(share_dtypes)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

In [None]:
# First-level regressors whose predictions are stacked.
lasso = Lasso(random_state=1)
ridge = Ridge(random_state=1)
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
regressors = [svr_lin, lr, ridge, lasso]

# Second-level model: an RBF-kernel SVR fitted on the first-level predictions.
svr_rbf = SVR(kernel='rbf')
stregr = StackingRegressor(meta_regressor=svr_rbf, regressors=regressors)

In [None]:
# Hyper-parameter grid for the stacked model, keyed as '<estimator>__<parameter>'.
# NOTE(review): the 'meta-svr__' prefix for the meta-regressor depends on the
# installed mlxtend version (newer releases appear to use 'meta_regressor__');
# confirm with stregr.get_params().keys() before running the search.
params = {}
params['lasso__alpha'] = [0.1, 1.0, 10.0]
params['ridge__alpha'] = [0.1, 1.0, 10.0]
params['svr__C'] = [0.1, 1.0, 10.0]
params['meta-svr__C'] = [0.1, 1.0, 10.0, 100.0]
params['meta-svr__gamma'] = [0.1, 1.0, 10.0]

In [None]:
# Exhaustive search over `params` with 5-fold cross-validation. refit=True
# re-trains the best configuration on the full data so `grid` can predict directly.
grid = GridSearchCV(stregr, params, refit=True, cv=5)

In [None]:
# Run the grid search on the engineered training features and the payout target.
grid.fit(X=train, y=y_train)

In [None]:
# Predict payouts for the scoring set with the refitted best estimator.
# The original line called `gbm.predict(..., num_iteration=gbm.best_iteration)`,
# but no `gbm` model is created anywhere in this notebook (NameError); the only
# fitted model is the grid-searched stacking regressor in `grid`.
y_pred = grid.predict(test)

In [None]:
# Assemble the submission: one row per scoring-set ID with its predicted payout,
# with the columns in the order ID, ConversionPayOut.
sub = pd.DataFrame(
    {'ID': test_Id, 'ConversionPayOut': y_pred},
    columns=['ID', 'ConversionPayOut'],
)

In [None]:
# Preview the first rows of the submission (`su` was a typo for `sub` and
# raised NameError).
sub.head(5)

In [None]:
# Write the submission to disk; index=False keeps the row index out of the file.
sub.to_csv('lgbm_v1.csv', index=False)