In [None]:
import pandas as pd
import numpy as np
import pandas_profiling as pd_pro
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import gc
import lightgbm as lgbm

In [None]:
# Load the scoring set. ClickDate is parsed as a datetime and the listed
# numeric columns are read with explicit 32-bit dtypes.
test = pd.read_csv(
    'input/test.csv',
    encoding='UTF-8',
    parse_dates=['ClickDate'],
    dtype={
        'ID': np.int32,
        'Carrier': np.float32,
        # 'publisherId' is intentionally not given a dtype here.
        'advertiserCampaignId': np.float32,
        'Fraud': np.float32,
    },
)
                   

In [None]:
# Keep the test IDs aside: the ID column is dropped from the feature frame
# later on, but the IDs are needed again when building the submission.
test_Id = test['ID'].to_numpy()

In [None]:
# Load the training set in a single read (no chunking); `nrows` caps how many
# rows are pulled in. ClickDate is parsed as a datetime and the listed numeric
# columns are read with explicit 32-bit dtypes.
train = pd.read_csv(
    'input/train.csv',
    encoding='UTF-8',
    nrows=17000000,
    parse_dates=['ClickDate'],
    dtype={
        'ID': np.int32,
        'Carrier': np.float32,
        'ConversionPayOut': np.float32,
        # 'publisherId' is intentionally not given a dtype here.
        'advertiserCampaignId': np.float32,
        'Fraud': np.float32,
    },
)

In [None]:
# Record the number of training rows (used later to split the combined frame
# back apart) and pull out the regression target.
len_train = len(train)
y_train = train['ConversionPayOut'].values

# Remove the target and the two other Conversion* columns from the features.
train = train.drop(columns=['ConversionStatus', 'ConversionDate', 'ConversionPayOut'])

In [None]:
# Stack train on top of test so both go through the same preprocessing.
# Row labels are not reset; the frame is split back by position later.
data = pd.concat([train, test], axis=0)

In [None]:
# The separate frames are redundant now that `data` holds both; release them.
del train
del test
gc.collect()

In [None]:
# Replace missing values in the string-valued columns with an explicit
# 'other' category. The result is assigned back to the frame instead of
# calling `data.<col>.fillna(..., inplace=True)`: that chained in-place form
# raises a FutureWarning on recent pandas and silently does nothing once
# Copy-on-Write is enabled.
for col in ('Country', 'TrafficType', 'Device', 'Browser', 'OS', 'RefererUrl'):
    data[col] = data[col].fillna('other')

# Coerce the publisher identifiers to float32; any value that cannot be parsed
# as a number becomes NaN. subPublisherId is deliberately not filled with
# 'other' beforehand, because the coercion would turn that placeholder
# straight back into NaN.
data['publisherId'] = pd.to_numeric(data['publisherId'], errors='coerce').astype(np.float32)
data['subPublisherId'] = pd.to_numeric(data['subPublisherId'], errors='coerce').astype(np.float32)

In [None]:
# Columns that are turned into integer codes before modelling.
cols_to_encode = [
    'Country',
    'TrafficType',
    'Device',
    'Browser',
    'OS',
    'RefererUrl',
    'UserIp',
    'subPublisherId',
    'publisherId',
]

# One encoder object is reused; fit_transform refits it on every column, so
# the codes are independent per column.
# NOTE(review): UserIp is encoded without any NaN filling — confirm it never
# contains missing values.
le = LabelEncoder()
for col in cols_to_encode:
    data[col] = le.fit_transform(data[col]).astype('int32')
    

In [None]:
# Minute-of-hour of the click timestamp, stored as a 32-bit integer feature.
data['tminute'] = data['ClickDate'].dt.minute.astype('int32')

In [None]:
# Quick look at the first five preprocessed rows.
data.head()

In [None]:
# ID and the raw click timestamp are not used as model features.
data = data.drop(columns=['ID', 'ClickDate'])

# Undo the earlier concat by position: the first len_train rows are the
# training set, the remainder is the test set.
train = data.iloc[:len_train]
test = data.iloc[len_train:]
test.shape

In [None]:
# Drop the name for the combined frame now that train/test have been sliced
# out of it, then run a garbage-collection pass.
del data
gc.collect()

In [None]:
# Hold out part of the training rows for validation, with a fixed seed so the
# split is repeatable.
# NOTE(review): test_size=0.6 sends 60% of the rows to the validation split
# and leaves 40% for fitting — confirm this ratio is intended.
train_X, valid_X, train_y, valid_y = train_test_split(
    train,
    y_train,
    test_size=0.6,
    random_state=32,
)

In [None]:
# Sanity-check the sizes of the four split pieces.
print(*(part.shape for part in (train_X, train_y, valid_X, valid_y)))

In [None]:
# Wrap the splits in LightGBM Datasets. The validation set references the
# training Dataset.
lgb_train = lgbm.Dataset(train_X, label=train_y)
lgb_eval = lgbm.Dataset(valid_X, label=valid_y, reference=lgb_train)

In [None]:
# The raw split pieces are no longer referenced by name after this point.
del train_X, train_y
del valid_X, valid_y

In [None]:
# LightGBM training configuration: gradient-boosted trees fitted as a
# regression problem and evaluated with RMSE.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    # Tree shape / histogram settings.
    num_leaves=200,
    min_data_in_leaf=600,
    max_bin=100,
    learning_rate=0.003,
    # Column and row subsampling.
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    # device='gpu' is left disabled.
    verbose=0,
)

In [None]:
# Train the booster for up to 500 rounds, evaluating on the validation
# Dataset with the metric configured in `params`.
# Early stopping (50 rounds without improvement) is supplied as a callback:
# the `early_stopping_rounds` keyword was removed from `lgbm.train` in
# LightGBM 4.0, whereas the callback form is accepted by both older and newer
# releases.
model_lgb = lgbm.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_eval],
    callbacks=[lgbm.early_stopping(stopping_rounds=50)],
)

In [None]:
# Score the test rows using the booster's best iteration.
y_pred = model_lgb.predict(
    test,
    num_iteration=model_lgb.best_iteration,
)

In [None]:
# Assemble the submission table with the columns in the order ID, prediction.
sub = pd.DataFrame(
    {'ID': test_Id, 'ConversionPayOut': y_pred},
    columns=['ID', 'ConversionPayOut'],
)

In [None]:
# Quick look at the first five submission rows.
sub.head()

In [None]:
# Write the submission file without the row index.
sub.to_csv(path_or_buf='lgb_v14-1.csv', index=False)