In [None]:
import pandas as pd
import gc
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

# Input CSV locations. Only train_file_path is read in this cell.
# (An earlier duplicate of this configuration pointed train_file_path at
# "train_sample.csv" and was immediately overwritten; that dead copy and the
# repeated import block were removed. The imports at the top of the cell
# already cover every name used here.)
train_file_path="../input/talkingdata-adtracking-fraud-detection/train.csv"
test_file_path="../input/talkingdata-adtracking-fraud-detection/test.csv"

# Explicit unsigned-integer dtypes handed to read_csv so the columns are not
# loaded with pandas' default 64-bit integers. 'click_id' is listed here but
# is not among the columns requested from the training file below.
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

# Window of the training file to load. File row 0 (the header) is kept,
# file rows 1 .. TRAIN_FIRST_ROW - 1 are skipped, and the next TRAIN_NROWS
# rows are read.
TRAIN_FIRST_ROW = 122991234
TRAIN_NROWS = 10000000
TRAIN_COLUMNS = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']

train_data = pd.read_csv(
    train_file_path,
    usecols=TRAIN_COLUMNS,
    dtype=dtypes,
    parse_dates=['click_time'],
    skiprows=range(1, TRAIN_FIRST_ROW),
    nrows=TRAIN_NROWS,
)

# Calendar features derived from the parsed click timestamp. The insertion
# order (weekday, click_hour, click_day) is kept because later cells feed the
# whole frame to models, where column order matters.
click_dt = train_data['click_time'].dt
train_data['weekday'] = click_dt.dayofweek.astype('uint8')
train_data['click_hour'] = click_dt.hour.astype('uint8')
train_data['click_day'] = click_dt.day.astype('uint8')
del click_dt
print("Training File read succesfully!")


**Adding and Extracting new Features**

In [None]:
from sklearn.model_selection import train_test_split


def add_group_feature(frame, group_cols, value_col, agg, feature_name):
    """Append a per-group aggregate of ``value_col`` to ``frame`` in place.

    Every row receives the aggregate computed over all rows that share its
    values in ``group_cols``. This is the same result as aggregating with
    groupby and left-merging the summary back on the keys, without building
    the intermediate summary frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; a new column is added at the end.
    group_cols : list of str
        Columns defining the groups.
    value_col : str
        Column that is aggregated inside each group.
    agg : str
        Aggregation name understood by ``GroupBy.transform``
        (this notebook uses 'count', 'var' and 'mean').
    feature_name : str
        Name of the column that is created.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    frame[feature_name] = frame.groupby(group_cols)[value_col].transform(agg)
    return frame


# (feature_name, group_cols, value_col, agg) in the order the columns are
# appended. The order is significant: later cells pass the whole frame to
# models positionally.
GROUP_FEATURES = [
    ('ip_app_count',            ['ip', 'app'],                     'os',         'count'),
    ('ip_time_count',           ['ip', 'click_day', 'click_hour'], 'os',         'count'),
    ('ip_app_os_count',         ['ip', 'app', 'os'],               'channel',    'count'),
    ('ip_app_os_var_hour',      ['ip', 'app', 'os'],               'click_hour', 'var'),
    ('ip_app_os_var_weekday',   ['ip', 'app', 'os'],               'weekday',    'var'),
    ('ip_day_chn_var_hour',     ['ip', 'click_day', 'channel'],    'click_hour', 'var'),
    ('ip_app_chn_mean_hour',    ['ip', 'app', 'channel'],          'click_hour', 'mean'),
    ('ip_app_chn_mean_weekday', ['ip', 'app', 'channel'],          'weekday',    'mean'),
    # NOTE(review): despite its name this feature is grouped by
    # ip/app/channel, not ip/app/os. The name is kept because the LightGBM
    # cell selects the column by this exact string.
    ('ip_app_os_var_day',       ['ip', 'app', 'channel'],          'click_day',  'var'),
]

# NOTE(review): the aggregates are computed over all loaded rows before the
# train/hold-out split, so hold-out rows contribute to the features of
# training rows. Confirm this is intended.
for feature_name, group_cols, value_col, agg in GROUP_FEATURES:
    add_group_feature(train_data, group_cols, value_col, agg, feature_name)

# Variance is undefined (NaN) for groups with a single row; treat it as 0.
train_data = train_data.fillna(0)

# Store the count features compactly. uint32 is used rather than uint16:
# up to 10,000,000 rows are loaded, so a single group can exceed 65,535 rows
# and a uint16 cast would wrap around silently.
for count_col in ('ip_time_count', 'ip_app_count', 'ip_app_os_count'):
    train_data[count_col] = train_data[count_col].astype('uint32')

# Separate the target from the model inputs; the raw timestamp is dropped
# because its information is carried by weekday/click_hour/click_day.
y_train = train_data.is_attributed
x_train = train_data.drop(['is_attributed', 'click_time'], axis=1)


In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# repeatable. Note that x_train / y_train are rebound to the training part.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x_train,
    y_train,
    random_state=99,
    test_size=0.1,
)

# The combined frame is no longer referenced after the split.
del train_data
gc.collect()

In [None]:
def perf_measure(y_actual, y_pred):
    """Print and return the binary confusion-matrix counts.

    Labels are compared position by position, so any iterable works
    (list, NumPy array, pandas Series with an arbitrary index, ...).

    Parameters
    ----------
    y_actual : iterable
        Ground-truth labels; 1 is the positive class, 0 the negative class.
    y_pred : iterable
        Predicted labels, same length as ``y_actual``. Predictions that are
        neither 0 nor 1 are not counted in any cell.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)``.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # Materialise both inputs so that access is positional; indexing a
    # pandas Series with [i] would look up the *label* i instead.
    actual = list(y_actual)
    predicted = list(y_pred)
    if len(actual) != len(predicted):
        raise ValueError(
            "y_actual and y_pred must have the same length, got "
            + str(len(actual)) + " and " + str(len(predicted))
        )

    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for truth, guess in zip(actual, predicted):
        if guess == 1:
            # Predicted positive: correct only when the truth is also 1.
            if truth == 1:
                TP += 1
            else:
                FP += 1
        elif guess == 0:
            # Predicted negative: correct only when the truth is also 0.
            if truth == 0:
                TN += 1
            else:
                FN += 1

    print("TP: "+ str(TP))
    print("FP: "+ str(FP))
    print("TN: "+ str(TN))
    print("FN: "+ str(FN))

    return TP, FP, TN, FN

**Decision Tree Classifier**

In [None]:
# Fit a single decision tree (depth capped at 35, fixed seed) on the
# training split. `metrics` is already imported in the first cell, so it is
# not re-imported here.
dt = DecisionTreeClassifier(max_depth=35, random_state=1234)
dt.fit(x_train, y_train)
print("DT Model built succesfully!")

# Hard class predictions on the hold-out split, summarised as
# confusion-matrix counts.
y_pred = dt.predict(x_test)
perf_measure(y_test.tolist(), y_pred.tolist())

# Release the fitted tree and its predictions before the next model.
del dt, y_pred
gc.collect()


**XGBoost**

In [None]:
# Booster configuration: histogram tree construction grown leaf-wise
# ('lossguide') with up to 1400 leaves per tree, row and column subsampling,
# L1 regularisation and a positive-class weight of 9.
# NOTE(review): 'silent' is the older spelling of the logging switch; newer
# XGBoost releases use 'verbosity' instead - confirm against the installed
# version.
params = {'eta': 0.3,
          'tree_method': "hist",
          'grow_policy': "lossguide",
          'max_leaves': 1400,
          'max_depth': 0,
          'subsample': 0.9,
          'colsample_bytree': 0.7,
          'colsample_bylevel':0.7,
          'min_child_weight':0,
          'alpha':4,
          'objective': 'binary:logistic',
          'scale_pos_weight':9,
          'eval_metric': 'auc',
          'nthread':8,
          'random_state': 99,
          'silent': True}

# Train for a fixed 30 boosting rounds. Only the training set is on the
# watchlist, so the AUC printed each round is a training AUC.
dtrain = xgb.DMatrix(x_train, y_train)
watchlist = [(dtrain, 'train')]
model = xgb.train(params, dtrain, 30, watchlist, maximize=True, verbose_eval=1)
del dtrain
gc.collect()
print("XGB Model built succesfully!")

# Score the hold-out split and threshold the probabilities at 0.5.
# No early stopping is requested above, so every trained round is used;
# predicting with the full model gives the same result as limiting it to
# `model.best_ntree_limit`, without depending on that version-specific
# attribute.
dtest = xgb.DMatrix(x_test)
y_pred = (model.predict(dtest) > 0.5).astype('int')
perf_measure(y_test.tolist(), y_pred.tolist())

# Release the test matrix and predictions before the next model.
del dtest
del y_pred
gc.collect()


**LightGBM**

In [None]:
# Columns LightGBM should treat as categorical, and the full list of model
# inputs in the order they are handed to the Dataset.
categorical = ['ip', 'app', 'device', 'os', 'channel', 'click_day', 'click_hour', 'weekday']
predictors = [
    'ip', 'device', 'app', 'os', 'channel', 'click_day', 'click_hour', 'weekday',
    'ip_app_count', 'ip_time_count', 'ip_app_os_count',
    'ip_app_os_var_hour', 'ip_app_os_var_weekday', 'ip_day_chn_var_hour',
    'ip_app_chn_mean_hour', 'ip_app_chn_mean_weekday', 'ip_app_os_var_day',
]

# Booster configuration (gradient-boosted trees, binary objective, AUC metric).
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.1,
    num_leaves=1400,
    max_depth=0,
    min_child_samples=100,
    max_bin=100,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    min_child_weight=0,
    subsample_for_bin=200000,
    min_split_gain=0,
    reg_alpha=0,
    reg_lambda=0,
    # nthread=8,
    verbose=0,
    scale_pos_weight=99,
)

# Training and validation datasets built from the raw value arrays; the
# validation set is the hold-out split.
dtrain = lgbm.Dataset(
    data=x_train[predictors].values,
    label=y_train.values,
    categorical_feature=categorical,
    feature_name=predictors,
)
dvalid = lgbm.Dataset(
    data=x_test[predictors].values,
    label=y_test.values,
    categorical_feature=categorical,
    feature_name=predictors,
)

# Per-round metric history is written into `results` by the training call.
results = {}

# Up to 350 rounds, with early stopping after 30 rounds and both sets
# monitored under the names 'train' and 'valid'.
lgb_model = lgbm.train(
    params,
    dtrain,
    num_boost_round=350,
    valid_sets=[dtrain, dvalid],
    valid_names=['train', 'valid'],
    early_stopping_rounds=30,
    evals_result=results,
    verbose_eval=True,
    feval=None,
)
del dtrain
gc.collect()
print("LGB Model built succesfully!")

# Predict with the best iteration found during training, threshold at 0.5
# and report the confusion-matrix counts.
y_pred = (
    lgb_model.predict(
        x_test[predictors],
        num_iteration=lgb_model.best_iteration,
    ) > 0.5
).astype('int')
perf_measure(y_test.tolist(), y_pred.tolist())
print("LGBM: Done")

# The hold-out data and predictions are not needed past this point.
del y_test, y_pred, x_test
gc.collect()
