In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV, cross_validate
import category_encoders as ce

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

import lightgbm as lgb

# Load, clean and visualize data

In [None]:
# Read the sampled training clicks and report the size of the frame.
TRAIN_SAMPLE_CSV = '/kaggle/input/talkingdata-adtracking-fraud-detection/train_sample.csv'
df = pd.read_csv(TRAIN_SAMPLE_CSV)
n_rows, n_cols = df.shape
print('This data frame has %d rows and %d columns.' % (n_rows, n_cols))

In [None]:
# Preview the first five rows (five is the default row count for head()).
df.head()

In [None]:
# Render floats with two decimal places in displayed tables.
# The fully qualified option key is required: the bare 'precision' pattern also
# matches 'styler.format.precision' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.precision', 2)
# Summary statistics for every numeric column of the loaded sample.
df.describe()

In [None]:
# Class balance of the label. This notebook reports is_attributed == 0 as
# "fraudulent" clicks and is_attributed == 1 as "normal" clicks.
counts = df['is_attributed'].value_counts()
fraud, click = counts[0], counts[1]
tot = click + fraud
fraud_pct = fraud / tot * 100
click_pct = click / tot * 100
print('There are %d fraudulent clicks (%.2f%%) and %d normal clicks (%.2f%%)' % (fraud, fraud_pct, click, click_pct))

# For each categorical column, count its distinct values and the average number
# of rows per distinct value (floor division). avg_count is passed to the count
# encoder later as the per-column value for unknown categories.
cat_features = ['ip', 'app', 'device', 'os', 'channel']
avg_count = {}
for feature in cat_features:
    n_distinct = df[feature].nunique()
    avg_count[feature] = tot // n_distinct
    print('There are %d %s among %d examples, average count : %d.' % (n_distinct, feature, tot, avg_count[feature]))

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Expand click_time into day / hour / minute / second columns stored as uint8.
click_ts = pd.to_datetime(df['click_time'])
for part_col, dt_attr in (('click_day', 'day'), ('click_hr', 'hour'),
                          ('click_min', 'minute'), ('click_sec', 'second')):
    df[part_col] = getattr(click_ts.dt, dt_attr).astype('uint8')

# attributed_time is only valid for normal clicks, and the raw timestamp has
# been replaced by its parts, so both columns are removed.
df.drop(columns=['attributed_time', 'click_time'], inplace=True)

# Stratified 80/20 train/validation split with a fixed seed.
label_col = 'is_attributed'
x = df.drop(columns=label_col)
y = df[label_col]
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2021
)
train = pd.concat([x_train, y_train], axis=1)
val = pd.concat([x_val, y_val], axis=1)

# Per-column histograms of the training split.
train.hist(figsize=(10, 10))
plt.show()

# Pairwise correlations of the training split on a fixed [-1, 1] colour scale.
train_corr = train.corr()
sns.heatmap(train_corr, vmin=-1, vmax=1, center=0, cmap='coolwarm')
plt.show()

In [None]:
# Load the competition test set and give it the same click-time parts
# (day / hour / minute / second as uint8) that the training frame has.
test = pd.read_csv('/kaggle/input/talkingdata-adtracking-fraud-detection/test.csv')
test_click_ts = pd.to_datetime(test['click_time'])
for part_col, dt_attr in (('click_day', 'day'), ('click_hr', 'hour'),
                          ('click_min', 'minute'), ('click_sec', 'second')):
    test[part_col] = getattr(test_click_ts.dt, dt_attr).astype('uint8')

# The raw timestamp is no longer needed once its parts are extracted.
test.drop(columns=['click_time'], inplace=True)

## encode categorical features

In [None]:
def _join_encoded(frame, encoder, suffix):
    """Return `frame` joined with its encoded categorical columns, renamed with `suffix`."""
    encoded = encoder.transform(frame[cat_features]).add_suffix(suffix)
    return frame.join(encoded)


# Count encoding: fitted on the training split only; avg_count supplies the
# per-column handling for categories not seen during fit.
count_encode = ce.CountEncoder(cols=cat_features, handle_unknown=avg_count)
count_encode.fit(train[cat_features])
train = _join_encoded(train, count_encode, '_cnt')
val = _join_encoded(val, count_encode, '_cnt')
test = _join_encoded(test, count_encode, '_cnt')

# Target encoding: fitted on the training split and its labels, then applied
# to all three frames.
target_encode = ce.TargetEncoder(cols=cat_features, handle_unknown='value')
target_encode.fit(train[cat_features], train['is_attributed'])
train = _join_encoded(train, target_encode, '_target')
val = _join_encoded(val, target_encode, '_target')
test = _join_encoded(test, target_encode, '_target')

In [None]:
# Model inputs: count-encoded and target-encoded categoricals plus click-time parts.
features = [
    'ip_cnt', 'app_cnt', 'device_cnt', 'os_cnt', 'channel_cnt',
    'ip_target', 'app_target', 'device_target', 'os_target', 'channel_target',
    'click_day', 'click_hr', 'click_min', 'click_sec',
]

# One panel per feature on a 2 x 7 grid, filled in row-major order; each panel
# overlays the histogram of that feature for every is_attributed class.
fig, axs = plt.subplots(2, 7, figsize=(21, 6))
for panel, feature_name in zip(axs.flat, features):
    by_class = train.groupby('is_attributed')[feature_name]
    by_class.plot(kind='hist', alpha=0.3, legend=True, ax=panel)
    panel.set_xlabel(feature_name)
fig.tight_layout(pad=2)

In [None]:
# Correlation heatmap restricted to the model features and the label.
train2 = train[[*features, 'is_attributed']]
feature_corr = train2.corr()
sns.heatmap(feature_corr, vmin=-1, vmax=1, center=0, cmap='coolwarm')
plt.show()

# Train with weighted decision tree

In [None]:
# Columns fed to the tree: encoded categoricals plus the click-time parts.
feat = ['ip_cnt', 'app_cnt', 'device_cnt', 'os_cnt', 'channel_cnt',
        'ip_target', 'app_target', 'device_target', 'os_target', 'channel_target',
        'click_day', 'click_hr', 'click_min', 'click_sec']
x_train = train[feat]
y_train = train['is_attributed']

# Shallow decision tree with class 1 weighted 400x relative to class 0.
# random_state is fixed because scikit-learn permutes features at every split,
# so ties between equally good splits would otherwise be broken differently
# from run to run; the seed matches the one used for the train/validation split.
clf = DecisionTreeClassifier(
    class_weight={0: 1, 1: 400},
    max_depth=2,
    min_samples_split=2000,
    random_state=2021,
)
clf.fit(x_train, y_train)

# Draw the fitted tree with readable feature names.
tree.plot_tree(clf, feature_names=feat)
plt.show()

In [None]:
# Build the submission: one row per click_id with a score for is_attributed.
x_test = test[feat]
result = test.loc[:, ['click_id']].copy()

# The competition is scored on ROC AUC, which ranks continuous scores, so the
# probability of class 1 is written instead of hard 0/1 labels from predict().
# The column is looked up through classes_ rather than assumed to be index 1.
positive_col = list(clf.classes_).index(1)
result['is_attributed'] = clf.predict_proba(x_test)[:, positive_col]

result.to_csv('sample_wdt_ce.csv', index=False)

In [None]:
# Shell command (IPython "!" syntax): submit sample_wdt_ce.csv to the
# talkingdata-adtracking-fraud-detection competition with the given message.
# NOTE(review): presumably needs the Kaggle CLI and API credentials configured in this environment — confirm.
!kaggle competitions submit -c talkingdata-adtracking-fraud-detection -f sample_wdt_ce.csv -m "train wdt with sample data"