Fraud risk is everywhere, but for companies that advertise online, click fraud can happen at an overwhelming volume, resulting in misleading click data and wasted money. Ad channels can drive up costs by simply clicking on the ad at a large scale. With over 1 billion smart mobile devices in active use every month, China is the largest
mobile market in the world and therefore suffers from huge volumes of fraudulent traffic.

TalkingData, China’s largest independent big data service platform, covers over 70% of active mobile devices nationwide. They handle 3 billion clicks per day, of which 90% are potentially fraudulent. Their current approach to preventing click fraud for app developers is to measure the journey of a user’s click across their portfolio, and flag IP addresses that produce lots of clicks but never end up installing apps. With this information, they've built an IP blacklist and a device blacklist.

While successful, they want to always be one step ahead of fraudsters and have turned to the Kaggle community for help in further developing their solution. In their 2nd competition with Kaggle,

**your mission Jim, should you choose to accept it**


you’re challenged to build an algorithm that predicts whether a user will download an app after clicking a mobile app ad. To support your modeling, they have provided a generous dataset covering approximately 200 million clicks over 4 days!

## A simple solution attempt

Data fields

Each row of the training data contains a click record, with the following features.

    ip: ip address of click.
    app: app id for marketing.
    device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
    os: os version id of user mobile phone
    channel: channel id of mobile ad publisher
    click_time: timestamp of click (UTC)
    attributed_time: if the user downloads the app after clicking an ad, this is the time of the app download
    is_attributed: the target that is to be predicted, indicating the app was downloaded

Note that ip, app, device, os, and channel are encoded.

The test data is similar, with the following differences:

    click_id: reference for making predictions
    is_attributed: not included


https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/overview

In [None]:
# basics
import os
import numpy as np
import pandas as pd
import datetime as dt

#graphs
import matplotlib.pyplot as plt
import seaborn as sns

#models
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
import lightgbm as lgb

#intermediary tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import silhouette_score

In [None]:
# Seed reused by the stochastic steps below (KMeans, row sampling, SMOTE,
# XGBoost) so that reruns of the notebook give the same results.
random_state = 42

In [None]:
def get_data_files(filePaths, hdr = 'infer'):
    """Read every CSV in *filePaths* and stack the rows into one DataFrame.

    Parameters
    ----------
    filePaths : iterable of str
        Paths of the CSV files to load.
    hdr : int, list of int, None or 'infer', default 'infer'
        Forwarded to ``pandas.read_csv`` as ``header``. Pass ``None`` for
        files that have no header row.

    Returns
    -------
    pandas.DataFrame
        All rows with a fresh 0..n-1 index. Files are stacked in reverse
        order of *filePaths* (rows of the last file come first). An empty
        DataFrame is returned when *filePaths* is empty.
    """
    frames = [pd.read_csv(csvfile, header=hdr) for csvfile in filePaths]
    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end: growing a frame inside the loop recopies
    # every previously loaded row on each iteration. The list is reversed to
    # keep the historical "latest file first" row order.
    return pd.concat(frames[::-1], ignore_index=True)

In [None]:
def x_elbow(df,range0=np.arange(2,10)):
    """Fit KMeans for each candidate k and plot distortion and silhouette score.

    Parameters
    ----------
    df : array-like
        Feature matrix handed to ``KMeans.fit`` and ``silhouette_score``.
    range0 : iterable of int, default 2..9
        Candidate numbers of clusters, tried in order.

    Returns
    -------
    (list, list)
        ``distortions`` (KMeans inertia per k) and ``silhuettes``
        (silhouette score per k), both aligned with ``range0``.
    """
    # Materialise once so a one-shot iterable can be used for both the
    # fitting loop and the x-axis of the plots.
    ks = list(range0)
    distortions = []
    silhuettes = []

    for k in ks:
        x_cluster = KMeans(n_clusters=k,init='k-means++', n_init=20, random_state=random_state,max_iter=400)
        x_cluster.fit(df)
        distortions.append(x_cluster.inertia_)
        silhuettes.append(silhouette_score(df, x_cluster.labels_, metric='euclidean'))

    #https://matplotlib.org/2.2.5/gallery/api/two_scales.html
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('k')
    ax1.set_ylabel('Distortion', color=color)
    # Draw the curve in the same colour as its axis label; it used to be blue,
    # which made it indistinguishable from the silhouette curve.
    ax1.plot(ks, distortions, color=color, marker='x', linestyle='-')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel('silhuette score', color=color)  # we already handled the x-label with ax1
    ax2.plot(ks, silhuettes, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    return distortions, silhuettes

In [None]:
def x_hist_stats(series0, title=''):
    """Show a 50-bin histogram of *series0* and print its mean, std and median.

    Parameters
    ----------
    series0 : object with ``mean()``, ``std()`` and ``median()`` methods
        Values to plot and summarise (a pandas Series in this notebook).
    title : str, default ''
        Used as the plot title and inside each printed statistic line.
    """
    bin_count = 50
    figure, axes = plt.subplots(1, 1, tight_layout=True)
    axes.hist(series0, bin_count)
    figure.tight_layout()
    axes.set_title(title)
    plt.show()

    # Summary statistics, labelled with the same title as the plot.
    print(f'mean {title}: {series0.mean()!s}')
    print(f'std {title}: {series0.std()!s}')
    print(f'median {title}: {series0.median()!s}')

The files pool
<pre>
/kaggle/input/talkingdata-adtracking-fraud-detection/sample_submission.csv
/kaggle/input/talkingdata-adtracking-fraud-detection/train_sample.csv
/kaggle/input/talkingdata-adtracking-fraud-detection/test_supplement.csv
/kaggle/input/talkingdata-adtracking-fraud-detection/train.csv
/kaggle/input/talkingdata-adtracking-fraud-detection/test.csv
</pre>


In [None]:
# Main training sample (read with its header row).
filePaths = ['../input/talkingdata-adtracking-fraud-detection/train_sample.csv']
base = get_data_files(filePaths)

# Extra samples: every file under the extra dataset directory. os.walk yields
# entries in arbitrary order, so sort the paths to keep the row order of the
# concatenated frame reproducible between runs.
filePaths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input/adtracking-click-for-app-250k-samples-from-total')
    for filename in filenames
)

# These files are read without a header row, so reuse the column names of base.
extra_train = get_data_files(filePaths, None)
if extra_train.empty:
    # No extra files found: keep an empty frame with the expected columns
    # instead of failing on a column-count mismatch.
    extra_train = pd.DataFrame(columns=base.columns)
else:
    extra_train.columns = base.columns

In [None]:
# Row and column counts of the loaded training sample.
base.shape

In [None]:
# Peek at the first rows to check how the columns were loaded.
base.head()

In [None]:
# Last 100 rows among the clicks flagged as attributed (is_attributed == 1).
base[base['is_attributed'] == 1].tail(100)

### *Feature engineering, Time analysis*

In [None]:
# Parse the click timestamp and split it into calendar parts.
base['click_time'] = pd.to_datetime(base['click_time'])
click_parts = base['click_time'].dt
for new_column, part in (('hr', 'hour'), ('day', 'day'), ('weekday', 'weekday'), ('month', 'month')):
    base[new_column] = getattr(click_parts, part)

# Parse the attribution timestamp and keep its hour as a separate column.
base['attributed_time'] = pd.to_datetime(base['attributed_time'])
base['hr_at'] = base['attributed_time'].dt.hour

In [None]:
# Keep only the clicks that have an attribution timestamp. Take an explicit
# copy so that adding columns below never writes into a view of base.
attributed_time = base.loc[base['attributed_time'].notna(), ['click_time', 'attributed_time']].copy()

# Both columns were already parsed to datetime in the previous cell.
attributed_time['attr_hr'] = attributed_time['attributed_time'].dt.hour
attributed_time['click_hr'] = attributed_time['click_time'].dt.hour

# Whole minutes between click and attribution, as floats.
# astype('timedelta64[m]') is rejected by pandas >= 2.0, so derive the minutes
# from total_seconds() instead.
click_delay = attributed_time['attributed_time'] - attributed_time['click_time']
attributed_time['click2attr'] = click_delay.dt.total_seconds() // 60
attributed_time.head(200)

In [None]:
# Hour of day at which attributions happen. Pass a title so the plot and the
# printed statistics are labelled instead of reading "mean : ...".
x_hist_stats(attributed_time['attr_hr'], 'attribution hour')

In [None]:
# Click hour for the clicks that were attributed. Pass a title so the plot and
# the printed statistics are labelled instead of reading "mean : ...".
x_hist_stats(attributed_time['click_hr'], 'click hour (attributed)')

In [None]:
# Click hour for the clicks that were not attributed, for comparison. Pass a
# title so the plot and the printed statistics are labelled.
x_hist_stats(base.loc[base['is_attributed'] == 0, 'hr'], 'click hour (not attributed)')

In [None]:
# Delay between click and attribution, in minutes. Pass a title so the plot
# and the printed statistics are labelled instead of reading "mean : ...".
x_hist_stats(attributed_time['click2attr'], 'click-to-attribution minutes')

We can see slight differences between the click hours of attributed and non-attributed customers.
We can also see that most clicks are attributed almost immediately.
Next we check the attribution delay per click hour, to see whether attribution is immediate in some hours and slower in others.
The median is the best summary here, since the long tail skews the mean.

In [None]:
# Median click-to-attribution delay for each click hour.
attributed_time.groupby('click_hr')[['click2attr']].median()

We can use that as a rough probability estimator for attribution, which we'll be able to apply to any test data. <br>
We'll need to save the results in a table and merge it into the train set and into any test/validation data. <br>
In the case of hours, we can afford to add it to the data before splitting it into train and test.

In [None]:
# Lookup table: median click-to-attribution delay (minutes) per click hour.
click2attr_per_hour = attributed_time.groupby('click_hr').agg({'click2attr':'median'})

# Left join so that clicks made in an hour with no attributed click are kept;
# the default inner join silently dropped those rows from the data set.
base = pd.merge(base, click2attr_per_hour, how='left', left_on='hr', right_index=True)

# Hours missing from the lookup table fall back to the overall median delay,
# so the feature has no NaN for the clustering and model steps further down.
base['click2attr'] = base['click2attr'].fillna(attributed_time['click2attr'].median())
base.head()

In [None]:
# Replace both timestamp columns by their date ordinal (a plain day number),
# which makes them numeric and drops the time of day.
# NOTE(review): attributed_time has missing values (they are filtered out when
# attributed_time is built above) - confirm how NaT maps through toordinal.
for time_column in ('click_time', 'attributed_time'):
    base[time_column] = base[time_column].map(dt.datetime.toordinal)

In [None]:
# Columns present after the feature-engineering cells above.
base.columns

In [None]:
# Pairwise correlation of all columns, drawn as a lower-triangle heatmap.
sns.set_theme(style="white")
corrdf = base
corr = corrdf.corr()

# Boolean mask hiding the upper triangle (diagonal included), since the
# matrix is symmetric.
mask = np.triu(np.ones(corr.shape, dtype=bool))
colormap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(11, 9))
heatmap_options = dict(
    mask=mask,
    cmap=colormap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

So our target variable, is_attributed, is mostly correlated with ip and app, and app is, for obvious reasons, strongly correlated with os and device.
The hour of attribution (hr_at) is, in turn, correlated with ip.

Next we look for clusters of apps, OSs, devices and channels, in order to find a common pattern and use it as a predictor.

In [None]:
# Mark "no attribution" with -1. Assign the result back: an inplace fillna on
# a column selection is deprecated in pandas and does not update base when
# copy-on-write is enabled.
base['hr_at'] = base['hr_at'].fillna(-1)

In [None]:
# Sampling for KMeans: the elbow/silhouette search below runs on a subsample
# of at most 10,000 rows. Capping at len(base) avoids the error raised when
# sampling more rows than exist without replacement.
KMtrain = base.sample(n = min(10000, len(base)), replace=False, random_state = random_state)

In [None]:
# Fit KMeans for k = 2..9 on the sampled rows and plot distortion and
# silhouette score, to choose the number of clusters.
# NOTE(review): the features are passed unscaled - confirm that wide-ranging
# columns such as ip are meant to dominate the Euclidean distance.
dists, sils = x_elbow(KMtrain[['app','device','os','channel','ip','click2attr']],np.arange(2,10))

The elbow (distortion) plot suggests that 4 clusters is the best choice, and the silhouette score points to 4 as well. 4 it is.

In [None]:
# Final clustering on the full data set. n_clusters follows the elbow and
# silhouette analysis above, which settled on 4 clusters (the code previously
# used 8, contradicting that conclusion).
cluster_features = ['app','device','os','channel','ip','click2attr']
kmeans = KMeans(init="k-means++", n_clusters=4, n_init=6,random_state=random_state,max_iter=300).fit(base[cluster_features])

# Store each row's cluster id as an extra feature for the classifiers.
base['cluster'] = kmeans.predict(base[cluster_features])

In [None]:
# 70/30 split. Seeded with the notebook-wide random_state so the split is
# reproducible, and stratified on the target so both parts keep the same
# share of attributed clicks.
train, test = train_test_split(
    base,
    test_size=0.3,
    random_state=random_state,
    stratify=base['is_attributed'],
)

# The target itself, plus the two columns derived from the attribution
# timestamp, must not be used as features.
non_feature_columns = ['is_attributed', 'attributed_time', 'hr_at']

Y_train = train['is_attributed']
X_train = train.drop(columns=non_feature_columns)
Y_test = test['is_attributed']
X_test = test.drop(columns=non_feature_columns)
X_test.head()

In [None]:
# Oversample the minority class on the training split only; the test split
# keeps its original class balance.
sm = SMOTE(random_state = random_state)
# Pass the target as-is: it is already one-dimensional, and Series.ravel() is
# deprecated in recent pandas.
X_train, Y_train = sm.fit_resample(X_train, Y_train)

In [None]:
# Class balance of the resampled training target.
total_cases = Y_train.size
attributed_cases = sum(Y_train)
print(f'total cases: {total_cases!s}')
print(f'total attributed: {attributed_cases!s}')
print(f'ratio: {attributed_cases / total_cases!s}')
print('well, after smoting, what did we expect')

In [None]:
# XGBoost hyper-parameters collected in one place. scale_pos_weight is left at
# its default; the training set was resampled with SMOTE above.
xgb_params = {
    'random_state': random_state,
    'learning_rate': 0.1,
    'max_depth': 4,
    'min_child_weight': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.8,
    'reg_lambda': 0.6,
}
model = XGBClassifier(**xgb_params)

In [None]:
# Train XGBoost on the resampled training set, then score the test split.
model.fit(X_train, Y_train)
prediction = model.predict(X_test)  # predicted class labels
probabilities = model.predict_proba(X_test)  # predicted class probabilities

In [None]:
# Classification metrics of the XGBoost predictions on the test split.
print(classification_report(Y_test, prediction))

In [None]:
# Second candidate: LightGBM with a small tree size and coarse feature bins.
# Note that this rebinds `model`, replacing the XGBoost classifier.
lgbm_params = {'num_leaves': 10, 'max_bin': 45}
model = lgb.LGBMClassifier(**lgbm_params)

In [None]:
# Train LightGBM on the same resampled training set and score the test split;
# prediction/probabilities now hold the LightGBM outputs.
model.fit(X_train, Y_train)
prediction = model.predict(X_test)  # predicted class labels
probabilities = model.predict_proba(X_test)  # predicted class probabilities

In [None]:
# Classification metrics of the LightGBM predictions on the test split.
print(classification_report(Y_test, prediction))

In [None]:
# Which features the last fitted model relied on: horizontal bar chart of the
# (up to) 25 largest importances, smallest at the bottom.
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
top_importances = feat_importances.nlargest(25).sort_values()

plt.figure(figsize=(12, 6))
top_importances.plot.barh()

plt.show()

In [None]:
#https://www.kaggle.com/ravikishore/titanic-survival-prediction
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# ROC analysis needs a continuous score. With the hard 0/1 predictions the
# curve has a single operating point and the AUC is misleading, so use the
# predicted probability of the positive class (second column) instead.
positive_scores = probabilities[:, 1]

logit_roc_auc = roc_auc_score(Y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(Y_test, positive_scores)
plt.figure()
plt.plot(fpr, tpr, label='Light GBM (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()