In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing, metrics, ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

In [None]:
# Read the sampled training clicks into the working frame `df`.
train_sample_path = "../input/talkingdata-adtracking-fraud-detection/train_sample.csv"
df = pd.read_csv(train_sample_path)

In [None]:
# Report the frame dimensions as a (rows, columns) tuple.
train_shape = df.shape
print("Count of rows and column are: ", train_shape)

In [None]:
# Display the first rows of the loaded frame as the cell output.
df.head()

## Exploring the Data - Univariate Analysis

In [None]:
# Display pandas' summary statistics for the frame as the cell output.
df.describe()

In [None]:
# Print how many distinct values each column holds (a missing value counts once).
for column in df.columns:
    print(column, ":", df[column].nunique(dropna=False))

In [None]:
# Treat the identifier-like columns and the label as categoricals.
col = ['ip','app','device','os','channel','is_attributed']
df = df.astype({name: 'category' for name in col})

In [None]:
# Convert both timestamp columns to datetimes, then show the resulting dtypes.
for time_col in ('click_time', 'attributed_time'):
    df[time_col] = pd.to_datetime(df[time_col])

df.info()

In [None]:
# Number of distinct values for each click feature.
col = ['ip', 'app', 'device', 'os', 'channel']
cnt = [len(df[name].unique()) for name in col]

## Plotting on Barchart !

plt.figure(figsize=(12, 7))
ax = sns.barplot(x=col, y=cnt, log=True)
ax.set(xlabel='Feature', ylabel='log of unique count', title="Count within each feature")

# Write the exact count just above the top of each bar.
for bar, uni in zip(ax.patches, cnt):
    x_mid = bar.get_x() + bar.get_width() / 2.
    ax.text(x_mid, bar.get_height() + 10, uni, ha="center")
plt.show()

In [None]:
# Percentage of rows for each `is_attributed` value, drawn as a pie chart.
download_share = df['is_attributed'].value_counts(normalize=True) * 100
plt.pie(download_share, autopct='%1.2f%%')
plt.title("Plot of App Downloaded vs Not Downloaded")
plt.show()


In [None]:
# Click count per device value on a log-scaled bar chart.
device_counts = df['device'].value_counts()

plt.figure(figsize=(25, 10))
sns.barplot(x=device_counts.index, y=device_counts, log=True)
plt.title("Device Type for Click")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Download rate per IP plotted against click volume for the 300 busiest IPs.

# The label was cast to category earlier; it must be numeric to be aggregated.
df['is_attributed'] = df['is_attributed'].astype(int)

# The mean of a 0/1 flag is the fraction of clicks that converted. The median
# would collapse every IP to 0, 0.5 or 1 and is not a proportion.
prop = df[['ip', 'is_attributed']].groupby('ip', as_index=False).mean().sort_values('is_attributed', ascending=False)
counts = df[['ip', 'is_attributed']].groupby('ip', as_index=False).count().sort_values('is_attributed', ascending=False)

# Left-merge onto `counts` so rows stay ordered by click volume (busiest first).
merge = counts.merge(prop, on='ip', how='left')
merge.columns = ['ip', 'click_count', 'prop_downloaded']

ax = merge[:300].plot(secondary_y='prop_downloaded')
plt.title('Conversion Rates over Counts of 300 Most Popular IPs')
ax.set(ylabel='Count of clicks')
plt.ylabel('Proportion Downloaded')
plt.show()

In [None]:
# Download rate per channel plotted against click volume for the 100 busiest channels.
proportion = df[['channel', 'is_attributed']].groupby('channel', as_index=False).mean().sort_values('is_attributed', ascending=False)
counts = df[['channel', 'is_attributed']].groupby('channel', as_index=False).count().sort_values('is_attributed', ascending=False)

# Left-merge onto `counts` so rows stay ordered by click volume (busiest first).
merge = counts.merge(proportion, on='channel', how='left')
merge.columns = ['channel', 'click_count', 'prop_downloaded']

ax = merge[:100].plot(secondary_y='prop_downloaded')
# The grouping key is `channel`, so the title names channels rather than apps.
plt.title('Conversion Rates over Counts of 100 Most Popular Channels')
ax.set(ylabel='Count of clicks')
plt.ylabel('Proportion Downloaded')
# plt.show() renders the figure; a bare plt.plot() only adds an empty artist.
plt.show()


In [None]:
# Per-app download rate ('mean') and click volume ('count') on twin y-axes.
app_target = df.groupby('app')['is_attributed'].agg(['mean', 'count'])

ax = app_target.plot(secondary_y='mean')
ax.set(ylabel='Count of clicks')
plt.title('Conversion Rates over Counts of Most Popular Apps')
plt.ylabel('Proportion Downloaded')
plt.show()

## Feature Engineering

In [None]:
# Derive calendar features from the click timestamp.
df["datetime"] = pd.to_datetime(df["click_time"])

# New column name -> attribute of the `.dt` accessor it is read from.
calendar_parts = (
    ("day_of_week", "dayofweek"),
    ("day_of_year", "dayofyear"),
    ("month", "month"),
    ("hour", "hour"),
)
for feature, attr in calendar_parts:
    df[feature] = getattr(df["datetime"].dt, attr)

In [None]:
# Cast the identifier columns to plain integers.
for name in ('ip', 'app', 'device', 'os', 'channel'):
    df[name] = df[name].astype(int)

In [None]:
# Remove the timestamp columns from the frame.
time_columns = ["click_time", "datetime", "attributed_time"]
df = df.drop(columns=time_columns)

In [None]:
# Feature matrix: every remaining column except the label.
X = df.drop("is_attributed", axis=1)

# Label as a 1-D Series. scikit-learn estimators expect y of shape (n_samples,);
# a one-column DataFrame triggers a DataConversionWarning on fit.
Y = df["is_attributed"]

In [None]:
# Hold out 25% of the rows, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.25, stratify=Y, random_state=100)
x1, x2, y1, y2 = train_test_split(X, Y, **split_options)

In [None]:
# Print the label mean of the train split, then of the test split.
for target in (y1, y2):
    print(target.mean())

## AdaBoost

In [None]:
# Base estimator: a depth-2 decision tree.
tree = DecisionTreeClassifier(max_depth=2)

# AdaBoost built on that tree, fitted on the training split.
ada_options = dict(
    base_estimator=tree,
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME",
)
ada_model = AdaBoostClassifier(**ada_options)
ada_model.fit(x1, y1)

# Class probabilities for the hold-out split; show the first ten rows.
y_pred = ada_model.predict_proba(x2)

y_pred[:10]


In [None]:
# ROC AUC on the hold-out split, scored on the probability column at index 1.
ada_scores = y_pred[:, 1]
ROC_Score = metrics.roc_auc_score(y2, ada_scores)
print("ROC Score of AdaBoost Model: ", ROC_Score)

In [None]:
# Grid-search tree depth and ensemble size for AdaBoost, scored by ROC AUC.
parameter = {
    "base_estimator__max_depth": [2, 3],
    "n_estimators": [100, 300, 500],
}
fold = 3

tree = DecisionTreeClassifier()
adaboostmodel = AdaBoostClassifier(
    base_estimator=tree,
    learning_rate=0.9,
    algorithm="SAMME",
)

grid_search_cv = GridSearchCV(
    estimator=adaboostmodel,
    param_grid=parameter,
    scoring='roc_auc',
    cv=fold,
    return_train_score=True,
    verbose=1,
)
grid_search_cv.fit(x1, y1)

In [None]:
# Tabulate the AdaBoost grid-search results, one row per parameter combination.
ada_cv_result = pd.DataFrame.from_dict(grid_search_cv.cv_results_)
ada_cv_result

In [None]:
# Refit AdaBoost with a fixed configuration and score it on the hold-out split.
# NOTE(review): these settings are hard-coded rather than read from
# grid_search_cv, and learning_rate=0.5 differs from the 0.9 used in the search
# — confirm this is intended.
tree = DecisionTreeClassifier(max_depth=2)
ada_model1 = AdaBoostClassifier(
    base_estimator=tree,
    n_estimators=100,
    learning_rate=0.5,
    algorithm="SAMME",
)
ada_model1.fit(x1, y1)

y_pred1 = ada_model1.predict_proba(x2)
ROC_Score = metrics.roc_auc_score(y2, y_pred1[:, 1])
print("ROC Score of Hyperparameter Tunned AdaBoost Model: ", ROC_Score)

## XGBoost

In [None]:
# XGBoost with default settings, fitted on the training split.
XGB_model = XGBClassifier()
XGB_model.fit(x1, y1)

# Class probabilities for the hold-out split; show the first ten rows.
y_pred3 = XGB_model.predict_proba(x2)
y_pred3[:10]

In [None]:
# ROC AUC of the default XGBoost model, printed as a percentage.
ROC_Score = metrics.roc_auc_score(y2, y_pred3[:, 1])
roc_percent = ROC_Score * 100.0
print("ROC Score of XGBoost Model :%.2f%%" % roc_percent)

In [None]:
# Grid-search learning rate, subsample ratio, ensemble size and depth for
# XGBoost, scored by ROC AUC.
fold = 3
parameter = {
    "learning_rate": [0.1, 0.3, 0.5],
    "subsample": [0.3, 0.6, 0.8],
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [2, 3, 4],
}

xgb_model = XGBClassifier()
grid_xgb_model = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameter,
    scoring="roc_auc",
    cv=fold,
    return_train_score=True,
    verbose=0,
)

grid_xgb_model.fit(x1, y1)

In [None]:
# Tabulate the XGBoost grid-search results, one row per parameter combination.
cv_results = pd.DataFrame.from_dict(grid_xgb_model.cv_results_)
cv_results

In [None]:
# Refit XGBoost with a fixed configuration.
# NOTE(review): the values are hard-coded rather than read from
# grid_xgb_model — confirm they match the search outcome.
xgb_options = dict(
    max_depth=2,
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.6,
)
XGBC_model = XGBClassifier(**xgb_options)
XGBC_model.fit(x1, y1)

# Class probabilities for the hold-out split; show the first ten rows.
y_pred4 = XGBC_model.predict_proba(x2)
y_pred4[:10]

In [None]:
# ROC AUC of the fixed-configuration XGBoost model, printed as a percentage.
ROC_Score = metrics.roc_auc_score(y2, y_pred4[:, 1])
roc_percent = ROC_Score * 100.0
print("ROC Score of Hyperparameter Tunned XGBoost Model :%.2f%%" % roc_percent)

In [None]:
# Draw the ROC curve of the XGBoost model on the hold-out split.
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer RocCurveDisplay.from_estimator and fall back only on older releases
# that do not provide it yet.
roc_display = getattr(metrics, "RocCurveDisplay", None)
if roc_display is not None and hasattr(roc_display, "from_estimator"):
    roc_display.from_estimator(XGBC_model, x2, y2)
else:
    metrics.plot_roc_curve(XGBC_model, x2, y2)
plt.show()

In [None]:
# Bar chart of the XGBoost feature importances, one bar per training column.
# The model was fitted on x1, so its importances line up with x1.columns.
# Labelling the ticks with those names makes the chart readable by itself.
feature_names = list(x1.columns)
bar_positions = range(len(XGBC_model.feature_importances_))

plt.bar(bar_positions, XGBC_model.feature_importances_)
plt.xticks(bar_positions, feature_names, rotation=45)
plt.ylabel("Importance")
plt.title("XGBoost feature importance")
plt.show()

In [None]:
# Map each training column name to its importance in the fitted XGBoost model.
importance = {
    name: score
    for name, score in zip(x1.columns, XGBC_model.feature_importances_)
}
importance

In [None]:
# Read the competition test set and report its (rows, columns) shape.
test_path = "../input/talkingdata-adtracking-fraud-detection/test.csv"
test = pd.read_csv(test_path)
print("Count of rows and column are: ", test.shape)

In [None]:
# Apply the same calendar features and integer casts used on the training frame.
test["datetime"] = pd.to_datetime(test["click_time"])

# New column name -> attribute of the `.dt` accessor it is read from.
calendar_parts = (
    ("day_of_week", "dayofweek"),
    ("day_of_year", "dayofyear"),
    ("month", "month"),
    ("hour", "hour"),
)
for feature, attr in calendar_parts:
    test[feature] = getattr(test["datetime"].dt, attr)

for name in ('ip', 'app', 'device', 'os', 'channel'):
    test[name] = test[name].astype(int)

# Model input: drop the timestamps and the row identifier. `test` keeps click_id.
test_df = test.drop(columns=["click_time", "datetime", "click_id"])

test_df.head()

#### The XGBoost model gave the best score, so we use it to predict on the test set.

In [None]:
# Score the test set with the XGBoost model and pair each click_id with the
# probability column at index 1.
final_y_ada = XGBC_model.predict_proba(test_df)
sub1 = pd.DataFrame({
    'click_id': test['click_id'],
    'is_attributed': final_y_ada[:, 1],
})
sub1.head()