In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Sanity check: list the training CSV so a missing input is obvious up front.
from subprocess import check_output

listing = check_output(["ls", "../input/adclickstrain/train/train.csv"])
print(listing.decode("utf8"))

**Reading Data**

In [None]:
# Stream the CSV in 10,000-row chunks and keep a random 50% of each chunk,
# so the whole file never has to be held in memory at once.
# The split is seeded per chunk so that Restart & Run All reproduces the
# exact same sample (the unseeded split produced different rows on every run).
SAMPLE_SEED = 42

train = []
chunk_reader = pd.read_csv("../input/adclickstrain/train/train.csv", chunksize=10000)
for chunk_no, chunk in enumerate(tqdm(chunk_reader)):
    # A different (but deterministic) seed per chunk avoids reusing the same
    # permutation for every chunk.
    sampled_rows, _ = train_test_split(
        chunk, test_size=0.50, random_state=SAMPLE_SEED + chunk_no
    )
    train.append(sampled_rows)

In [None]:
# Stack the per-chunk samples row-wise into a single frame and report its size.
x_train = pd.concat(train, axis=0)
x_train.shape

**EDA and Data Preprocessing**

In [None]:
# Frequency of each value in the 'click' column (class balance of the target).
x_train['click'].value_counts()

In [None]:
# Preview the first rows of the sampled training frame.
x_train.head()

In [None]:
# True if at least one cell anywhere in the frame is missing.
x_train.isna().values.any()

In [None]:
# Tabulate the number of missing values per column, keeping only columns
# that actually have gaps, ordered from fewest to most missing.
missing_df = (
    x_train.isna()
    .sum(axis=0)
    .rename_axis('column_name')
    .reset_index(name='missing_count')
    .query('missing_count > 0')
    .sort_values(by='missing_count')
)
print(missing_df)

In [None]:
# Horizontal bar chart of the per-column missing-value counts in missing_df.
ind = np.arange(len(missing_df))
width = 0.9
fig, ax = plt.subplots(figsize=(8, 6))
rects = ax.barh(ind, missing_df['missing_count'].values, color='blue')
# One tick per bar, labelled with the column name.
ax.set_yticks(ind)
ax.set_yticklabels(missing_df['column_name'].values, rotation='horizontal')
ax.set(
    xlabel="Count of missing values",
    title="Number of missing values in each column",
)
plt.show()

In [None]:
# Columns that are label-encoded to integer codes further down.
cat_cols = ['countrycode','browserid','devid']

In [None]:
# Label-encode the categorical columns in place.
#
# Missing values are imputed *before* encoding. The notebook fills
# 'browserid' with "Edge" and 'devid' with "Mobile" in later cells, but by
# then the columns already hold integer codes with no NaN left, so those
# fills never took effect and NaN was silently encoded as its own class.
CAT_FILL_VALUES = {'browserid': 'Edge', 'devid': 'Mobile'}

for col in cat_cols:
    col_values = x_train[col]
    if col in CAT_FILL_VALUES:
        col_values = col_values.fillna(CAT_FILL_VALUES[col])
    lbl = LabelEncoder()
    # Passing a plain list keeps the encoder's input handling the same as before.
    x_train[col] = lbl.fit_transform(list(col_values.values))

In [None]:
# Summary statistics of 'siteid' (mean, median, mode), then a preview of the frame.
siteid_col = x_train['siteid']
print(siteid_col.mean())
print(siteid_col.median())
train_mode = siteid_col.mode()
print(train_mode)
x_train.head()

In [None]:
# Impute missing 'siteid' values with a fixed constant.
# NOTE(review): the constant is hard-coded; presumably it was read off the
# statistics printed in the previous cell for one particular sample - confirm.
SITEID_FILL_VALUE = 8896401.0

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is not guaranteed to update x_train
# (it does not under pandas Copy-on-Write).
x_train['siteid'] = x_train['siteid'].fillna(SITEID_FILL_VALUE)

In [None]:
# Count missing vs. present 'siteid' values to verify the imputation.
x_train['siteid'].isna().value_counts()

In [None]:
# Fill missing 'browserid' values with "Edge".
# Assign back rather than using fillna(inplace=True) on a column selection,
# which is not guaranteed to modify x_train (pandas Copy-on-Write).
# NOTE(review): 'browserid' is label-encoded to integer codes in an earlier
# cell, so no NaN should remain at this point; for this fill to matter it
# has to happen before the encoding step.
x_train['browserid'] = x_train['browserid'].fillna("Edge")

In [None]:
# Fill missing 'devid' values with "Mobile".
# Assign back rather than using fillna(inplace=True) on a column selection,
# which is not guaranteed to modify x_train (pandas Copy-on-Write).
# NOTE(review): 'devid' is label-encoded to integer codes in an earlier
# cell, so no NaN should remain at this point; for this fill to matter it
# has to happen before the encoding step.
x_train['devid'] = x_train['devid'].fillna("Mobile")

In [None]:
# Preview the frame after the encoding and imputation cells.
x_train.head()

In [None]:
# Correlation heatmap of the numeric columns of x_train.
sns.set(style="white")

# Restrict to numeric columns explicitly: the frame still contains string
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently dropping them.
corr = x_train.select_dtypes(include="number").corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with square cells; colours saturate at |r| = 0.3
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.show()

**Mean Encoding**

In [None]:
# Occurrence count of every distinct siteid, as a two-column frame
# ("siteid", "count") ordered by descending count.
siteid = (
    x_train["siteid"]
    .value_counts()
    .rename_axis("siteid")
    .reset_index(name="count")
)
siteid.head()

In [None]:
#plt.hist(siteid["count"], bins = 10)
#plt.show()

In [None]:
# Collapse rarely seen site ids into a single "others" bucket.
# A siteid is kept only if it occurs MORE than MIN_SITEID_COUNT times;
# everything else (including ids with exactly that count) becomes "others".
MIN_SITEID_COUNT = 50

print(siteid.shape)
frequent_siteid = siteid[siteid["count"] > MIN_SITEID_COUNT]
print(frequent_siteid.shape)
req_siteid = frequent_siteid["siteid"].values

# Vectorised membership test instead of `x in ndarray` inside apply(), which
# rescans the whole array for every row. The object cast keeps the column
# able to hold both the numeric ids and the "others" label.
is_frequent_siteid = x_train["siteid"].isin(req_siteid)
x_train["siteid"] = x_train["siteid"].astype(object).where(is_frequent_siteid, "others")


In [None]:
# Mean encoding: average 'click' per siteid bucket, joined back onto every row
# as the new column "mean_siteid_enc".
siteid_mean_enc = (
    x_train.groupby("siteid")["click"]
    .mean()
    .rename("mean_siteid_enc")
    .reset_index()
)
x_train = x_train.merge(siteid_mean_enc, how="left", on="siteid")

In [None]:
# Same treatment as siteid: count occurrences of each offerid, collapse the
# rare ones into "others_offer", then compute the per-bucket mean of 'click'.
# An offerid is kept only if it occurs MORE than MIN_OFFERID_COUNT times.
MIN_OFFERID_COUNT = 50

offerid = x_train["offerid"].value_counts().reset_index()
offerid.columns = ["offerid", "count"]
print(offerid.shape)
frequent_offerid = offerid[offerid["count"] > MIN_OFFERID_COUNT]
print(frequent_offerid.shape)
req_offerid = frequent_offerid["offerid"].values

# Vectorised membership test instead of `x in ndarray` inside apply(), which
# rescans the whole array for every row. The object cast keeps the column
# able to hold both the numeric ids and the "others_offer" label.
is_frequent_offerid = x_train["offerid"].isin(req_offerid)
x_train["offerid"] = x_train["offerid"].astype(object).where(is_frequent_offerid, "others_offer")

# Mean of 'click' per offerid bucket; merged onto x_train in a later cell.
offerid_mean_enc = x_train.groupby(["offerid"])["click"].mean().reset_index()
offerid_mean_enc.columns = ["offerid", "mean_offerid_enc"]


In [None]:
# Preview the frame after offerid bucketing, before the offerid mean encoding is merged in.
x_train.head()

In [None]:
# Attach the per-offerid mean of 'click' to every row as "mean_offerid_enc".
x_train = x_train.merge(offerid_mean_enc, how="left", on="offerid")

In [None]:
# Preview the frame with the offerid mean-encoding column added.
x_train.head()

**Splitting the datetime column**

In [None]:
# Parse the 'datetime' column into pandas timestamps so the .dt accessors work.
parsed_datetime = pd.to_datetime(x_train['datetime'])
x_train['datetime'] = parsed_datetime

In [None]:
# Day of week as an integer (Monday=0 ... Sunday=6).
x_train['day'] = x_train['datetime'].dt.dayofweek

In [None]:
# Hour of day extracted from the timestamp.
hour_of_day = x_train['datetime'].dt.hour
x_train['hour'] = hour_of_day

In [None]:
# Minute within the hour extracted from the timestamp.
minute_of_hour = x_train['datetime'].dt.minute
x_train['minute'] = minute_of_hour

In [None]:
# Remaining missing values per column after preprocessing.
x_train.isna().sum(axis=0)

In [None]:
# Show the dtype of every column in the processed frame.
x_train.dtypes

In [None]:
# Correlation heatmap of the numeric columns after feature engineering.
sns.set(style="white")

# Restrict to numeric columns explicitly: 'siteid' and 'offerid' now hold the
# string buckets "others"/"others_offer", and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently dropping them.
corr = x_train.select_dtypes(include="number").corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with square cells; colours saturate at |r| = 0.3
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.show()

In [None]:
# Feature columns for the models: every column of x_train except identifiers,
# the raw/bucketed id columns, the raw timestamp, the target, and the
# features deliberately left out.
EXCLUDED_COLS = {'ID', 'siteid', 'datetime', 'click', 'offerid',
                 'category', 'day', 'minute', 'merchant'}

# Keep the frame's own column order. The previous set difference produced an
# arbitrary order that can change between kernel sessions, which makes the
# feature matrix (and therefore the fitted models) non-reproducible.
cols_to_use = [col for col in x_train.columns if col not in EXCLUDED_COLS]

In [None]:
# Display the selected feature column names.
cols_to_use

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# 50/50 train/test split for the logistic-regression experiment.
# Seeded like the XGBoost and random-forest splits so the reported metrics
# are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_train[cols_to_use], x_train['click'], test_size=0.5, random_state=42
)
logistic_regressor = LogisticRegression()

In [None]:
# Two-stage model: polynomial feature expansion followed by logistic regression.
pipeline_steps = [
    ('add_poly_features', PolynomialFeatures()),
    ('logistic_regression', logistic_regressor),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit on the training half, predict labels for the held-out half, report accuracy.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(lr_accuracy)

In [None]:
from sklearn import metrics

# ROC/AUC must be computed from continuous scores, not hard 0/1 predictions:
# with predicted labels the "curve" has a single operating point and the AUC
# understates the model's ranking quality. Column 1 of predict_proba is the
# probability of the positive class (label 1).
y_score = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# F1, precision and recall of the logistic-regression predictions, in that order.
for metric_fn in (f1_score, precision_score, recall_score):
    print(metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic-regression predictions on the held-out half.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# ROC curve of the logistic-regression model against the chance diagonal.
roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.legend(loc='lower right')
# Dashed red diagonal = performance of a random classifier.
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([-0.1, 1.2])
roc_ax.set_ylim([-0.1, 1.2])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

**XGBoost**

In [None]:
# Stratified, seeded 50/50 split for the XGBoost experiment.
target = x_train['click']
XG_train, XG_test, yG_train, yG_test = train_test_split(
    x_train[cols_to_use],
    target,
    test_size=0.50,
    random_state=42,
    stratify=target,
)

In [None]:
# Gradient-boosted trees with library-default hyperparameters.
model = XGBClassifier()
model.fit(X=XG_train, y=yG_train)

In [None]:
# Predict on the held-out half; each prediction is additionally rounded
# into the `predictions` list used for the accuracy computation.
yG_pred = model.predict(XG_test)
predictions = list(map(round, yG_pred))

In [None]:
# Accuracy of the XGBoost predictions, printed as a percentage.
accuracy = accuracy_score(yG_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Confusion matrix of the XGBoost predictions on the held-out half.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=yG_test, y_pred=yG_pred)
print(cm)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# F1, precision and recall of the XGBoost predictions.
# Precision and recall previously used y_test / y_pred, i.e. they re-reported
# the logistic-regression numbers instead of evaluating XGBoost.
print(f1_score(yG_test, yG_pred))
print(precision_score(yG_test, yG_pred))
print(recall_score(yG_test, yG_pred))

In [None]:
from sklearn import metrics

# Build the ROC curve from predicted probabilities of the positive class
# (column 1 of predict_proba) rather than hard 0/1 labels, which would give
# a single-threshold curve and an understated AUC.
yG_score = model.predict_proba(XG_test)[:, 1]
Gfpr, Gtpr, thresholds = metrics.roc_curve(yG_test, yG_score, pos_label=1)
roc_aucG = metrics.auc(Gfpr, Gtpr)
print(roc_aucG)

**Random Forest model**


In [None]:
# Seeded 50/50 split for the random-forest experiment (not stratified).
rf_features = x_train[cols_to_use]
rf_labels = x_train['click']
XR_train, XR_test, yR_train, yR_test = train_test_split(
    rf_features, rf_labels, test_size=0.50, random_state=42
)


In [None]:
# Random forest: 100 trees, at least 10 samples per leaf, fixed seed, 10 worker jobs.
rf_params = dict(n_estimators=100, n_jobs=10, random_state=0, min_samples_leaf=10)
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X=XR_train, y=yR_train)

In [None]:
# Accuracy of the random forest on the held-out half, via the estimator's score().
score_rf = rf_clf.score(XR_test, yR_test)
print("Testing score: %.2f " % (score_rf,))

In [None]:
# Predicted labels for the held-out half and their accuracy.
yR_pred = rf_clf.predict(XR_test)
rf_accuracy = accuracy_score(yR_test, yR_pred)
print(rf_accuracy)

In [None]:
from sklearn import metrics

# Build the ROC curve from predicted probabilities of the positive class
# (column 1 of predict_proba) rather than hard 0/1 labels, which would give
# a single-threshold curve and an understated AUC.
yR_score = rf_clf.predict_proba(XR_test)[:, 1]
Rfpr, Rtpr, thresholds = metrics.roc_curve(yR_test, yR_score, pos_label=1)
roc_aucR = metrics.auc(Rfpr, Rtpr)
print(roc_aucR)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# F1 score and confusion matrix of the random-forest predictions.
# The score is stored under its own name: assigning it to `f1_score` rebinds
# the imported function to a float, so any later f1_score(...) call fails
# unless the function is imported again.
rf_f1 = f1_score(yR_test, yR_pred)
print(rf_f1)
confusion_matrix(yR_test, yR_pred)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# F1, precision and recall of the random-forest predictions, in that order.
for metric_fn in (f1_score, precision_score, recall_score):
    print(metric_fn(yR_test, yR_pred))

**AUC-ROC score comparison**

In [None]:
# Overlay the ROC curves of the three models:
# blue = logistic regression, green = XGBoost, yellow = random forest.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)
plt.plot(Gfpr, Gtpr, 'g',label='XGAUC = %0.2f'% roc_aucG)
# The random-forest curve must use its own rates (Rfpr/Rtpr); it previously
# re-plotted the XGBoost curve under the random-forest label.
plt.plot(Rfpr, Rtpr, 'y',label='RAUC = %0.2f'% roc_aucR)
plt.legend(loc='lower right')
# Dashed red diagonal = performance of a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()