 # AD Click Prediction

In [None]:
## importing libraries ##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV , train_test_split
from tqdm import tqdm_notebook
import warnings
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gc
import featuretools as ft
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Training impressions; DateTime is parsed up front so the .dt accessors work later.
data = pd.read_csv(
    '../input/train.csv',
    parse_dates=['DateTime'],
)

In [None]:
# Test split, read with the same DateTime parsing as the training file.
test = pd.read_csv(
    '../input/test.csv',
    parse_dates=['DateTime'],
)

In [None]:
# Historical user logs; DateTime is parsed so it can serve as a time index later.
hist = pd.read_csv(
    '../input/historical_user_logs.csv',
    parse_dates=['DateTime'],
)

In [None]:
data.head()

In [None]:
hist.head()

In [None]:
data.info()

In [None]:
sns.heatmap(data.isnull())

### Data Imputation

In [None]:
# Data imputation.
# product_category_2 is dropped outright instead of being imputed.
data = data.drop(columns='product_category_2')

# The remaining columns with gaps are forward-filled: every missing value takes
# the last observed value above it in file order. A missing value in the very
# first row has nothing above it and stays NaN.
# .ffill() replaces fillna(method='ffill'), which newer pandas deprecates.
ffill_columns = [
    'city_development_index',
    'gender',
    'user_group_id',
    'age_level',
    'user_depth',
]
data[ffill_columns] = data[ffill_columns].ffill()

In [None]:
data.info()

### Data Visualization

In [None]:
# Total clicks per timestamp, re-bucketed into hourly sums. `day` is reused by
# the next cell, so the name is kept.
day = data.groupby('DateTime')['is_click'].sum().resample('H').sum()

plt.figure(figsize=(20, 5))
day.plot(kind='bar', grid=None)

#### Visualizing the trends in the data by setting granularity to per hour on daily basis.

In [None]:
# Zoom into the hourly series for 2017-07-02 through 2017-07-03 (label slice,
# both ends inclusive).
part_day = day.loc['2017-07-02':'2017-07-03']

plt.figure(figsize=(20, 5))
part_day.plot(kind='bar', grid=None)

#### Portion of the above plot for 2 days of user data.

In [None]:
# Work on a copy so the weekday helper column does not leak into `data`.
data1 = data.reset_index()
data1['weekday'] = data1['DateTime'].dt.day_name()

# One row per weekday holding the total number of clicks on that day.
byday = data1.groupby('weekday')['is_click'].sum().reset_index()

plt.figure(figsize=(20, 5))
sns.barplot(x='weekday', y='is_click', data=byday)

#### Visualizing the user behavior on weekday basis. It seems that most of the clicks are for MONDAY & SUNDAY.

In [None]:
# Total clicks for every (gender, product) pair, flattened to a plain frame
# with columns gender / product / is_click.
user = data.groupby(['gender', 'product'])['is_click'].sum().reset_index()

plt.figure(figsize=(20, 5))
sns.barplot(x='product', y='is_click', hue='gender', data=user, palette='Set1')

#### Visualizing data for different products for male and female user groups.

In [None]:
# For every product, find the single campaign that produced the most clicks.
n_data = data.reset_index()

# Total clicks for each (campaign, product) pair.
clicks = n_data.groupby(['campaign_id', 'product'])['is_click'].sum().reset_index()

# Pick the whole row holding each product's maximum click total.
# groupby(...).max() would take the maximum of campaign_id and of is_click
# independently, pairing the click total with the largest campaign id rather
# than with the campaign that actually achieved it.
best_rows = clicks.groupby('product')['is_click'].idxmax()
campaign = clicks.loc[best_rows, ['product', 'campaign_id', 'is_click']]
campaign = campaign.sort_values('is_click', ascending=False).reset_index(drop=True)
campaign.columns = ['product', 'campaign_id', 'max click in any campaign']

plt.figure(figsize=(15, 5))
sns.barplot(y='product', x='max click in any campaign', palette='Set1', data=campaign, orient='h')

#### Bar plot showing, for each product, the highest number of clicks it received in any single campaign.

In [None]:
# For every campaign, find its most-clicked product and that product's click total.
n_data = data.reset_index()

# Total clicks for each (campaign, product) pair.
clicks = n_data.groupby(['campaign_id', 'product'])['is_click'].sum().reset_index()

# Select the row with the highest click total inside each campaign.
# groupby(...).max() would instead return the alphabetically last product name
# next to the maximum click count, which need not belong to the same row.
best_rows = clicks.groupby('campaign_id')['is_click'].idxmax()
campaign = clicks.loc[best_rows].set_index('campaign_id')[['product', 'is_click']]
campaign.sort_values('is_click', ascending=False)

#### Table showing, for each campaign, its most successful product and the number of clicks that product received.

In [None]:
# Row counts per user group, split by gender.
plt.figure(figsize=(20, 5))
sns.countplot(
    data=data,
    x='user_group_id',
    hue='gender',
    palette='Set1',
)

#### This visualization shows that user groups 0-6 are all male and user groups 7-12 are all female.

In [None]:
# Click-through rate (in %) per user group, drawn as horizontal bars with the
# best-performing group on top.
plt.figure(figsize=(15, 5))
user_group = data.groupby('user_group_id')['is_click'].agg(['count', 'sum'])
user_group['%success'] = round((user_group['sum'] * 100) / user_group['count'], 2)
user_group = user_group.reset_index()

# `order` must list the categorical levels (the user_group_id values), not the
# bar lengths; rank the groups by success rate to get the intended ordering.
ranked_groups = user_group.sort_values('%success', ascending=False)['user_group_id'].tolist()

# Both columns are numeric, so the orientation is stated explicitly to make
# user_group_id the categorical axis.
sns.barplot(
    y='user_group_id',
    x='%success',
    data=user_group,
    palette='Set1',
    order=ranked_groups,
    orient='h',
)

#### Success % by user group. The most successful user group is 12.

In [None]:
# Row counts per product, split into clicked and not clicked.
plt.figure(figsize=(15, 3))
sns.countplot(
    data=data,
    x="product",
    hue="is_click",
    palette='Set1',
)

#### Visualizing count of clicks and non clicks for each of the product.

In [None]:
# Row counts per product, split by product_category_1.
plt.figure(figsize=(15, 5))
sns.countplot(
    data=data,
    x="product",
    hue="product_category_1",
    palette='Set1',
)

#### Performance of all the products compared category wise.

In [None]:
# Click / no-click counts for every user_depth level, plus the click rate in %.
# The earlier column-subset assignment was overwritten immediately and never
# used, so the table is built in one step.
# fill_value=0 keeps a depth level that lacks one of the two classes from
# turning its success rate into NaN.
data1 = data.groupby(['user_depth', 'is_click']).size().unstack(fill_value=0)
data1['success %'] = round(data1[1] * 100 / (data1[1] + data1[0]), 2)
data1

In [None]:
# Class balance of the target.
click_counts = data['is_click'].value_counts()
print(click_counts)

# Percentage of rows that were clicked, derived from the data itself instead of
# hand-copied counts.
# NOTE(review): the previous hard-coded denominator looks like the non-click
# count rather than the total row count - confirm against the counts printed above.
print(round(click_counts.get(1, 0) * 100 / click_counts.sum(), 2))

### Clicks account for only about 7% of the impressions, so our dataset is heavily imbalanced.

### Feature Engineering & Data Preprocessing

In [None]:
# Break the timestamp into weekday name, hour and minute, then drop the raw
# timestamp together with session_id.
timestamps = data['DateTime']
data = data.assign(
    weekday=timestamps.dt.day_name(),
    hour=timestamps.dt.hour,
    minutes=timestamps.dt.minute,
)
data = data.drop(columns=['DateTime', 'session_id'])
data.head()

In [None]:
es1 = ft.EntitySet()

In [None]:
# Register the historical logs as entity 'hist': a new 'id' index column is
# created (make_index=True), DateTime is the time index, and user_id is typed
# as categorical.
es1 = es1.entity_from_dataframe(
    entity_id='hist',
    dataframe=hist,
    make_index=True,
    index='id',
    time_index='DateTime',
    variable_types={'user_id': ft.variable_types.Categorical},
)
                                       

In [None]:
es1['hist'].variables

In [None]:
# Columns that hold codes or labels and must be treated as categories instead
# of numbers. The target is_click is included.
categorical_columns = [
    'user_id',
    'webpage_id',
    'campaign_id',
    'product_category_1',
    'user_group_id',
    'age_level',
    'user_depth',
    'city_development_index',
    'var_1',
    'is_click',
]

# Register the training frame as entity 'data' with a newly created 'id' index.
es1 = es1.entity_from_dataframe(
    entity_id='data',
    dataframe=data,
    make_index=True,
    index='id',
    variable_types={
        column: ft.variable_types.Categorical for column in categorical_columns
    },
)

In [None]:
es1['data'].variables

In [None]:
# Relate the two entities through their 'id' columns; 'data' is passed first
# (the parent side), 'hist' second (the child side).
# NOTE(review): both 'id' columns are created by make_index=True when the
# entities are registered, so they look like generated row indices and not a
# shared business key. Presumably the logs should be linked through user_id -
# confirm, and note that changing the key will rename the aggregated feature
# columns produced by the later ft.dfs call.
relation = ft.Relationship(es1['data']['id'], es1['hist']['id'])

In [None]:
es1 = es1.add_relationship(relation)
es1

In [None]:
# Deep feature synthesis with 'data' as the target entity, stacking primitives
# at most two levels deep. Returns the feature matrix and its feature definitions.
features, feature_names = ft.dfs(
    entityset=es1,
    target_entity='data',
    max_depth=2,
)

In [None]:
features.info()

In [None]:
# Columns to one-hot encode: the raw categoricals plus the two MODE(...)
# aggregates taken from the 'hist' entity.
col = [
    'product',
    'gender',
    'weekday',
    'webpage_id',
    'campaign_id',
    'product_category_1',
    'user_group_id',
    'age_level',
    'user_depth',
    'var_1',
    'city_development_index',
    'MODE(hist.product)',
    'MODE(hist.action)',
]
# drop_first=True drops one dummy level per encoded column.
new_data = pd.get_dummies(features, columns=col, drop_first=True)

In [None]:
new_data.info()

In [None]:
# Split the encoded frame into the target column and everything else.
y = new_data['is_click']
X = new_data.drop(columns='is_click')

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split, stratified on the target so both parts keep the same
# click ratio; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=101,
)

In [None]:
train = pd.concat([X_train, y_train], axis = 1)

In [None]:
# Separate the training rows by class so each class can be sampled on its own.
click_flag = train['is_click']
t_0 = train[click_flag == 0]
t_1 = train[click_flag == 1]

#### Working with a subset of Training Data

In [None]:
# Draw a fixed-size sample from each class (27573 non-clicks, 2000 clicks),
# stack them, then shuffle the rows. All three steps use the same seed.
t0_sub = t_0.sample(n=27573, random_state=101)
t1_sub = t_1.sample(n=2000, random_state=101)
train_sub = pd.concat([t0_sub, t1_sub], axis=0).sample(frac=1, random_state=101)

In [None]:
# Features and target of the sampled training subset.
ytr_sub = train_sub['is_click']
Xtr_sub = train_sub.drop(columns='is_click')

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the sampled training subset with SMOTE, seeded for repeatability.
sm = SMOTE(random_state=101)
# fit_resample is the supported entry point; the older fit_sample alias has
# been removed from imbalanced-learn.
X_sub, y_sub = sm.fit_resample(Xtr_sub, ytr_sub)

In [None]:
X_sub.shape, y_sub.shape

#### Feature Selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive feature elimination around a default logistic regression, keeping
# 75 features. `fit` is the fitted selector and is reused by later cells.
model = LogisticRegression()
rfe = RFE(
    estimator=model,
    n_features_to_select=75,
    verbose=1,
)
fit = rfe.fit(X_sub, y_sub)

In [None]:
sum(fit.support_)

In [None]:
# Re-attach the column names to the resampled features, keep only the columns
# the selector marked in fit.support_, and convert back to a plain array.
X_sub_frame = pd.DataFrame(X_sub, columns=Xtr_sub.columns)
X_sub = np.array(X_sub_frame.loc[:, fit.support_])
X_sub.shape

In [None]:
type(X_sub)

In [None]:
# Hard class predictions of the fitted selector on the hold-out split, followed
# by the per-class precision / recall / F1 report.
y_pred = fit.predict(X_test)
matrix = classification_report(y_test, y_pred)
print(matrix)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
# ROC-AUC is a ranking metric, so score it with the predicted probability of
# the positive class. Passing hard 0/1 predictions collapses the curve to a
# single operating point and gives a different, less informative number than
# any figure computed that way.
roc_auc_score(y_test, fit.predict_proba(X_test)[:, 1])

#### 75 features seem to be optimal.

#### Logistic Regression is our baseline model, with a ROC-AUC score of 0.55.

### Data Modelling

In [None]:
def best_model(estimator, grid, refit_score, scorer):
    """Grid-search `estimator`, report hold-out performance and return the search.

    The function relies on notebook globals that must exist before it is
    called: `skf` (the CV splitter), `X_sub` / `y_sub` (training data) and
    `X_test` / `y_test` (hold-out data with the same columns as `X_sub`).

    Parameters
    ----------
    estimator : scikit-learn estimator to tune.
    grid : dict passed to GridSearchCV as `param_grid`.
    refit_score : name of the metric used to pick and refit the best model.
    scorer : scoring specification passed to GridSearchCV as `scoring`.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    grid_search = GridSearchCV(
        estimator,
        param_grid=grid,
        scoring=scorer,
        refit=refit_score,
        cv=skf,
        n_jobs=-1,
    )
    grid_search.fit(X_sub, y_sub)

    pred = grid_search.predict(X_test)

    # ROC-AUC needs a ranking score; use the positive-class probability when
    # the refitted model provides one and fall back to hard labels otherwise.
    if hasattr(grid_search, 'predict_proba'):
        auc_scores = grid_search.predict_proba(X_test)[:, 1]
    else:
        auc_scores = pred

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    print(pd.DataFrame(confusion_matrix(y_test, pred), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))

    print('roc-auc : %0.2f' % roc_auc_score(y_test, auc_scores))
    return grid_search

In [None]:
# random_state only has an effect when shuffling is enabled; without
# shuffle=True older scikit-learn ignores the seed and newer versions raise
# ValueError.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)
scorers = ['recall']

# Reduce the hold-out features to the RFE-selected columns so they line up with
# X_sub. The guard makes the cell safe to re-run: after the first run X_test is
# already a reduced array and has no .loc accessor.
if isinstance(X_test, pd.DataFrame):
    X_test = np.array(X_test.loc[:, fit.support_])

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seeded like the rest of the notebook: the grid below sets max_features under
# the feature count, so each split considers a random subset of features and an
# unseeded tree would differ between runs.
dctree = DecisionTreeClassifier(random_state=101)

In [None]:
# Search space for the decision tree: 2 criteria x 3 depths x 3 max_features
# values, with min_samples_split fixed at 2.
para_grid = dict(
    criterion=['entropy', 'gini'],
    min_samples_split=[2],
    max_depth=[30, 35, 40],
    max_features=[20, 25, 27],
)

In [None]:
best_model(estimator= dctree, grid= para_grid, refit_score= 'recall', scorer= scorers)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded like the rest of the notebook so the bootstrap samples and feature
# subsets, and therefore the reported scores, are repeatable.
rf = RandomForestClassifier(random_state=101)

In [None]:
# Search space for the random forest: only max_features varies (40 or 45); the
# other settings are fixed at a single value.
para_grid = dict(
    min_samples_split=[2],
    n_estimators=[300],
    max_depth=[25],
    max_features=[40, 45],
)

In [None]:
best_model(estimator= rf, grid= para_grid, refit_score= 'recall', scorer= scorers)

#### Thus, we can see that a simple model sometimes gives better results than a more complex algorithm.