In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
df = pd.read_csv('../input/advertising.csv') # load the data

In [3]:
df.head()

In [4]:
# Featuretools is a framework to perform automated feature engineering. 
# It excels at transforming transactional and relational datasets into feature matrices for machine learning.
import featuretools as ft 

In [5]:
# Check if there is any missing values
df.isnull().sum()

In [6]:
# Explore a bit about the count, mean, standard deviation, minimum and maximum values and the quantiles of the data
df.describe()

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('classic')
%matplotlib inline

In [8]:
df.info()

In [9]:
len(df['City'].unique())

In [10]:
len(df['Country'].unique())

There are 969 different cities in 1000 instances in the data, we need to consider the usefulness of this variable and if there is a need to use it. For country, there are 237 unique. 

In [11]:
df['Clicked on Ad'].value_counts()

In [12]:
g0 = sns.countplot(x='Clicked on Ad', data = df, palette='husl')

Looks like 50/50 click or no click, a perfect sythetic data set. 

In [13]:
df.groupby('Clicked on Ad').mean()

Observations: 
* The one who clicked spend a shorter amount of time on Site, is older in age (40), has lower incoem and shorter Daily Internet Usage.

Let us explore more categorical means for other variables such as Country, Male or female

In [14]:
df.groupby('Country').mean()

In [15]:
df.groupby('Country')['Clicked on Ad'].mean().nlargest(30)

In [16]:
df_1 = df.groupby('Country').mean()
g1 = sns.pairplot(df_1, palette="husl", kind="reg")

We focus on if any of these numeric variables having linear relationship with the percentage of click rate in each country (the last row).
* Higher Daily Time Spent on Site, lower % click. 
* Higher Age, higher % click
* Higher the income, lower % click
* Higher Daily Internet Usage,, lower % click. 

In [17]:
g = sns.pairplot(df, hue="Clicked on Ad", palette="husl")

In [18]:
# Convert Timestamp column as a datetime object
# http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-17.html
df2 = df.copy()
df2['Timestamp'] = pd.to_datetime(df2["Timestamp"] )
df2.info()

In [19]:
df2["month"] = df2['Timestamp'].dt.month
df2["day"] = df2['Timestamp'].dt.day
df2["dayofweek"] = df2['Timestamp'].dt.dayofweek
df2["hour"] = df2['Timestamp'].dt.hour
df2.head()

In [20]:
g2 = sns.pairplot(df2[['Clicked on Ad', 'month', 'day', 'dayofweek', 'hour']], hue="Clicked on Ad", palette="husl")


Observations: By looking at the diagonal bar plots, we can see that there are no clear indication of month, day, day of week, hour showing effects on the click outcome. Overall, we can see no seasonality effetcs and this data set is probably created to be easier to analyze. We can bucketize some of these variables, such as hours.

## Based on the above observation, we can use logistic regression and random forest for this classification problem. 
Let us try create a base model using data with only numeric conlumns containing ["Daily Time Spent on Site", "Age", "Area Income", "Daily Internet Usage", "Male"]

(1) logistic regression: 
Logistic Regression Assumptions
* Binary logistic regression requires the dependent variable to be binary.
* For a binary regression, the factor level 1 of the dependent variable should represent the desired outcome.
* Only the meaningful variables should be included.
* The independent variables should be independent of each other. That is, the model should have little or no multicollinearity.
* The independent variables are linearly related to the log odds.
* Logistic regression requires quite large sample sizes.



In [21]:
df3 = df2.copy()
df3 = pd.concat([df3, pd.get_dummies(df3['Country'], prefix='Country')],axis=1)
df3 = pd.concat([df3, pd.get_dummies(df3['month'], prefix='Month')],axis=1)
df3 = pd.concat([df3, pd.get_dummies(df3['dayofweek'], prefix='Dayofweek')],axis=1)
# create a bucket for hours into [0-5, 6-11, 12-17, 18-23] hour
df3['Hour_bin'] = pd.cut(df3['hour'], [0, 5, 11, 17, 23], labels=['hour_0-5', 'hour_6-11', 'hour_12-17', 'hour_18-23'], include_lowest=True)
df3 = pd.concat([df3, pd.get_dummies(df3['Hour_bin'], prefix='Hour')],axis=1)
df3.drop(['Country', 'Ad Topic Line', 'City', 'Timestamp', 'day', 'month', 'dayofweek', 'hour', 'Hour_bin'],axis=1, inplace=True)
df3.head(10)

In [22]:
# Check our final column variable names
df3.columns.values

# Use all feature and implementing the LogisticRegression model

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Select dependent variable and prediction outcome from the data
df3_final_vars = df3.columns.values.tolist()
y = df3['Clicked on Ad']
X_features = [i for i in df3_final_vars if i not in y]
X = df3[X_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

# Do a 10 fold cross-validation

In [24]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, random_state=10)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

The average accuracy remains very close to the Logistic Regression model accuracy; we can conclude that our model generalizes well in the test set.

# Look at different performance evaluation metrics in testing data set：confusion matrix, ROC, precision, recall, and f1-score

In [25]:
# Looking at different performance evaluation metrics in testing data set：confusion matrix, ROC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
from time import time

def evaluation(estimator, X_test, y_test):

    start = time()
    y_pred = estimator.predict(X_test)
    print("Querying with the best model took %f seconds." % (time() - start))
    print(len(y_pred))
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print(confmat)

    fig, ax = plt.subplots(figsize=(3, 3))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i,
                    s=confmat[i, j],
                    va='center', ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.show()

    print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
    print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
    print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))
    print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_pred))
    print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))
    print('-----------------------------------------')
    print(metrics.classification_report(y_test, y_pred))

In [26]:
evaluation(logreg, X_test, y_test)

According to Jean-Sebastien Provost's proposal, we assume that you have a marketing campaign for which you spend 1000 per potential customer. For each customer that you target with your ad campaign and that clicks on the ad, let's assume that you'll get an overall profit of 100 (earn back 1100 per correct target). However, if you target a customer that ends up not clicking on your ad, then you get a net loss of 1050. Let's calculate the performance of this model: profit = (all_positive x 1100) - (true_positive + false_positive)  x 1000 -  false_positive x 1050 (not making sense of this assumtion made ????? what if someone who is not tagerted end up with clicking the ad?)

In [42]:
profit_of_campaign = 126 * 100 - 3 * 50
print('How much money is made: ${}'.format(profit_of_campaign))
print('How much money is made per customer: ${}'.format(profit_of_campaign/129))

# Using Sklearn's Pipeline function to combining features transformers and estimators to see if it can further improve performance 
It consisted of two intermediate steps, a StandardScaler and a PCA transformer, and a LogisticRegression classifier as a final estimator.

In [28]:
# Using Sklearn's Pipeline function to combining features transformers and estimators; 
# It consisted of two intermediate steps, a StandardScaler and a PCA transformer, and a LogisticRegression classifier as a final estimator.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Check performance of LogisticRegression algorithm with feature thransformation of StandardScaler and a PCA transformer
pipe_1 = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=2)), ('lr', LogisticRegression(random_state=2))])
pipe_1.fit(X_train, y_train)
print('LogisticRegression (with scaler/PCA) Test Accuracy: %.3f' % pipe_1.score(X_test, y_test))

We can see an improvment of test accuracy from 0.957 to 0.977. 

In [29]:
results_2 = model_selection.cross_val_score(pipe_1, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy of pipe_1: %.3f" % (results_2.mean()))

10-fold cross validation average accuracy of pipe_1: 0.980, it performs better. 

In [30]:
# Let's do an evaluation to check all performance metrics. 
evaluation(pipe_1, X_test, y_test)

We can see the second model peform slightly better than the first one in metrics, see below for the increased profit for each customer.

In [41]:
profit_of_campaign_2 = 133 * 100 - 3 * 50
print('How much money is made: ${}'.format(profit_of_campaign_2))
print('How much money is made per customer: ${}'.format(profit_of_campaign_2/136))

# Feature ranking with recursive feature elimination.

Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.

In [31]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Select dependent variable and prediction outcome from the data
df3_final_vars = df3.columns.values.tolist()
y = ['Clicked on Ad']
X = [i for i in df3_final_vars if i not in y]

feature_names = df3.columns.values
# print(feature_names)
logreg = LogisticRegression()
rfe = RFE(logreg)
rfe = rfe.fit(df3[X], df3[y])
print(rfe.support_)
print("Features sorted by their rank:")
print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), feature_names)))

# Selecting the best ranked features

In [32]:
mask = rfe.support_ #list of booleans
new_features = [] # The list of your K best features
for bool, feature in zip(mask, feature_names):
    if bool:
        new_features.append(feature)
print(new_features)
df3_final = df3[new_features]
#print(df3_final)

In [33]:
df3_final.head(10)