In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the advertising dataset into a DataFrame. The relative path points into the
# read-only "../input/" directory, so it resolves only from the Kaggle working directory.
advertisement = pd.read_csv('../input/advertising/advertising.csv')

In [None]:
advertisement.shape

In [None]:
advertisement.info()

In [None]:
advertisement['Daily Internet Usage'].describe()

In [None]:
advertisement.head()

In [None]:
advertisement['Age'].describe()

In [None]:
def categorize_age_range(x):
    """Map a numeric age to the label of the age range it falls in.

    Ranges are half-open five-year bands starting at 15 ("15-20" covers
    15 <= x < 20, and so on), except the last band, "45-50", which also
    includes 50. Ages above 50 are labelled ">50".

    Ages below 15 are labelled "<15" explicitly; previously they fell through
    every branch and came back as an implicit None.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    str or None
        The range label, or None when ``x`` is NaN (missing age).
    """
    # NaN is the only value that is not equal to itself; keep missing ages as None
    # so that they stay "missing" instead of being pushed into a real band.
    if x != x:
        return None
    if x < 15:
        return "<15"
    # Each check below relies on the previous ones having ruled out smaller ages.
    if x < 20:
        return "15-20"
    if x < 25:
        return "20-25"
    if x < 30:
        return "25-30"
    if x < 35:
        return "30-35"
    if x < 40:
        return "35-40"
    if x < 45:
        return "40-45"
    if x <= 50:  # upper bound is inclusive for this band only
        return "45-50"
    return ">50"
    
# Replace each numeric age with its range label. The function is passed directly,
# since a lambda wrapper adds nothing. This overwrites 'Age', so run once per fresh load.
advertisement['Age'] = advertisement['Age'].apply(categorize_age_range)

In [None]:
advertisement.head()

In [None]:
advertisement['Daily Time Spent on Site'].describe()

In [None]:
# Discard the free-text, location and timestamp columns before the feature matrix is built.
advertisement = advertisement.drop(columns=['Ad Topic Line', 'City', 'Country', 'Timestamp'])

In [None]:
# One-hot encode the age-range labels; drop_first=True omits the first category,
# which becomes the baseline level.
# dtype=int keeps the indicators numeric: recent pandas versions default to bool, and a
# frame mixing float and bool columns is converted to an object array when it is handed
# to statsmodels (GLM / variance_inflation_factor) later in the notebook.
age_dummies = pd.get_dummies(advertisement['Age'], drop_first=True, dtype=int)

In [None]:
advertisement = pd.concat([advertisement, age_dummies], axis = 1)

In [None]:
advertisement.head()

In [None]:
# The dummy columns now carry the age information, so the label column is removed.
advertisement = advertisement.drop(columns='Age')

In [None]:
advertisement.head()

In [None]:
advertisement[['Area Income','Daily Time Spent on Site','Daily Internet Usage']].describe()

### Model Selection

In [None]:
X = advertisement.drop('Clicked on Ad', axis = 1)

In [None]:
y = advertisement['Clicked on Ad']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7, test_size =0.3, random_state = 1)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
# X_train was derived from X by train_test_split; take an explicit copy so the column
# assignment below writes to X_train itself and does not rely on pandas'
# SettingWithCopy behaviour (the warning is hidden by the global warnings filter).
X_train = X_train.copy()

# Standardise only the continuous columns. The scaler is fitted on the training rows
# here and reused, already fitted, on the test rows later in the notebook.
continuous_cols = ['Area Income', 'Daily Time Spent on Site', 'Daily Internet Usage']
X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])

In [None]:
X_train.head()

In [None]:
# Annotated correlation heatmap of the training features, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(X_train.corr(), annot=True, ax=ax)
plt.show()

### Fit the Model

In [None]:
import statsmodels.api as sm
# Baseline model on all features: a binomial-family GLM (i.e. logistic regression)
# with an explicit intercept column added by add_constant.
logm1 = sm.GLM(y_train, (sm.add_constant(X_train)), family = sm.families.Binomial())
# The fitted result is only displayed here; it is not stored for later use.
logm1.fit().summary()

### RFE

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
from sklearn.feature_selection import RFE
# Recursive feature elimination down to 10 features.
# n_features_to_select is passed by keyword: current scikit-learn accepts only the
# estimator positionally, so the old RFE(lr, 10) call raises a TypeError there.
# NOTE(review): the ranking estimator is a linear regression although the target is
# binary; it is kept unchanged because later cells drop columns by name from the
# resulting selection.
rfe = RFE(estimator=lr, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
X_train_const = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train, X_train_const, family = sm.families.Binomial())

In [None]:
res = logm2.fit()
res.summary()

### Check VIF

In [None]:
# Variance inflation factor of each RFE-selected feature, highest first.
from statsmodels.stats.outliers_influence import variance_inflation_factor
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [variance_inflation_factor(selected.values, idx) for idx in range(selected.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Keep only the RFE-selected features, then remove the '20-25' age dummy.
X_train = X_train[col].drop(columns='20-25')

In [None]:
X_train.info()

In [None]:
X_train_const = sm.add_constant(X_train)
logm3 = sm.GLM(y_train, X_train_const, family = sm.families.Binomial())

In [None]:
res = logm3.fit()
res.summary()

### Check the VIF and p-values to eliminate features that are not significant or have a high variance inflation factor

In [None]:
# Recompute the variance inflation factors on the reduced training feature set.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, pos) for pos in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Eliminate the least significant feature first, then validate again.

In [None]:
# Remove the '25-30' age dummy from the training features.
X_train = X_train.drop(columns='25-30')

In [None]:
# Refit the binomial GLM on the remaining features. `res` from this fit is the model
# used for the training and test predictions in the cells that follow.
X_train_const = sm.add_constant(X_train)
logm4 = sm.GLM(y_train, X_train_const, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

### Features are finalised

In [None]:
col = X_train.columns
col

In [None]:
# Predicted click probabilities for the training rows, flattened to a 1-D NumPy array.
y_train_pred = np.ravel(res.predict(X_train_const).values)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Actual_Clicked':y_train.values, 'Click_Predict':y_train_pred})
y_train_pred_final.head()

In [None]:
# Flag a click (1) when the predicted probability exceeds the initial 0.8 cutoff, else 0.
y_train_pred_final['predicted'] = (y_train_pred_final['Click_Predict'] > 0.8).astype('int64')

# Preview the first rows
y_train_pred_final.head()

In [None]:
y_train_pred_final.Click_Predict.describe()

### Further our analysis using the confusion matrix

In [None]:
from sklearn import metrics
# Confusion matrix of the training predictions made with the 0.8 cutoff
# (rows: actual class, columns: predicted class).
confusion = metrics.confusion_matrix(y_train_pred_final.Actual_Clicked, y_train_pred_final.predicted )
confusion

### Accuracy, Sensitivity, Specificity

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Actual_Clicked, y_train_pred_final.predicted))

In [None]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives.
TN, FP, FN, TP = confusion.ravel()

### Sensitivity

In [None]:
TP / float(TP+FN)

### Specificity

In [None]:
TN / float(TN+FP)

### Plot the ROC curve

In [None]:
def draw_roc(actual, probs):
    """Plot and display the ROC curve for a set of scores.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Scores handed to ``metrics.roc_curve`` and ``metrics.roc_auc_score``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)

    _, ax = plt.subplots(figsize=(5, 5))
    ax.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# The ROC curve must be built from the continuous predicted probabilities. Passing the
# thresholded 0/1 'predicted' labels collapses the curve to a single operating point
# and understates the AUC.
draw_roc(y_train_pred_final['Actual_Clicked'], y_train_pred_final['Click_Predict'])

### Probability Cutoff 

In [None]:
# For every candidate cutoff 0.0, 0.1, ..., 0.9 add a column, named by the cutoff itself,
# holding the 0/1 prediction obtained at that cutoff.
numbers = [float(x)/10 for x in range(10)]
for cutoff in numbers:
    y_train_pred_final[cutoff] = (y_train_pred_final['Click_Predict'] > cutoff).astype('int64')
y_train_pred_final.head()

In [None]:
# Table of accuracy, sensitivity and specificity at each probability cutoff.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

# Confusion-matrix layout used below:
#   cm1[1,1] true positives     cm1[0,0] true negatives
#   cm1[0,1] false positives    cm1[1,0] false negatives

# These values must match the cutoff-named columns added to y_train_pred_final earlier.
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final['Actual_Clicked'], y_train_pred_final[i] )
    total1=sum(sum(cm1))  # sum over both axes of the matrix = number of rows scored
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])  # true negatives / all actual negatives
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])  # true positives / all actual positives
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)

In [None]:
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

From the probability cutoff plot above, the accuracy, sensitivity and specificity curves intersect close to 0.4, at approximately 0.38. We therefore take 0.38 as the probability cutoff for the rest of the analysis.

In [None]:
# Final training predictions: 1 when the predicted probability exceeds the 0.38 cutoff.
y_train_pred_final['final_predicted'] = (y_train_pred_final['Click_Predict'] > 0.38).astype('int64')

y_train_pred_final.head()

In [None]:
metrics.accuracy_score(y_train_pred_final.Actual_Clicked, y_train_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_train_pred_final.Actual_Clicked, y_train_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives.
TN, FP, FN, TP = confusion2.ravel()

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
# Precision of the final model: score the labels produced with the chosen cutoff
# ('final_predicted'), not the exploratory 0.8-cutoff 'predicted' column.
precision_score(y_train_pred_final['Actual_Clicked'], y_train_pred_final.final_predicted)

In [None]:
# Recall of the final model: score the labels produced with the chosen cutoff
# ('final_predicted'), not the exploratory 0.8-cutoff 'predicted' column.
recall_score(y_train_pred_final['Actual_Clicked'], y_train_pred_final.final_predicted)

### Predict the test set

In [None]:
# X_test was derived from X by train_test_split; copy it so the assignment below writes
# to X_test itself and does not rely on pandas' SettingWithCopy behaviour.
X_test = X_test.copy()

# Apply the scaler fitted on the training rows (transform only, no refit) so the test
# features are on the same scale the model was trained on.
continuous_cols = ['Area Income', 'Daily Time Spent on Site', 'Daily Internet Usage']
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

In [None]:
X_test = X_test[col]
X_test.head()

In [None]:
# has_constant='add' always inserts the intercept column. With the default ('skip'),
# add_constant returns the frame unchanged whenever some column is already constant
# (e.g. a dummy that is all zeros in the test split), and the design matrix would then
# no longer match the fitted model.
X_test_sm = sm.add_constant(X_test, has_constant='add')
# Predicted click probabilities for the test rows from the final fitted model.
y_test_pred = res.predict(X_test_sm)

In [None]:
y_pred_1 = pd.DataFrame(y_test_pred)
y_test_df = pd.DataFrame(y_test)

In [None]:
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)
y_pred_final

In [None]:
y_pred_final= y_pred_final.rename(columns={ 0 : 'Click_Predict', 'Clicked on Ad':'Actual_Clicked'})
y_pred_final

In [None]:
# Apply the same 0.38 cutoff that was selected on the training data; the test set must
# be scored with the training-time threshold (this cell previously used 0.39).
y_pred_final['final_predicted'] = y_pred_final.Click_Predict.map(lambda x: 1 if x > 0.38 else 0)
y_pred_final

In [None]:
accuracy = metrics.accuracy_score(y_pred_final['Actual_Clicked'], y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final['Actual_Clicked'], y_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the 2x2 test-set confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives.
TN, FP, FN, TP = confusion2.ravel()

In [None]:
sensitivity = TP / float(TP+FN)

In [None]:
specificity = TN / float(TN+FP)

In [None]:
# Summary of the test-set metrics (label typo "accurancy" corrected).
print(f'accuracy = {accuracy} , specificity ={specificity}, Sensitivity = {sensitivity}')