# Lead Scoring Case Study
The data file contains 37 variables including target variable 'Converted'. The dataset is of an education company X which sells online cources. The important leads details are provided as part of this dataset. As datascientists our task is to provide the promising leads that have high chances of converting to the paid customers.The CEO of this company has provided a target of getting 80% conversion.

# Step 1: Import packages and Read Data

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import scale
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing the data and the data dictionary.

dic = pd.read_excel (r'C:\Users\rachi\Downloads\Lead Scoring Assignment\Leads Data Dictionary.xlsx')

In [3]:
# dropping rows with Nan Values in data dictionary
dic = dic.drop([0, 1])

# dropping columns with NaN values in data dictionary

dic = dic.drop(dic.ix[:, 0:1].columns, axis = 1)

# Renaming columns of dictionary to make more sense.
dic.rename(columns = {'Unnamed: 1':'Variables', 'Unnamed: 2' : 'Description'}, inplace = True)

# Data Dictionary

Below mentioned are the details of the columns being used in the data set of this assignment.

In [4]:
# Reset index of dictionary dataframe
dic.reset_index()

Unnamed: 0,index,Variables,Description
0,2,Prospect ID,A unique ID with which the customer is identif...
1,3,Lead Number,A lead number assigned to each lead procured.
2,4,Lead Origin,The origin identifier with which the customer ...
3,5,Lead Source,"The source of the lead. Includes Google, Organ..."
4,6,Do Not Email,An indicator variable selected by the customer...
5,7,Do Not Call,An indicator variable selected by the customer...
6,8,Converted,The target variable. Indicates whether a lead ...
7,9,TotalVisits,The total number of visits made by the custome...
8,10,Total Time Spent on Website,The total time spent by the customer on the we...
9,11,Page Views Per Visit,Average number of pages on the website viewed ...


In [5]:
df_lead = pd.read_csv("Leads.csv")
df_lead.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


# Step 2: Inspect Data

In [6]:
df_lead.dtypes

Prospect ID                                       object
Lead Number                                        int64
Lead Origin                                       object
Lead Source                                       object
Do Not Email                                      object
Do Not Call                                       object
Converted                                          int64
TotalVisits                                      float64
Total Time Spent on Website                        int64
Page Views Per Visit                             float64
Last Activity                                     object
Country                                           object
Specialization                                    object
How did you hear about X Education                object
What is your current occupation                   object
What matters most to you in choosing a course     object
Search                                            object
Magazine                       

In [7]:
df_lead.shape

(9240, 37)

# Step 3: Data Preparation

In [10]:
# Replacing select value with NaN
df_lead.replace(to_replace ="Select", value ="NaN") 

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.00,...,No,,,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.50,...,No,,,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.00,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.00,...,No,,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.00,...,No,,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,19d6451e-fcd6-407c-b83b-48e1af805ea9,579564,Landing Page Submission,Direct Traffic,Yes,No,1,8.0,1845,2.67,...,No,Potential Lead,Mumbai,02.Medium,01.High,15.0,17.0,No,No,Email Marked Spam
9236,82a7005b-7196-4d56-95ce-a79f937a158d,579546,Landing Page Submission,Direct Traffic,No,No,0,2.0,238,2.00,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,19.0,No,Yes,SMS Sent
9237,aac550fe-a586-452d-8d3c-f1b62c94e02c,579545,Landing Page Submission,Direct Traffic,Yes,No,0,2.0,199,2.00,...,No,Potential Lead,Mumbai,02.Medium,01.High,13.0,20.0,No,Yes,SMS Sent
9238,5330a7d1-2f2b-4df4-85d6-64ca2f6b95b9,579538,Landing Page Submission,Google,No,No,1,3.0,499,3.00,...,No,,Other Metro Cities,02.Medium,02.Medium,15.0,16.0,No,No,SMS Sent


In [11]:
# remove duplicate rows
df_lead.drop_duplicates(subset='Lead Number')
df_lead.shape

(9240, 37)

In [12]:
#check for percentage nulls

total = pd.DataFrame(df_lead.isnull().sum().sort_values(ascending=False), columns=['Total'])
percentage = pd.DataFrame(round(100*(df_lead.isnull().sum()/df_lead.shape[0]),2).sort_values(ascending=False)\
                          ,columns=['Percentage'])
pd.concat([total, percentage], axis = 1)

Unnamed: 0,Total,Percentage
Lead Quality,4767,51.59
Asymmetrique Profile Score,4218,45.65
Asymmetrique Activity Score,4218,45.65
Asymmetrique Profile Index,4218,45.65
Asymmetrique Activity Index,4218,45.65
Tags,3353,36.29
What matters most to you in choosing a course,2709,29.32
Lead Profile,2709,29.32
What is your current occupation,2690,29.11
Country,2461,26.63


In [13]:
# check for columns with only null values
df_lead.isnull().all(axis=0).any()

False

In [14]:
# check for all columns with only 0 values
df_lead.loc[:, (df_lead != 0).any(axis=0)]
df_lead.shape

(9240, 37)

In [15]:
#Removing columns which has only one unique value
df_lead= df_lead.loc[:,df_lead.nunique()!=1]
df_lead.shape

(9240, 32)

In [16]:
# Remove the columns 'Asymmetrique Activity Score' & 'Asymmetrique Profile Score' 
df_lead = df_lead.drop('Asymmetrique Activity Score', axis=1)
df_lead = df_lead.drop('Asymmetrique Profile Score', axis=1)
df_lead.shape

(9240, 30)

In [17]:
# drop 'Prospect ID' as not required in the model prediction
df_lead = df_lead.drop('Prospect ID', axis=1)
df_lead.shape

(9240, 29)

In [18]:
# drop the columns 'What matters most to you in choosing a course' as has unique values and null values.
df_lead = df_lead.drop('What matters most to you in choosing a course', axis=1)
df_lead.shape

(9240, 28)

In [19]:
# drop the columns 'How did you hear about X Education' 
df_lead = df_lead.drop('How did you hear about X Education', axis=1)
df_lead.shape

(9240, 27)

In [20]:
#Drop columns with missing values
df_lead['Lead Source'].isnull().sum()

36

In [21]:
# eliminate rows where a particular column has high missing values
df_lead = df_lead[~pd.isnull(df_lead['Lead Source'])]
df_lead.shape

(9204, 27)

In [5]:
#Impute median and mode wherever necessary
df_lead['TotalVisits'].replace(np.NaN, df_lead['TotalVisits'].median(), inplace =True)

NameError: name 'df_lead' is not defined

In [None]:
df_lead['Page Views Per Visit'].replace(np.NaN, df_lead['Page Views Per Visit'].median(), inplace =True)

In [None]:
df_lead['Country'].mode()

In [None]:
df_lead.loc[pd.isnull(df_lead['Country']), ['Country']] = 'India'

In [6]:
df_lead['Country'] = df_lead['Country'].apply(lambda x: 'India' if x=='India' else 'Not India')
df_lead['Country'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(x='Country', y='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Lead Quality'].value_counts()

In [None]:
df_lead['Lead Quality'].isnull().sum()

In [7]:
df_lead['Lead Quality'].fillna("Unknown", inplace = True)
df_lead['Lead Quality'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(x='Lead Quality', y='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Asymmetrique Profile Index'].value_counts()

In [None]:
df_lead['Asymmetrique Profile Index'].isnull().sum()

In [8]:
df_lead['Asymmetrique Profile Index'].fillna("Unknown", inplace = True)
df_lead['Asymmetrique Profile Index'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(x='Asymmetrique Profile Index', y='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Asymmetrique Activity Index'].value_counts()

In [None]:
df_lead['Asymmetrique Activity Index'].isnull().sum()

In [9]:
df_lead['Asymmetrique Activity Index'].fillna("Unknown", inplace = True)
df_lead['Asymmetrique Activity Index'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(x='Asymmetrique Activity Index', y='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['City'].isnull().sum()

In [None]:
df_lead['City'].fillna("Unknown", inplace = True)
df_lead['City'].value_counts()

In [10]:
df_lead['City'].replace('Select', 'Unknown', inplace =True)
df_lead['City'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(y='City', x='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Last Activity'].value_counts()

In [None]:
df_lead['Last Activity'].isnull().sum()

In [11]:
df_lead['Last Activity'].fillna("Unknown", inplace = True)
df_lead['Last Activity'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(y='Last Activity', x='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Lead Profile'].value_counts()

In [None]:
df_lead['Lead Profile'].isnull().sum()

In [12]:
df_lead['Lead Profile'].fillna("Unknown", inplace = True)
df_lead['Lead Profile'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
df_lead['Lead Profile'].replace('Select', 'Unknown', inplace =True)
df_lead['Lead Profile'].value_counts()

In [None]:
sns.barplot(y='Lead Profile', x='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [13]:
df_lead['What is your current occupation'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
df_lead['What is your current occupation'].isnull().sum()

In [14]:
df_lead['What is your current occupation'].fillna("Unknown", inplace = True)
df_lead['What is your current occupation'].value_counts()

NameError: name 'df_lead' is not defined

In [None]:
sns.barplot(y='What is your current occupation', x='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Specialization'].value_counts()

In [15]:
df_lead['Specialization'].isnull().sum()

NameError: name 'df_lead' is not defined

In [None]:
df_lead['Specialization'].fillna("Unknown", inplace = True)
df_lead['Specialization'].value_counts()

In [None]:
sns.barplot(y='Specialization', x='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [None]:
df_lead['Tags'].value_counts()

In [16]:
df_lead['Tags'].isnull().sum()

NameError: name 'df_lead' is not defined

In [None]:
df_lead['Tags'].fillna("Unknown", inplace = True)
df_lead['Tags'].value_counts()

In [None]:
sns.barplot(y='Tags', x='Converted', palette='husl', data=df_lead, estimator=np.sum)

In [17]:
#check for null values again
total = pd.DataFrame(df_lead.isnull().sum().sort_values(ascending=False), columns=['Total'])
percentage = pd.DataFrame(round(100*(df_lead.isnull().sum()/df_lead.shape[0]),2).sort_values(ascending=False)\
                          ,columns=['Percentage'])
pd.concat([total, percentage], axis = 1).head()

NameError: name 'pd' is not defined

# Check Outliers

In [None]:
#outliers check 25%,50%,75%,90%,95% and 99%
df_lead.describe(percentiles=[.25,.5,.75,.90,.95,.99]).T

In [None]:
num_vars = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
print(num_vars)

In [None]:
num_vars = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
#Num Var distribution plot function
def bxplt(v1_list):
    plt.figure(figsize=(15,10))
    for v1 in v1_list:
        plt.subplot(2,5,v1_list.index(v1)+1)
        sns.boxplot(y=v1,palette='cubehelix', data=df_lead)   
    plt.tight_layout()
    plt.show()
    
bxplt(num_vars)

In [18]:
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(df_lead['TotalVisits'], df_lead['Total Time Spent on Website'])
ax.set_xlabel('Proportion of non-retail business acres per town')
ax.set_ylabel('Full-value property-tax rate per $10,000')
plt.show()

NameError: name 'plt' is not defined

# Remove outliers on the Interquartile distance for the continuous variable

In [None]:
Q1 = df_lead['TotalVisits'].quantile(0.25)
Q3 = df_lead['TotalVisits'].quantile(0.75)
IQR = Q3 - Q1
df_lead=df_lead.loc[(df_lead['TotalVisits'] >= Q1 - 1.5*IQR) & (df_lead['TotalVisits'] <= Q3 + 1.4*IQR)]

Q1 = df_lead['Page Views Per Visit'].quantile(0.25)
Q3 = df_lead['Page Views Per Visit'].quantile(0.75)
IQR = Q3 - Q1
df_lead=df_lead.loc[(df_lead['Page Views Per Visit'] >= Q1 - 1.5*IQR) & (df_lead['Page Views Per Visit'] <= Q3 + 1.5*IQR)]

df_lead.shape

In [None]:
#Num Var distribution plot function
def bxplt(v1_list):
    plt.figure(figsize=(15,10))
    for v1 in v1_list:
        plt.subplot(2,5,v1_list.index(v1)+1)
        sns.boxplot(y=v1,palette='BuGn_r', data=df_lead)   
    plt.tight_layout()
    plt.show()
    
bxplt(num_vars)

In [19]:
df_lead.shape

NameError: name 'df_lead' is not defined

# binary variables (Yes/No) to 0/1 Conversion

In [None]:
vlist =  ['Search','Do Not Email', 'Do Not Call', 'Newspaper Article', 'X Education Forums', 'Newspaper', 
            'Digital Advertisement','Through Recommendations','A free copy of Mastering The Interview']

# map function
def bin_map(x):
    return x.map({'Yes': 1, "No": 0})

df_lead[vlist] = df_lead[vlist].apply(bin_map)
df_lead.head()

# Dummy Variable creation for the categorical vars

In [None]:
d1 = pd.get_dummies(df_lead[['Country', 'Lead Source','Lead Origin','Last Notable Activity']], drop_first=True)
df_lead = pd.concat([df_lead, d1], axis=1)
df_lead.shape

In [20]:
ml = pd.get_dummies(df_lead['Lead Quality'], prefix='Lead Quality')
ml1 = ml.drop(['Lead Quality_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['Asymmetrique Profile Index'], prefix='Asymmetrique Profile Index')
ml1 = ml.drop(['Asymmetrique Profile Index_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['Asymmetrique Activity Index'], prefix='Asymmetrique Activity Index')
ml1 = ml.drop(['Asymmetrique Activity Index_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['Tags'], prefix='Tags')
ml1 = ml.drop(['Tags_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['Lead Profile'], prefix='Lead Profile')
ml1 = ml.drop(['Lead Profile_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['What is your current occupation'], prefix='What is your current occupation')
ml1 = ml.drop(['What is your current occupation_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['Specialization'], prefix='Specialization')
ml1 = ml.drop(['Specialization_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['City'], prefix='City')
ml1 = ml.drop(['City_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

ml = pd.get_dummies(df_lead['Last Activity'], prefix='Last Activity')
ml1 = ml.drop(['Last Activity_Unknown'], 1)
df_lead = pd.concat([df_lead,ml1], axis=1)

df_lead.shape

NameError: name 'pd' is not defined

# Remove redundant vars

In [None]:
df_lead = df_lead.drop(['Lead Quality','Asymmetrique Profile Index','Asymmetrique Activity Index','Tags','Lead Profile',
                    'Lead Origin','What is your current occupation', 'Specialization', 'City','Last Activity', 'Country', 
                    'Lead Source','Last Notable Activity'], 1)
df_lead.shape

In [None]:
df_lead.head()

In [None]:
# check no more categorical columns left in the df
cols = df_lead.columns
num_cols = df_lead._get_numeric_data().columns
list(set(cols) - set(num_cols))

In [21]:
# copy of this origial variable for later use
orig_lead = df_lead.copy()
print(orig_lead.shape)
print(df_lead.shape)

NameError: name 'df_lead' is not defined

# Step 4: Test-Train data Split

In [None]:
# Including feature variable to X
X = df_lead.drop(['Converted','Lead Number'], axis=1)

X.head()

In [None]:
# Include response variable to y
y = df_lead['Converted']

y.head()

In [None]:
# train and test split of data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

# Step 5: Scaling Features

In [22]:
scaler = StandardScaler()

X_train[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.fit_transform(X_train[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])

X_train.head()

NameError: name 'StandardScaler' is not defined

In [None]:
X_train.describe()

# Lead Conversion Rate

In [None]:
converted = (sum(df_lead['Converted'])/len(df_lead['Converted'].index))*100
converted

38% lead conversion rate

# Step 6: Model Building

Run First Training Model

In [23]:
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

NameError: name 'sm' is not defined

# Step 7: RFE

In [None]:
logreg = LogisticRegression()

In [None]:
rfe = RFE(logreg, 20)
rfe = rfe.fit(X_train, y_train)

In [24]:
rfe.support_

NameError: name 'rfe' is not defined

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
X_train.columns[~rfe.support_]

# Model Assessment with stats model

In [25]:
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

NameError: name 'sm' is not defined

In [None]:
# Get the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
# reshaping the array of predicted values
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final['LeadID'] = y_train.index
y_train_pred_final.head()

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)

y_train_pred_final.head()

# Confusion Metrics

In [26]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

NameError: name 'metrics' is not defined

In [None]:
#overall accuracy check
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

# VIFs

In [27]:
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

NameError: name 'pd' is not defined

There seems to be no multicollinearity present in our model among the selected features as per their VIF values.
Now check the correlation among the features in the below heat map.

In [None]:
plt.figure(figsize=(18,13), dpi=80, facecolor='w', edgecolor='k', frameon='True')

cor = X_train[col].corr()
sns.heatmap(cor, annot=True, cmap="YlGnBu")

plt.tight_layout()
plt.show()

# Remove vars and update the Model
Some of the variable have high VIFs as well as high p-values. Those variables are insignificant and can be dropped.

In [None]:
col = col.drop('Tags_number not provided', 1)
col

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

In [28]:
# Get the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

NameError: name 'res' is not defined

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final['LeadID'] = y_train.index
y_train_pred_final.head()

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [29]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

NameError: name 'metrics' is not defined

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

# VIFs

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

# Remove the Variable and Updat the Model

In [30]:
col = col.drop('Tags_wrong number given', 1)
col

NameError: name 'col' is not defined

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final['LeadID'] = y_train.index
y_train_pred_final.head()

In [31]:
y_train_pred_final['predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

NameError: name 'y_train_pred_final' is not defined

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

# VIFs

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

# Remove the Variable and Update the Mode

In [32]:
col = col.drop('Tags_Diploma holder (Not Eligible)', 1)
col

NameError: name 'col' is not defined

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm5.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [33]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final['LeadID'] = y_train.index
y_train_pred_final.head()

NameError: name 'pd' is not defined

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

# VIFs

In [34]:
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

NameError: name 'pd' is not defined

# Remove the Variable and Update the Model

In [None]:
col = col.drop('Tags_invalid number', 1)
col

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm6 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm6.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [35]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final['LeadID'] = y_train.index
y_train_pred_final.head()

NameError: name 'pd' is not defined

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

# VIFs

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [36]:
plt.figure(figsize=(18,13), dpi=80, facecolor='w', edgecolor='k', frameon='True')

cor = X_train[col].corr()
sns.heatmap(cor, annot=True, cmap="YlGnBu")

plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

# Our latest models features are as follows:
All variables have p-value < 0.05.
All the features have very low VIF values, meaning, there is not much muliticollinearity among the features as per heat map.
The overall accuracy of 0.9125 at a probability threshold of 0.05 is also very acceptable.
So we need not drop any more variables and we can proceed with making predictions using this model only

# Step 8: Calculating Sensitivity and Specificity

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# calculate specificity
TN / float(TN+FP)

In [37]:
# false postive rate
print(FP/ float(TN+FP))

NameError: name 'FP' is not defined

In [None]:
# positive predicted value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

# ROC Curve

In [None]:
def draw_roc( actual, probs ):
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                              drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return fpr,tpr, thresholds

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob, drop_intermediate = False )

In [38]:
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

NameError: name 'draw_roc' is not defined

# The area under the curve

In [None]:
def auc_val(fpr,tpr):
    AreaUnderCurve = 0.
    for i in range(len(fpr)-1):
        AreaUnderCurve += (fpr[i+1]-fpr[i]) * (tpr[i+1]+tpr[i])
    AreaUnderCurve *= 0.5
    return AreaUnderCurve

In [None]:
auc = auc_val(fpr,tpr)
auc

# Step 10: Find Cutoff Point

In [39]:
# create columns with different probability cutoffs 
num = [float(x)/10 for x in range(10)]
for i in num:
    y_train_pred_final[i]= y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

NameError: name 'y_train_pred_final' is not defined

In [None]:
#calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

# TP = confusion[1,1] # true positive 
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
    total1=sum(sum(cm1))
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)

# plot accuracy sensitivity and specificity for various probabilities

In [None]:
sns.set_style("whitegrid") 
sns.set_context("paper") 
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'], figsize=(12,8))
plt.xticks(np.arange(0, 1, step=0.05), size = 15)
plt.yticks(size = 15)
plt.show()

# 0.33 is the optimum point to take it as a cutoff probability

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Conversion_Prob.map( lambda x: 1 if x > 0.33 else 0)

y_train_pred_final.head()

In [40]:
metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

NameError: name 'metrics' is not defined

In [None]:
confusion1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted)
confusion1

In [None]:
TP = confusion1[1,1] # true positive 
TN = confusion1[0,0] # true negatives
FP = confusion1[0,1] # false positives
FN = confusion1[1,0] # false negatives

In [None]:
#sensitivity 
TP / float(TP+FN)

In [None]:
#specificity
TN / float(TN+FP)

In [41]:
#false postive rate
print(FP/ float(TN+FP))

NameError: name 'FP' is not defined

In [None]:
# Positive predictive value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

# Step 11: Precision and Recall

Precision
TP / TP + FP

In [None]:
precision = confusion1[1,1]/(confusion1[0,1]+confusion1[1,1])
precision

Recall
TP / TP + FN

In [None]:
recall = confusion1[1,1]/(confusion1[1,0]+confusion1[1,1])
recall

In [42]:
precision_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

NameError: name 'precision_score' is not defined

In [None]:
recall_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
y_train_pred_final.Converted, y_train_pred_final.final_predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

In [None]:
plt.figure(figsize=(10, 6), dpi=100, facecolor='w', edgecolor='k', frameon='True')
plt.plot(thresholds, p[:-1], "g-")
plt.plot(thresholds, r[:-1], "r-")
plt.xticks(np.arange(0, 1, step=0.05))
plt.show()

# The optical threshold value  .37 
The business requirement is to have Lead Conversion Rate around 80%.
This is achieved with the earlier threshold value of 0.33. 

# F1 score

In [43]:
F1 = 2*(precision*recall)/(precision+recall)
F1

NameError: name 'precision' is not defined

# Step 12: Making predictions on the test set

In [None]:
X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.transform(X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])
X_test.head()

In [None]:
X_test = X_test[col]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

In [44]:
y_test_pred = res.predict(X_test_sm)

NameError: name 'res' is not defined

In [None]:
y_test_pred[:10]

In [None]:
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
y_pred_1.head()

In [None]:
y_test_df = pd.DataFrame(y_test)

In [45]:
y_test_df['LeadID'] = y_test_df.index

NameError: name 'y_test_df' is not defined

In [None]:
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
y_pred_final= y_pred_final.rename(columns={ 0 : 'Conversion_Prob'})

In [None]:
y_pred_final = y_pred_final.reindex_axis(['LeadID','Converted','Conversion_Prob'], axis=1)

In [46]:
y_pred_final.head()

NameError: name 'y_pred_final' is not defined

In [None]:
y_pred_final.shape

Using the probability threshold value 0f 0.33 on the test dataset to predict if a lead will convert

In [None]:
y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.33 else 0)

In [None]:
y_pred_final.head()

In [None]:
#accuracy.
acc_score=metrics.accuracy_score(y_pred_final.Converted, y_pred_final.final_predicted)
acc_score

In [47]:
confusion_test = metrics.confusion_matrix(y_pred_final.Converted, y_pred_final.final_predicted )
print(confusion_test)

NameError: name 'metrics' is not defined

In [None]:
TP = confusion_test[1,1] # true positive 
TN = confusion_test[0,0] # true negatives
FP = confusion_test[0,1] # false positives
FN = confusion_test[1,0] # false negatives

Sensitivity
TP / TP + FN

In [None]:
#sensitivity
TP / float(TP+FN)

Specificity
TN / TN + FP

In [None]:
#specificity
TN / float(TN+FP)

In [None]:
# false postive rate
print(FP/ float(TN+FP))

In [48]:
# Positive predictive value 
print (TP / float(TP+FP))

NameError: name 'TP' is not defined

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

Precision
TP / TP + FP

In [None]:
Precision = confusion_test[1,1]/(confusion_test[0,1]+confusion_test[1,1])
Precision

Recall
TP / TP + FN

In [None]:
Recall = confusion_test[1,1]/(confusion_test[1,0]+confusion_test[1,1])
Recall

In [None]:
F1 = 2*(Precision*Recall)/(Precision+Recall)
F1

In [49]:
print(classification_report(y_pred_final.Converted, y_pred_final.final_predicted))

NameError: name 'classification_report' is not defined

Cross Validation Score
To avoid overfitting, calculate the Cross Validation Score to see how our model performs

In [None]:
lr = LogisticRegression(solver = 'lbfgs')
scores = cross_val_score(lr, X, y, cv=10)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)

# ROC for test data set

In [None]:
def draw_roc( actual, probs ):
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                              drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return fpr,tpr, thresholds

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_pred_final.Converted, y_pred_final.Conversion_Prob, drop_intermediate = False )

In [None]:
draw_roc(y_pred_final.Converted, y_pred_final.Conversion_Prob)

# Area under curve

In [50]:
def auc_val(fpr,tpr):
    AreaUnderCurve = 0.
    for i in range(len(fpr)-1):
        AreaUnderCurve += (fpr[i+1]-fpr[i]) * (tpr[i+1]+tpr[i])
    AreaUnderCurve *= 0.5
    return AreaUnderCurve

In [51]:
auc = auc_val(fpr,tpr)
auc

NameError: name 'fpr' is not defined

As a rule of thumb, an AUC can be classed as follows,

0.90 - 1.00 = excellent

0.80 - 0.90 = good

0.70 - 0.80 = fair

0.60 - 0.70 = poor

0.50 - 0.60 = fail

Auc is 0.9678, our model seems to be doing well on the test dataset.

# Step 13: Lead score for the complete dataset
Lead Score = 100 * ConversionProbability

This needs to be calculated for all the leads from the original dataset (train + test)

In [None]:
leads_test_pred = y_pred_final.copy()
leads_test_pred.head()

In [None]:
leads_train_pred = y_train_pred_final.copy()
leads_train_pred.head()

In [None]:
leads_train_pred = leads_train_pred[['LeadID','Converted','Conversion_Prob','final_predicted']]
leads_train_pred.head()

# Merging the train and the test dataset with the Conversion Probabilities

In [None]:
lead_full_pred = leads_train_pred.append(leads_test_pred)
lead_full_pred.head()

In [52]:
print(leads_train_pred.shape)
print(leads_test_pred.shape)
print(lead_full_pred.shape)

NameError: name 'leads_train_pred' is not defined

In [None]:
len(lead_full_pred['LeadID'].unique().tolist())

In [None]:
lead_full_pred['Lead_Score'] = lead_full_pred['Conversion_Prob'].apply(lambda x : round(x*100))
lead_full_pred.head()

In [None]:
lead_full_pred.LeadID.max()

In [None]:
lead_full_pred = lead_full_pred.set_index('LeadID').sort_index(axis = 0, ascending = True)
lead_full_pred.head()

In [53]:
# Slicing the Lead Number column from original_leads dataframe
orig_lead = orig_lead[['Lead Number']]
orig_lead.head()

NameError: name 'orig_lead' is not defined

Concatenating the 2 dataframes based on index

In [None]:
leads_with_score = pd.concat([orig_lead, lead_full_pred], axis=1)
leads_with_score.head(10)

In [None]:
leads_with_score.shape

In [None]:
total = pd.DataFrame(leads_with_score.isnull().sum().sort_values(ascending=False), columns=['Total'])
percentage = pd.DataFrame(round(100*(leads_with_score.isnull().sum()/leads_with_score.shape[0]),2).sort_values(ascending=False)\
                          ,columns=['Percentage'])
pd.concat([total, percentage], axis = 1)

# Step 14: Feature Importance

In [None]:
pd.options.display.float_format = '{:.2f}'.format
new_params = res.params[1:]
new_params

In [54]:
#feature_importance = abs(new_params)
feature_importance = new_params
feature_importance = 100.0 * (feature_importance / feature_importance.max())
feature_importance

NameError: name 'new_params' is not defined

In [None]:
sorted_idx = np.argsort(feature_importance,kind='quicksort',order='list of str')
sorted_idx

In [None]:
pos = np.arange(sorted_idx.shape[0]) + .5

featfig = plt.figure(figsize=(10,6))
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center', color = 'tab:blue',alpha=0.8)
featax.set_yticks(pos)
featax.set_yticklabels(np.array(X_train[col].columns)[sorted_idx], fontsize=12)
featax.set_xlabel('Relative Feature Importance', fontsize=14)

plt.tight_layout()   
plt.show()

# Top 3 features selection where lead gets converted

In [None]:
pd.DataFrame(feature_importance).reset_index().sort_values(by=0,ascending=False).head(3)