In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# supressing warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Setting display option to show all columns and values
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Importing Data
leads = pd.read_csv("Leads.csv") 

### Data Understanding

In [None]:
leads.head()

In [None]:
leads.info()

We can see from the head of the dataframe itself that many columns have entries equal to 'Select'. It is the default value of any field, so it is as good as null. Thus we replace 'Select' with NaN.

In [None]:
# 'Select' is the placeholder left in unfilled fields, so treat it as missing.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
leads = leads.replace('Select', np.nan)

In [None]:
# Percentage of missing values per column, rounded to two decimals
missing_counts = leads.isna().sum()
(missing_counts / len(leads.index) * 100).round(2)

In [None]:
# Checking for duplicate rows
leads.duplicated().sum()
# No duplicate entries found

### Understanding the data dictionary
##### Categorical Columns

Dropping unnecessary and high misssing value columns

In [None]:
# countplot for cities
plt.figure(figsize=(12,4))
sns.countplot(x='City',data=leads)

In [None]:
# countplot for Country
plt.figure(figsize=(25,4))
sns.countplot(x='Country',data=leads)

In cities we have most of the leads from Mumbai. We don't have a city name for almost 2300 leads, and cities other than Mumbai are not specified. 
For the Country column, 99% of the leads are from India, so the column has very little variance.
Prospect ID is a unique identifier and will not help in model building.
Thus we drop these three columns.

In [None]:
# Dropping unnecessary columns and columns with more than 40% missing values.
# Imputing columns with more than 40% missing values may bias the data,
# so those columns are removed instead.
# `columns=` is used because pandas 2.x no longer accepts the axis as a
# positional argument (`drop(labels, 1)`).
leads = leads.drop(columns=['Prospect ID','Country','City'])
# columns with more than 40% missing values
leads = leads.drop(columns=['How did you hear about X Education','Asymmetrique Profile Score','Asymmetrique Activity Score',
                            'Asymmetrique Profile Index','Asymmetrique Activity Index','Lead Quality',
                            'Lead Profile'])

In [None]:
# Columns suspected of having no variance (the same value in every entry):
# print the value counts of the first four, display the last one as the cell result.
printed_columns = [
    'I agree to pay the amount through cheque',
    'Update me on Supply Chain Content',
    'Receive More Updates About Our Courses',
    'Magazine',
]
for column_name in printed_columns:
    print(leads[column_name].value_counts())

leads['Get updates on DM Content'].value_counts()

In [None]:
# Dropping columns that are not adding value to the model or are redundant.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns=['I agree to pay the amount through cheque','Update me on Supply Chain Content','Receive More Updates About Our Courses','Magazine','Get updates on DM Content'])

In [None]:
#The origin identifier with which the customer was identified to be a lead. Includes API, Landing Page Submission, etc.
leads['Lead Origin'].value_counts()

In [None]:
#we have only one entry for 'Quick Add Form'. It will not of much help in prediction thus deleting row.
leads=leads[leads['Lead Origin']!= 'Quick Add Form']

##### Lead Source 
The source of the lead. Includes Google, Organic Search, Olark Chat, etc.

In [None]:
# 0.39 missing values
# There are many categories with few entries. Treating them as 'Others'
leads['Lead Source'].astype('category').value_counts()

In [None]:
# Replacing categories with few entries by 'Other'.
# The result is assigned back instead of using `inplace=True` on the selected
# column: an in-place call on `leads[...]` is a chained assignment, which pandas
# warns about and which does not update `leads` under copy-on-write.
leads['Lead Source'] = leads['Lead Source'].replace(to_replace=['bing','Click2call','Press_Release','Social Media','Live Chat','Pay per Click Ads','welearnblog_Home','NC_EDM','WeLearn','blog','testone','youtubechannel'],value='Other')
# Merge the lower-case spelling into the main 'Google' category
leads['Lead Source'] = leads['Lead Source'].replace(to_replace='google',value='Google')
# 0.39% missing values. Imputing with the most frequent category (mode)
leads['Lead Source'] = leads['Lead Source'].fillna(value='Google')

#### Last Activity & Last Notable Activity

In [None]:
leads['Last Activity'].value_counts()

In [None]:
# 1.11% missing values
leads['Last Activity'].value_counts()

In [None]:
# 0% missing values
leads['Last Notable Activity'].value_counts()

'Last Notable Activity' and 'Last Activity' have similar categories. Considering these columns are showing similar or redundant information. Dropping 'Last Activity' column

In [None]:
# 'Last Activity' is treated as redundant with 'Last Notable Activity'.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns='Last Activity')

In [None]:
# Merging 'Email Bounced' into the 'Unreachable' category.
# Assigned back rather than replaced in place on the selected column, which is
# a chained assignment and does not update `leads` under copy-on-write.
leads['Last Notable Activity'] = leads['Last Notable Activity'].replace(to_replace ='Email Bounced',value='Unreachable')

In [None]:
# some categories have very less entries
# deleting such categories
activity = ['Modified','Email Opened','SMS Sent','Page Visited on Website','Olark Chat Conversation','Email Link Clicked','Unreachable','Had a Phone Conversation']
leads = leads.loc[leads['Last Notable Activity'].isin(activity)]

In [None]:
leads.columns

##### Dropping Columns which will not help in model building

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns=['Search','Newspaper Article','X Education Forums','Newspaper','Digital Advertisement'])

###### Tags
- Tags assigned to customers indicating the current status of the lead.
- 36% missing values

In [None]:
leads['Tags'].astype('category').value_counts()

In [None]:
# Total leads converted from each category
leads.groupby(leads['Tags']).Converted.sum()

In [None]:
# Based on the value count of each category and the leads converted per category,
# combine categories that depict similar information.
# Each result is assigned back: `inplace=True` on `leads['Tags']` is a chained
# assignment and does not update `leads` under copy-on-write.
leads['Tags'] = leads['Tags'].replace(to_replace=['In confusion whether part time or DLP','Lateral student','Approached upfront',  'opp hangup', 'Still Thinking',  'Lost to Others','Shall take in the next coming month','Interested in Next batch', 'Want to take admission but has financial problems', 'number not provided'], value='Others')
leads['Tags'] = leads['Tags'].replace(to_replace=['wrong number given','University not recognized','invalid number','Recognition issue (DEC approval)','Not doing further education','Diploma holder (Not Eligible)'], value='Not Interested')
leads['Tags'] = leads['Tags'].replace(to_replace='in touch with EINS',value = 'Lost to EINS')
leads['Tags'] = leads['Tags'].replace(to_replace='Interested  in full time MBA',value = 'Interested in other courses')
leads['Tags'] = leads['Tags'].replace(to_replace=['switched off','Busy'], value='Busy-switchoff')

In [None]:
leads['Tags'].astype('category').value_counts()

In [None]:
# Fill missing tags with 'Will revert after reading the email'.
# Assigned back instead of `inplace=True` on the selected column (chained assignment).
leads['Tags'] = leads['Tags'].fillna(value='Will revert after reading the email')

In [None]:
# 'What is your current occupation' - 29% missing values.
# Copy it under a shorter name, then remove the original column.
leads['occupation'] = leads['What is your current occupation']
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns='What is your current occupation')

In [None]:
# countplot category wise
plt.figure(figsize=(10,5))
sns.countplot(x='occupation',data=leads)

In [None]:
# Imputing the 29% of missing values with the mode.
# Assigned back instead of `inplace=True` on the selected column (chained assignment).
leads['occupation'] = leads['occupation'].fillna(value='Unemployed')

In [None]:
#What matters most to you in choosing a course - 29% missing values
plt.figure(figsize=(8,5))
sns.countplot(x='What matters most to you in choosing a course',data=leads)

In [None]:
leads['Specialization'].value_counts()

'Specialization' has 36% missing values. Imputing 36% of the data may create noise in the data. 
'What matters most to you in choosing a course' has 29% missing values. Also, the data is highly skewed category-wise. Thus dropping these columns (the cell below also drops 'Tags').

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns=['Specialization','What matters most to you in choosing a course','Tags'])

#### Numerical Columns

###### Total Visits & Total Time Spent on Website

In [None]:
# 'TotalVisits' - the total number of visits made by the customer on the website.
# 1.11% missing values.
# The vector is passed as `x=` explicitly: seaborn 0.11 warns on a positional
# vector and seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=leads['TotalVisits'])

In [None]:
# Removing outliers: keep only the leads with at most 20 total visits.
# NOTE(review): a NaN never satisfies `<= 20`, so rows with a missing
# 'TotalVisits' are dropped by this filter as well - confirm this is intended.
leads = leads[leads['TotalVisits']<=20]
# Summary statistics of the remaining values
leads['TotalVisits'].describe()

In [None]:
# As mean and median have the same value, impute with the median (3).
# Assigned back instead of `inplace=True` on the selected column (chained assignment).
# NOTE(review): the `<= 20` outlier filter in the previous cell already removes
# rows where 'TotalVisits' is NaN, so this fill is presumably a no-op - confirm.
leads['TotalVisits'] = leads['TotalVisits'].fillna(value=3)

In [None]:
leads['Total Time Spent on Website'].describe()

In [None]:
# Distribution of 'Total Time Spent on Website'
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(leads['Total Time Spent on Website'])

While mean of the total time spend on the website is 0, we see the gradual increase in the total time spent. 

In [None]:
# Page views per visit.
# The vector is passed as `x=` explicitly: seaborn 0.11 warns on a positional
# vector and seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=leads['Page Views Per Visit'])

In [None]:
#getting numeric columns in one dataframe
numeric_df = leads[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']]

In [None]:
# Correlation between numeric columns
numeric_df.corr()

In [None]:
sns.scatterplot(x = 'TotalVisits', y = 'Page Views Per Visit', data=leads)

We can see that as the total visits to the site increase, the Page Views Per Visit also increase; the two have a correlation of 71%. 

In [None]:
# 'Page Views Per Visit' and 'TotalVisits' are highly correlated, so drop 'Page Views Per Visit'.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns='Page Views Per Visit')

In [None]:
leads['A free copy of Mastering The Interview'].value_counts()

### Info of dataset after cleaning

In [None]:
leads.info()

### Data Preparation

#### Converting some binary variables (Yes/No) to 0/1

In [None]:
# List of variables to map

varlist =  ['Do Not Email', 'Do Not Call', 'Through Recommendations', 'A free copy of Mastering The Interview']

# Defining the map function
def binary_map(x):
    """Map a Series of 'Yes'/'No' strings to 1/0.

    Values that are neither 'Yes' nor 'No' have no entry in the lookup and
    therefore come back as NaN from `Series.map`.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

# Applying the function 
leads[varlist] = leads[varlist].apply(binary_map)

In [None]:
leads.head()

#### Getting Dummy variable for categorical columns

In [None]:
# Creating dummies for the multi-level categorical variables.
# dtype=int forces 0/1 integer columns: pandas >= 2.0 returns booleans by default,
# which turns the mixed design matrix into an object array when it is handed to
# statsmodels (GLM) or converted with `.values` for the VIF computation.
d1 = pd.get_dummies(leads['Lead Origin'],drop_first=True,prefix='Lead_Origin',dtype=int)
d2 = pd.get_dummies(leads['Lead Source'],drop_first=True,prefix='Lead_Source',dtype=int)
d3 = pd.get_dummies(leads['Last Notable Activity'],drop_first=True,prefix='LastActivity',dtype=int)
d4 = pd.get_dummies(leads['occupation'],drop_first=True,prefix='occupation',dtype=int)

In [None]:
# Concating created dummies
leads = pd.concat([leads,d1,d2,d3,d4],axis=1)

In [None]:
# Removing the original categorical columns now that their dummies exist.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns=['Lead Origin','Lead Source','Last Notable Activity','occupation'])

In [None]:
# Final Dataframe
leads.head()

In [None]:
# Correlation matrix of every column except the 'Lead Number' identifier
plt.figure(figsize = (20,10))        # Size of the figure
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
sns.heatmap(leads.drop(columns='Lead Number').corr(),annot = True)
plt.show()

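'Lead_Origin_Lead Add Form' is highly correlated with 'Lead_Source_Reference' (0.85 correlation).
Thus dropping 'Lead_Origin_Lead Add Form' (the cell below also drops 'Lead_Origin_Lead Import', 'Lead_Source_Olark Chat', 'LastActivity_Modified' and 'occupation_Unemployed').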

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
leads = leads.drop(columns=['Lead_Origin_Lead Add Form','Lead_Origin_Lead Import','Lead_Source_Olark Chat','LastActivity_Modified','occupation_Unemployed'])

In [None]:
# after outlier treatment and dropping missing values we came up with final clean dataset with 9002 rows
leads.shape

In [None]:
# Conversion rate
leads['Converted'].sum() / len(leads.index)

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Putting independent variables in X
X = leads.drop(['Lead Number','Converted'],axis=1)
X.head()

In [None]:
# Getting dependent variable in y
y = leads['Converted']
Lead_Number = leads['Lead Number']
y.head()

In [None]:
# splitting data in train-test
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
X_train[['TotalVisits','Total Time Spent on Website']] = scaler.fit_transform(X_train[['TotalVisits','Total Time Spent on Website']])
X_train.head()

In [None]:
y_train.shape

### Model Building

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [None]:
# Logistic Regression Model using all features - Model1
logm1 = sm.GLM(y_train, (sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Feature selection using RFE
logreg = LogisticRegression()
# Running RFE with 17 variables as output. `n_features_to_select` must be passed
# by keyword: current scikit-learn releases reject it as a positional argument.
rfe = RFE(logreg, n_features_to_select=17)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]

In [None]:
X_train.columns[~rfe.support_]

In [None]:
# Model 2 - RFE 17 Features (X_train_sm)
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res2 = logm2.fit()
res2.summary()

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Table of the selected feature names with their variance inflation factors,
# rounded to two decimals and sorted from highest to lowest.
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [variance_inflation_factor(selected.values, idx) for idx in range(selected.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# While the VIFs seem to be within limits (VIF < 5), some features have a very high p-value.
# Removing the insignificant feature (p-value = 0.999).
# `col` is an Index, whose `drop` has no axis parameter: the stray `1` was being
# passed as `errors`, so it is removed.
col = col.drop('occupation_Housewife')
col

In [None]:
# Let's re-run the model using the selected variables - Model 3
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res3 = logm3.fit()
res3.summary()

In [None]:
# VIF- Model3
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Insignificant variable - p-value = 0.999.
# `Index.drop` has no axis parameter; the stray `1` was being passed as `errors`.
col = col.drop('LastActivity_Had a Phone Conversation')
col

In [None]:
# Let's re-run the model using the selected variables - Model 4
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res3 = logm3.fit()
res3.summary()

In [None]:
# VIF- Model4
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# `Index.drop` has no axis parameter; the stray `1` was being passed as `errors`.
col = col.drop('LastActivity_Olark Chat Conversation')
col

In [None]:
# Let's re-run the model using the selected variables - Model 5
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res3 = logm3.fit()
res3.summary()

In [None]:
# VIF- Model5
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# `Index.drop` has no axis parameter; the stray `1` was being passed as `errors`.
col = col.drop('occupation_Other')
col

In [None]:
# Let's re-run the model using the selected variables - Model 6
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res3 = logm3.fit()
res3.summary()

In [None]:
# VIF- Model6
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# `Index.drop` has no axis parameter; the stray `1` was being passed as `errors`.
col = col.drop('Lead_Source_Facebook')
col

In [None]:
# Let's re-run the model using the selected variables - Model 7
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res3 = logm3.fit()
res3.summary()

In [None]:
# VIF- Model7
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# `Index.drop` has no axis parameter; the stray `1` was being passed as `errors`.
col = col.drop('Lead_Source_Referral Sites')
col

In [None]:
# Let's re-run the model using the selected variables - Model 8
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res3 = logm3.fit()
res3.summary()

In [None]:
# VIF- Model8
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
y_train_pred = res3.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
# Actual outcome and predicted conversion probability for every training row
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Converted_Prob':y_train_pred})
# Attach the real 'Lead Number' of each row. Previously the dataframe row index
# (`y_train.index`) was stored here, which is not the lead identifier.
y_train_pred_final['Lead_NUmber'] = Lead_Number.loc[y_train.index].values
y_train_pred_final.head()

In [None]:
# Show only the highest-probability rows; with display.max_rows set to None the
# full sorted frame would otherwise be rendered in the notebook.
y_train_pred_final.sort_values(by='Converted_Prob',ascending=False).head(20)

In [None]:
# taking 0.4 as Cut-off probability 
y_train_pred_final['predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.4 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

### Plotting ROC Curve

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of `probs` against the `actual` labels.

    The curve is drawn on a new 5x5 figure together with the diagonal
    reference line, and the legend shows the area under the curve.
    Returns None.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Converted_Prob, drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Converted_Prob)

We got the ROC curve hugging the upper left corner i.e. curve with high True Positive Rate and low False Positive Rate. Higher the area under the curve better is the model.

### Optimal Cutoff point

In [None]:
# One 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9
numbers = [step / 10 for step in range(10)]
for i in numbers:
    exceeds_cutoff = y_train_pred_final.Converted_Prob > i
    y_train_pred_final[i] = exceeds_cutoff.astype(int)
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity of the training predictions at each probability cutoff.
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])
from sklearn.metrics import confusion_matrix

# Confusion-matrix layout: [0,0] = true negatives, [0,1] = false positives,
#                          [1,0] = false negatives, [1,1] = true positives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i])
    true_neg, false_pos = cm1[0, 0], cm1[0, 1]
    false_neg, true_pos = cm1[1, 0], cm1[1, 1]
    total1 = cm1.sum()

    accuracy = (true_neg + true_pos) / total1
    speci = true_neg / (true_neg + false_pos)
    sensi = true_pos / (false_neg + true_pos)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

From the above graph taking 0.35 as optimum cutoff.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Converted_Prob.map( lambda x: 1 if x > 0.35 else 0)

y_train_pred_final.head()

In [None]:
# Assigning 'Lead Score' to each 'Lead Number' 
y_train_pred_final['Lead Score'] = y_train_pred_final.Converted_Prob.map( lambda x: round(x*100,2))

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final['final_predicted'])

We have got the accuracy as 79%

In [None]:
confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
round((TP / float(TP+FN))*100,2)

##### We have got a 'Sensitivity' of 81%. This means that out of the total converted leads we are able to identify 81%.

### Making predictions on the test set

In [None]:
X_test[['TotalVisits','Total Time Spent on Website']] = scaler.transform(X_test[['TotalVisits','Total Time Spent on Website']])
X_test = X_test[col]
X_test.head()

In [None]:
# Using stats model, adding constant
X_test_sm = sm.add_constant(X_test)

In [None]:
y_test_pred = res3.predict(X_test_sm)

In [None]:
y_test_pred.values.reshape(-1)

In [None]:
# Actual outcome and predicted conversion probability for every test row
y_test_pred_final = pd.DataFrame({'Converted':y_test.values, 'Converted_Prob':y_test_pred})
# The column name must match the training frame's 'Lead_NUmber' spelling: the
# final report cell selects 'Lead_NUmber' from both frames and previously raised
# a KeyError because this one was named 'Lead_Number'.
# The value is the real 'Lead Number' of each row, not the dataframe row index.
y_test_pred_final['Lead_NUmber'] = Lead_Number.loc[y_test.index].values
y_test_pred_final.head()

In [None]:
y_test_pred_final['final_predicted'] = y_test_pred_final.Converted_Prob.map(lambda x: 1 if x>0.35 else 0)
y_test_pred_final.head()

In [None]:
y_test_pred_final['Lead Score'] = y_test_pred_final.Converted_Prob.map(lambda x: round(x*100,2))

In [None]:
y_test_pred_final.head()

In [None]:
#Let's check the accuracy for test set
metrics.accuracy_score(y_test_pred_final.Converted, y_test_pred_final.final_predicted)

We got 77% of accuracy in test set

In [None]:
# Confusion matrix for test set
confusion_test = metrics.confusion_matrix(y_test_pred_final.Converted, y_test_pred_final.final_predicted )
confusion_test

In [None]:
# Model evaluation metrics for the test set.
# These must be read from `confusion_test`; the cell previously reused
# `confusion2`, the training-set matrix, so the "test" sensitivity and
# specificity below were really the training values.
TP = confusion_test[1,1] # true positives
TN = confusion_test[0,0] # true negatives
FP = confusion_test[0,1] # false positives
FN = confusion_test[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

With this model we got 81% sensitivity on the test set as well. That means that out of the total leads that converted, our model is able to identify 81%. 

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

We got the specificity of the model to be 78%. That means that out of the total leads that did not convert (0), we are able to identify 78%.

In [None]:
y_train_pred_final.head()

In [None]:
y_test_pred_final.head()

In [None]:
Y_train=y_train_pred_final[['Lead_NUmber','Converted','final_predicted','Lead Score']]

In [None]:
Y_test =y_test_pred_final[['Lead_NUmber','Converted','final_predicted','Lead Score']]

In [None]:
# Final Dataframe of Lead_Number with Lead Score assigned to it
LeadNumber_wt_LeadScore = pd.concat([Y_train,Y_test])
LeadNumber_wt_LeadScore.head(20)

In [None]:
LeadNumber_wt_LeadScore.shape

We came up with final model with:
- Accuracy : 77% 
- Sensitivity : 81%
- Specificity : 78%

for the test dataset. 
*We got the similar values for training dataset as well

As we need to maximise the conversion rate of the leads, i.e. identify potential leads accurately, we need the sensitivity to be higher than the specificity, so that we do not lose potential leads.

We have assigned 'Lead Score' to each Lead. And identified leads with score more than 35% as hot leads or potential leads.

#### ****************************************************************************************************************************
#### ****************************************************************************************************************************

#Question 3
We can achieve this by increasing the TPR, i.e. the True Positive Rate.
TPR = True Positives / Actual Positives = TP / (TP + FN). That is, predicting almost all of the actual positives.

In [None]:
y_train_pred_final['predicted_Q3'] = y_train_pred_final.Converted_Prob.map( lambda x: 1 if x > 0.2 else 0)

y_train_pred_final.head()

In [None]:
# Confusion matrix for 0.2 as cutoff
confusion_Q3 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted_Q3 )
confusion_Q3

In [None]:
# Model Evaluation matrics 
TP = confusion_Q3[1,1] # true positive 
TN = confusion_Q3[0,0] # true negatives
FP = confusion_Q3[0,1] # false positives
FN = confusion_Q3[1,0] # false negatives

In [None]:
True_Positive_Rate = TP / (FN + TP)
print("True Positive Rate:", True_Positive_Rate)

In [None]:
# Q4
hot_leads = y_train_pred_final.sort_values(by='Lead Score', ascending=False).head(20)
hot_leads[['Lead_NUmber','Converted','final_predicted','Lead Score']]

In [None]:
# Q4: flag only the leads whose predicted conversion probability exceeds 0.97
y_train_pred_final['predicted_Q4'] = y_train_pred_final.Converted_Prob.map( lambda x: 1 if x > 0.97 else 0)

# Show a preview only; with display.max_rows set to None a bare frame renders every row.
y_train_pred_final.head()

In [None]:
# Confusion matrix for improving precision (0.97 cutoff).
# Must use the `predicted_Q4` column; the cell previously passed `predicted_Q3`
# (the 0.2 cutoff), so the precision computed afterwards did not reflect Q4.
confusion_Q4 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted_Q4 )
confusion_Q4

In [None]:
# Model Evaluation matrics 
TP = confusion_Q4[1,1] # true positive 
TN = confusion_Q4[0,0] # true negatives
FP = confusion_Q4[0,1] # false positives
FN = confusion_Q4[1,0] # false negatives

In [None]:
FP

In [None]:
precision = TP / (TP + FP)

In [None]:
print('Precision:', precision)