### Importing the required libraries to perform Logistic Regression

In [None]:
#import all the necessary libraries

import pandas as pd
import numpy as np
import pandas as pd

# For Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# To Scale our data
from sklearn.preprocessing import scale

import warnings
warnings.filterwarnings("ignore")

### Step 1: Reading and Understanding the Data

In [None]:
leads_scoring=pd.read_csv("./Leads.csv")

### Inspecting the data

In [None]:
leads_scoring.head(5)

In [None]:
leads_scoring.info()

#### Replacing the 'Select' option in categorical variables, as it is essentially just a null value

In [None]:
leads_scoring=leads_scoring.replace('Select',np.nan)

In [None]:
leads_scoring.describe()

#### Dropping duplicate records

In [None]:
leads_scoring.drop_duplicates(inplace=True)

**Missing values along rows**

In [None]:
leads_scoring.isnull().sum(axis=1)

**Missing values along columns**

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treatment of missing values

**Dropping columns with 70% or more empty records**

In [None]:
leads_scoring=leads_scoring.drop(['How did you hear about X Education','Lead Profile'],axis=1)

#### Removing Information about customer that is for company purpose and doesn't serve any use in analysis

In [None]:
leads_scoring=leads_scoring.drop(['Prospect ID','Lead Number'],axis=1)

In [None]:
#Finding the number of unique values under each collumn
leads_scoring.nunique()

**Dropping columns with a single value, as they serve no use for the analysis**

In [None]:
leads_scoring=leads_scoring.drop(['Magazine','Receive More Updates About Our Courses','Update me on Supply Chain Content','Get updates on DM Content','I agree to pay the amount through cheque'],axis=1)

In [None]:
leads_scoring.isnull().sum()

#### Imputing Missing values in Lead Quality

In [None]:
leads_scoring.groupby(by='Lead Quality').count()

In [None]:
round(100*(leads_scoring['Lead Quality'].isnull().sum()/len(leads_scoring.index)),2)

There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.
We don't have any information about these missing fields hence replacing them by 'Unassigned'

In [None]:
leads_scoring['Lead Quality']=leads_scoring['Lead Quality'].replace(np.nan,"Unassigned")

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Inspecting & Treating missing values in 'Asymmetrique Activity Index', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score' and 'Asymmetrique Profile Score'

In [None]:
leads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()

In [None]:
leads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()

In [None]:
leads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()

In [None]:
leads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()

#### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score                       

In [None]:
leads_scoring=leads_scoring.drop(['Asymmetrique Activity Index','Asymmetrique Activity Score','Asymmetrique Profile Index','Asymmetrique Profile Score'],axis=1)

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating City Collumn missing values

In [None]:
round(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)),2)

#### The 'City' column has approximately 40% missing values. 'Mumbai' is by far the most frequent value and the other values occur only rarely, so the column cannot be imputed reliably with any single value and we drop it as well

In [None]:
leads_scoring.drop('City',axis=1,inplace=True)

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating Tags collumn missing values

In [None]:
round(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)),2)

Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'

In [None]:
leads_scoring['Tags']=leads_scoring['Tags'].replace(np.nan,'Unknown')

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating Specialization collumn missing values

In [None]:
round(100*(leads_scoring.groupby('Specialization').Specialization.count()/len(leads_scoring.index)),2)

37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. Hence replacing the null values with 'Specialization Not given'

In [None]:
leads_scoring['Specialization']=leads_scoring['Specialization'].replace(np.nan,'Specialization Not given')

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating What is your current occupation missing values

In [None]:
round(100*(leads_scoring.groupby('What is your current occupation')['What is your current occupation'].count()/len(leads_scoring.index)),2)

About 60% of the prospects are 'Unemployed'; however, it would be unsafe to impute that value for the missing records, so we impute them with 'Other' instead

In [None]:
leads_scoring['What is your current occupation']=leads_scoring['What is your current occupation'].replace(np.nan,'Other')

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating What matters most to you in choosing a course collumn missing values

In [None]:
round(100*(leads_scoring.groupby('What matters most to you in choosing a course')['What matters most to you in choosing a course'].count()/len(leads_scoring.index)),2)

In 'What matters most to you in choosing a course', 71% of the values are 'Better Career Prospects' and 29% are missing. It makes sense, both logically and from a business point of view, to impute the missing values with 'Better Career Prospects'

In [None]:
leads_scoring['What matters most to you in choosing a course']=leads_scoring['What matters most to you in choosing a course'].replace(np.nan,'Better Career Prospects')

In [None]:
round(100*(leads_scoring.groupby('What matters most to you in choosing a course')['What matters most to you in choosing a course'].count()/len(leads_scoring.index)),2)

#### After imputing, 99.97% of the column's values are 'Better Career Prospects'. The column therefore carries almost no variability and does not help the analysis, so it can be dropped

In [None]:
leads_scoring.drop('What matters most to you in choosing a course',axis=1,inplace=True)

Recap: 'What is your current occupation' had ~29% missing values and 60% of prospects are 'Unemployed', but it is unsafe to replace the missing fields with 'Unemployed'.
Hence the missing fields were replaced with 'Other' (see the treatment of that column above).

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating Country collumn missing values

In [None]:
round(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)),2)

Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value

In [None]:
leads_scoring['Country']=leads_scoring['Country'].replace(np.nan,'India') 

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Inspecting & Treating missing values in 'Page Views Per Visit'

In [None]:
round(100*(leads_scoring.groupby('Page Views Per Visit')['Page Views Per Visit'].count()/len(leads_scoring.index)),2)

0.0 is the most frequently occurring value, so we impute the missing values in this column with it

In [None]:
# Assign the result back to the column instead of calling replace(inplace=True)
# on a column selection: the in-place call may operate on a temporary object and
# leave leads_scoring unchanged (it never updates the frame under pandas Copy-on-Write).
leads_scoring['Page Views Per Visit']=leads_scoring['Page Views Per Visit'].replace(np.nan,0.0)

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating missing values in Total Visits column 

In [None]:
round(100*(leads_scoring.groupby('TotalVisits')['TotalVisits'].count()/len(leads_scoring.index)),2)

0.0 is the most frequently occurring value, so we impute the missing values in this column with it

In [None]:
# Assign the result back to the column instead of calling replace(inplace=True)
# on a column selection: the in-place call may operate on a temporary object and
# leave leads_scoring unchanged (it never updates the frame under pandas Copy-on-Write).
leads_scoring['TotalVisits']=leads_scoring['TotalVisits'].replace(np.nan,0.0)

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating missing values in Last Activity collumn

In [None]:
round(100*(leads_scoring.groupby('Last Activity')['Last Activity'].count()/len(leads_scoring.index)),2)

Replacing nan values with maximum occuring value that is Email Opened

In [None]:
leads_scoring['Last Activity']=leads_scoring['Last Activity'].replace(np.nan,'Email Opened')

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Treating Lead Source missing values

In [None]:
round(100*(leads_scoring.groupby('Lead Source')['Lead Source'].count()/len(leads_scoring.index)),2)

In [None]:
# 'Google' shows up under two spellings that differ only in case ("google" and
# "Google"); fold the lower-case variant into the capitalised one. Every other
# value, including missing ones, is left as it is.
lead_source = leads_scoring['Lead Source']
leads_scoring['Lead Source'] = lead_source.replace("google", "Google")

In [None]:
round(100*(leads_scoring['Lead Source'].value_counts()/len(leads_scoring.index)),2)

In lead Source column, replacing null values with most occurring value "Google"

In [None]:
leads_scoring['Lead Source']=leads_scoring['Lead Source'].replace(np.nan,'Google')

In [None]:
round(100*leads_scoring.isnull().sum()/len(leads_scoring.index),2)

#### Thus all missing values have been handled

**Treating columns based on value frequency**

In [None]:
leads_scoring.nunique()

**Checking the column frequencies where only '2' types of values exits.**

In [None]:
round(100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*(leads_scoring['Do Not Call'].value_counts()/len(leads_scoring.index)),2)

#### The column "Do Not Call" has almost all values as "No", hence this column can be safely dropped for lack of variability.

In [None]:
leads_scoring.drop('Do Not Call',axis=1,inplace=True)

In [None]:
round(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['Search','Converted']).Converted.count())/len(leads_scoring.index)),2)

In [None]:
round(100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['Newspaper Article','Converted']).Converted.count())/len(leads_scoring.index)),2)

In [None]:
round(100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['X Education Forums','Converted']).Converted.count())/len(leads_scoring.index)),2)

In [None]:
round(100*(leads_scoring['Newspaper'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['Newspaper','Converted']).Converted.count())/len(leads_scoring.index)),2)

In [None]:
round(100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['Digital Advertisement','Converted']).Converted.count())/len(leads_scoring.index)),2)

In [None]:
round(100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['Through Recommendations','Converted']).Converted.count())/len(leads_scoring.index)),2)

**It can be noted that the customers who said "Yes" in the above 6 columns are a very small percentage, and the conversion percentage for each of them is almost negligible. The fields marked "Yes" therefore contribute hardly any variance to the dataset and do not prove useful to the analysis.
Hence we have chosen to drop the following columns:**
* Search
* Newspaper Article
* X Education Forums
* Newspaper
* Digital Advertisement
* Through Recommendations


In [None]:
leads_scoring=leads_scoring.drop(['Search','Newspaper Article','X Education Forums','Newspaper','Digital Advertisement','Through Recommendations'],axis=1)

In [None]:
round(100*(leads_scoring['A free copy of Mastering The Interview'].value_counts()/len(leads_scoring.index)),2)

In [None]:
round(100*((leads_scoring.groupby(['A free copy of Mastering The Interview','Converted']).Converted.count())/len(leads_scoring.index)),2)

**Outlier Treatment**

In [None]:
round(leads_scoring.describe(),2)

**Retaining only the data within 3 standard deviations of the mean for each of the following columns**
* Total Time Spent on Website
* Page Views Per Visit

In [None]:
def remove_outlier(df, Data, n_std=3):
    """Return the rows of ``df`` whose ``Data`` value is close to the column mean.

    A row is kept when the absolute distance between its ``Data`` value and
    the mean of that column is at most ``n_std`` times the column's standard
    deviation. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    Data : str
        Name of the numeric column used for the outlier test.
    n_std : float, default 3
        Width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The subset of rows that pass the test. Rows whose ``Data`` value is
        missing never satisfy the comparison and are therefore dropped too.
    """
    column = df[Data]
    distance_from_mean = np.abs(column - column.mean())
    return df[distance_from_mean <= n_std * column.std()]

In [None]:
leads_scoring=remove_outlier(leads_scoring,'Total Time Spent on Website')
leads_scoring=remove_outlier(leads_scoring,'Page Views Per Visit')

In [None]:
round(leads_scoring.describe(),2)

In [None]:
round(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)),2)

####  After Outlier Treatment  the data  has 38.49% cases of Conversion.

**Dummy variable creation for Logistic regression.**

In [None]:
# One-hot encode the categorical predictors; drop_first=True drops the first
# level of every variable. dtype=int forces numeric 0/1 indicators: recent
# pandas versions return boolean dummies by default, and mixing boolean and
# float columns produces an object-dtype design matrix that the statsmodels
# GLM used later cannot fit.
categorical_cols = ['Lead Origin','Lead Source','Last Activity','Country','Specialization','What is your current occupation','Tags','Lead Quality','Last Notable Activity']
dummy_df = pd.get_dummies(leads_scoring[categorical_cols], drop_first=True, dtype=int)

# Adding the results to the master dataframe
leads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)

In [None]:
# List of variables to map

varlist =  ['Do Not Email', 'A free copy of Mastering The Interview']

# Defining the map function
def binary_map(x):
    """Translate a Series of 'Yes'/'No' labels into 1/0.

    Values other than 'Yes' and 'No' have no entry in the lookup table and
    come back as missing values.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

# Applying the function to each of the binary Yes/No columns named in varlist
leads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)

In [None]:
leads_scoring_model.drop(['Lead Origin','Lead Source','Last Activity','Country','Specialization','What is your current occupation','Tags','Lead Quality','Last Notable Activity'],axis=1,inplace=True)

In [None]:
leads_scoring_model.info()

In [None]:
leads_scoring_model.shape

In [None]:
leads_scoring_model.head()

## Data Modelling
### Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Putting feature variable to X
X = leads_scoring_model.drop(['Converted'], axis=1)
X.head()

In [None]:
# Putting response variable to y
y = leads_scoring_model['Converted']

y.head()

#### Splitting the data into train and test

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

**Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
X_train[['Total Time Spent on Website','TotalVisits','Page Views Per Visit']] = scaler.fit_transform(X_train[['Total Time Spent on Website','TotalVisits','Page Views Per Visit']])
X_train.head()

In [None]:
X_train.describe()

In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Model Building

In [None]:
import statsmodels.api as sm

In [None]:
X_train.groupby('Lead Quality_Low in Relevance').count()

In [None]:
# Logistic regression model
logistic_model = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logistic_model.fit().summary()

**Feature Selection Using RFE**

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
from sklearn.feature_selection import RFE
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator as a positional argument of RFE.
rfe = RFE(logreg, n_features_to_select=15)             # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
cols = X_train.columns[rfe.support_]

In [None]:
X_train.columns[~rfe.support_]

### The following columns are required for Building the model

In [None]:
cols

In [None]:
X_train.shape

In [None]:
X_train[cols].shape

##### Assessing the model with StatsModels

In [None]:
X_train_sm = sm.add_constant(X_train[cols])
logistic_model2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logistic_model2.fit()
res.summary()

#### Getting the predicted values on the train data  set

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

#### Creating a dataframe with the original 'Converted' flag and the  'Predicted_Conversion' flag value also  calculating 'Converted_Prob' & 'Lead_Score'

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Converted_Probability':y_train_pred})
y_train_pred_final.head()

In [None]:
y_train_pred_final['Lead_Score']=round((y_train_pred_final['Converted_Probability']*100))
y_train_pred_final.head()

##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0

In [None]:
y_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(lambda x: 1 if x > 0.5 else 0)
# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion))

#### Checking VIFs

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Tabulate the variance inflation factor of every currently selected feature,
# rounded to two decimals and listed from the highest VIF to the lowest
selected_features = X_train[cols]
feature_matrix = selected_features.values
vif = pd.DataFrame({'Features': selected_features.columns})
vif['VIF'] = [variance_inflation_factor(feature_matrix, position) for position in range(feature_matrix.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

Dropping high p value (0.999) column 'Tags_Lateral student'

In [None]:
# cols is a pandas Index, whose drop() takes no axis argument; the stray
# positional 1 was being bound to the `errors` parameter, so it is removed.
cols = cols.drop('Tags_Lateral student')
cols

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[cols])
logistic_model3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logistic_model3.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Converted_Prob'] = y_train_pred

In [None]:
# Creating new column 'Predicted_Conversion' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final['Lead_Score']=round(y_train_pred_final['Converted_Prob']*100)
y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion))

So overall the accuracy hasn't dropped much

#### Checking the VIFs again

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train[cols].columns
vif['VIF'] = [variance_inflation_factor(X_train[cols].values, i) for i in range(X_train[cols].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Let's drop 'Tags_wrong number given' since it has a high p value
cols = cols.drop('Tags_wrong number given')
cols

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[cols])
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Converted_Prob'] = y_train_pred

In [None]:
# Creating new column 'Predicted_Conversion' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final['Lead_Score']=round(y_train_pred_final['Converted_Prob']*100)
y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion))

##### Let's now check the VIFs again

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train[cols].columns
vif['VIF'] = [variance_inflation_factor(X_train[cols].values, i) for i in range(X_train[cols].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Let's take a look at the confusion matrix again 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion )
confusion

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)

**Dropping 'Tags_Closed by Horizzon' since it has a high p value**

In [None]:
cols = cols.drop('Tags_Closed by Horizzon')
cols

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[cols])
logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm5.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Converted_Prob'] = y_train_pred

In [None]:
# Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final['Lead_Score']=round(y_train_pred_final['Converted_Prob']*100)
y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion))

**Checking the VIFs**

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train[cols].columns
vif['VIF'] = [variance_inflation_factor(X_train[cols].values, i) for i in range(X_train[cols].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)

In [None]:
cols = cols.drop('Tags_Busy')
cols

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[cols])
logm7 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm7.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Converted_Prob'] = y_train_pred

In [None]:
# Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final['Lead_Score']=round(y_train_pred_final['Converted_Prob']*100)
y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion))

Now all the variables are within critical VIF limit of <5 and p value of <0.05. Hence it is a good model with accuracy of ~89.21%.

In [None]:
# Let's take a look at the confusion matrix again 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion )
confusion

### Finding Optimal Cutoff Point

Optimal cutoff probability is that prob where we get balanced sensitivity and specificity

In [None]:
# Add one 0/1 prediction column per probability cutoff (0.0, 0.1, ..., 0.9);
# each column is named after its cutoff and holds 1 where Converted_Prob
# is strictly above it
numbers = [step / 10 for step in range(10)]
for cutoff in numbers:
    above_cutoff = y_train_pred_final.Converted_Prob > cutoff
    y_train_pred_final[cutoff] = above_cutoff.astype('int64')
y_train_pred_final.head()

In [None]:
# For every probability cutoff, derive accuracy, sensitivity and specificity
# from the confusion matrix of the actual 'Converted' flag against the
# prediction column of that cutoff, and collect one row per cutoff.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for cutoff in num:
    cutoff_matrix = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[cutoff])
    # Row-major order of the 2x2 matrix: [0,0], [0,1], [1,0], [1,1]
    tn, fp, fn, tp = cutoff_matrix.ravel()
    n_records = cutoff_matrix.sum()

    accuracy_at_cutoff = (tn + tp) / n_records
    specificity_at_cutoff = tn / (tn + fp)
    sensitivity_at_cutoff = tp / (fn + tp)
    cutoff_df.loc[cutoff] = [cutoff, accuracy_at_cutoff, sensitivity_at_cutoff, specificity_at_cutoff]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### In the plot above, the three curves intersect at ~0.30, so we take 0.30 as the cutoff probability.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Converted_Prob.map( lambda x: 1 if x > 0.30 else 0)

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives,
# [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = confusion2.ravel()

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Calculate false positive rate - predicting conversion when the lead has not converted
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

### Making predictions on the test set

In [None]:
X_test[['Total Time Spent on Website','TotalVisits','Page Views Per Visit']] = scaler.transform(X_test[['Total Time Spent on Website','TotalVisits','Page Views Per Visit']])

In [None]:
X_test = X_test[cols]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

In [None]:
X_test.shape

Making predictions on the test set

In [None]:
y_test_pred = res.predict(X_test_sm)

In [None]:
y_test_pred[:10]

In [None]:
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Let's see the head
y_pred_1.head()

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Converted_Probability'})

In [None]:
# Let's see the head of y_pred_final
y_pred_final.head()

In [None]:
# Use the 0.30 cutoff that was selected on the training set, so the test-set
# metrics are computed at the same threshold as the final training metrics
# (this line previously used 0.38).
y_pred_final['Predicted_Conversion'] = y_pred_final.Converted_Probability.map(lambda x: 1 if x > 0.30 else 0)

In [None]:
y_pred_final['Lead_Score'] =round(y_pred_final['Converted_Probability']*100)

In [None]:
y_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.Converted, y_pred_final.Predicted_Conversion)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Converted, y_pred_final.Predicted_Conversion)
confusion2

In [None]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives,
# [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = confusion2.ravel()

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

## Checking Conversion % 

**Using the model, a cutoff score has to be decided which the company can employ to identify the hot leads. Among those identified hot leads it should get a conversion rate of at least 80%.**

In [None]:
y_train_pred_final.head(2)

In [None]:
df_train=y_train_pred_final[['Converted','Lead_Score','Predicted_Conversion']]

In [None]:
y_pred_final.head()

In [None]:
df_test=y_pred_final[['Converted','Lead_Score','Predicted_Conversion']]

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
df=pd.concat([df_train,df_test])

In [None]:
df.shape

In [None]:
df.head()

**At a Lead Score of 32 the desired conversion rate of >80% is obtained, hence the cutoff Lead Score is chosen as 32.**

In [None]:
df_optimal_conversion=df[df['Lead_Score']>=32]

In [None]:
df_optimal_conversion.shape

In [None]:
conversion_rate=round(100*(df_optimal_conversion['Converted'].sum()/len(df_optimal_conversion.index)))

In [None]:
conversion_rate

### Conclusion

## Thus, the Lead_Score cutoff has been set at 32. A conversion rate of 88% has been achieved. Hence it is a good model with an accuracy of 88.88%. X Education should therefore try to reach only prospects with a Lead Score of >=32 to achieve a >80% conversion rate. ##