# Lead Score Case Study

In [None]:
# importing all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
from matplotlib.pyplot import xticks
%matplotlib inline
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
path = '/kaggle/input/leads-dataset/Leads.csv' 

In [None]:
leadScoreDf = pd.read_csv( path )

In [None]:
leadScoreDf.head( )

In [None]:
leadScoreDf.shape

In [None]:
leadScoreDf.describe( )

In [None]:
leadScoreDf.info( )

## Data Cleaning

### Finding the missing values - 

A 'Select' value is equivalent to a missing value, as it means the user did not choose an option for the given dropdown field.
Thus it is best to convert it to NaN and then handle it together with the other missing values.

In [None]:
leadScoreDf = leadScoreDf.replace('Select', np.nan)

In [None]:
# Checking for null values for all the columns
leadScoreDf.isnull( ).sum( axis = 0 )

In [None]:
# Looking at the percentages would give a better insight - 
round( 100*( leadScoreDf.isnull( ).sum( )/len( leadScoreDf.index ) ), 2 )

As the percentages of missing values are not that high (mostly around 30%), we can impute the missing values depending on the values of the column.
For example, if it is a categorical variable, we can impute the missing values with the mode.

However columns with more than 70% missing values should be dropped.

### Imputing the columns - 

Imputing the columns, based on the % of missing values starting with the highest % - 

In [None]:
# Dropping columns with more than 70% missing values.
# The columns are named through `columns=` rather than a positional axis:
# pandas deprecated the positional form of DataFrame.drop in 1.3 and removed it in 2.0.
missingPct = round( 100*( leadScoreDf.isnull( ).sum( )/len( leadScoreDf.index ) ), 2 )
leadScoreDf = leadScoreDf.drop( columns = missingPct[ missingPct > 70 ].index )

#### Lead Quality - 

In [None]:
leadScoreDf[ 'Lead Quality' ].describe( )

In [None]:
leadScoreDf[ 'Lead Quality' ].value_counts( )

Imputing the missing values for 'Lead Quality' with 'Not sure', since any value we pick here is a guess based on intuition rather than on the data.
We should not impute them with 'Might be', even though it has the highest count, as that could be quite misleading.

In [None]:
leadScoreDf[ 'Lead Quality' ] = leadScoreDf[ 'Lead Quality' ].replace( np.nan, 'Not sure' )

#### Asymmetrique -

In [None]:
# 2x2 grid: count plots for the two Index columns, box plots for the two Score columns.
# Each column is passed as `x=`: seaborn deprecated positional data vectors in 0.11
# and made them keyword-only in 0.12, so the positional form no longer plots the column.
fig, axs = plt.subplots( 2, 2, figsize = (10,8) )
plt1 = sns.countplot( x = leadScoreDf['Asymmetrique Activity Index'], ax = axs[0,0] )
plt2 = sns.boxplot( x = leadScoreDf['Asymmetrique Activity Score'], ax = axs[0,1] )
plt3 = sns.countplot( x = leadScoreDf['Asymmetrique Profile Index'], ax = axs[1,0] )
plt4 = sns.boxplot( x = leadScoreDf['Asymmetrique Profile Score'], ax = axs[1,1] )
plt.tight_layout()

In [None]:
leadScoreDf[ 'Asymmetrique Activity Index' ].describe( )

In [None]:
leadScoreDf[ 'Asymmetrique Activity Index' ].value_counts( )

In [None]:
leadScoreDf[ 'Asymmetrique Profile Index' ].value_counts( )

In [None]:
leadScoreDf[ 'Asymmetrique Activity Score' ].value_counts( )

In [None]:
leadScoreDf[ 'Asymmetrique Profile Score' ].value_counts( )

Based on the value counts and the plots, we can see that the values are varying.
It would be the best if we would just drop the column rather than imputing the missing values.


In [None]:
# Drop all four Asymmetrique columns instead of imputing them.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
asymmetriqueCols = [ 'Asymmetrique Activity Index', 'Asymmetrique Activity Score',
                     'Asymmetrique Profile Index', 'Asymmetrique Profile Score' ]
leadScoreDf = leadScoreDf.drop( columns = asymmetriqueCols )

#### City -

In [None]:
leadScoreDf[ 'City' ].value_counts( )

Based on the value counts, we can see that 'Mumbai' is the most common city.
We can impute the missing values with 'Mumbai'.


Alternatively, this column could be dropped, as X is an online education system and City and Country do not really matter. For now we keep it and impute.

In [None]:
leadScoreDf['City'] = leadScoreDf['City'].replace( np.nan, 'Mumbai' )

#### Tags -

In [None]:
leadScoreDf[ 'Tags' ].value_counts( )

'Will revert after reading the email' is the most commonly selected option.
We can impute the missing values with this option

In [None]:
leadScoreDf[ 'Tags' ] = leadScoreDf[ 'Tags' ].replace( np.nan, 'Will revert after reading the email' )

#### Specialization - 

In [None]:
leadScoreDf[ 'Specialization' ].value_counts( )

Even though, the most commonly used option is 'Finance', lead may have some specialization which is not mentioned in the options.
It would be better to impute the missing values with 'Others'

In [None]:
leadScoreDf[ 'Specialization' ] = leadScoreDf[ 'Specialization' ].replace( np.nan, 'Others' )

#### What is your current occupation - 

In [None]:
leadScoreDf[ 'What is your current occupation'].value_counts( )

'Unemployed' would be the best option for imputing the missing values, as it is the mode for this column.

In [None]:
leadScoreDf[ 'What is your current occupation'] = leadScoreDf[ 'What is your current occupation'].replace( np.nan, 'Unemployed' )

#### What matters most to you in choosing a course -

In [None]:
leadScoreDf[ 'What matters most to you in choosing a course' ].value_counts( )

As mode of the column is 'Better Career Prospects', we will impute the missing values with this option.

In [None]:
leadScoreDf['What matters most to you in choosing a course'] = leadScoreDf['What matters most to you in choosing a course'].replace( np.nan, 'Better Career Prospects' )

#### Country -

In [None]:
leadScoreDf[ 'Country' ].value_counts( )

Imputing the missing values with 'India'

In [None]:
leadScoreDf[ 'Country' ] = leadScoreDf[ 'Country' ].replace( np.nan, 'India' )

Rest of the missing values are below 2%, and thus those rows can be dropped

In [None]:
leadScoreDf.dropna( inplace = True )

In [None]:
# Checking the % of missing values after the imputation - 
round( 100*( leadScoreDf.isnull( ).sum( )/len( leadScoreDf.index ) ), 2 )

In [None]:
# We can drop the last five columns as they have only 1 unique value
leadScoreDf.drop(['Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'Magazine',  
       'I agree to pay the amount through cheque'], axis = 1, inplace = True)

In [None]:
# Dropping Prospect ID & Lead Number as they are only IDs
leadScoreDf.drop(['Prospect ID','Lead Number'], axis = 1, inplace = True)

In [None]:
for var in leadScoreDf.select_dtypes(exclude = ['int64', 'float64']).columns:
    print(leadScoreDf[var].value_counts(), '\n')

## Exploratory Data Analysis

Now that the data is cleaned, we can start exploring the data and try to find patterns.

#### Lead Origin-

We can analyze how many leads got converted and how the lead originated

In [None]:
sns.countplot(x = "Lead Origin", hue = "Converted", data = leadScoreDf )
xticks(rotation = 90)

Based on the graph, we can see that -
- most conversions have been when the lead is on 'Landing Page'.
- The least conversions are for lead Imports

To increase the conversion rate, 'Lead Add Form' and 'Lead Import' will have to be improved

#### Lead Source-

In [None]:
sns.countplot(x = "Lead Source", hue = "Converted", data = leadScoreDf )
xticks( rotation = 90 )

In [None]:
leadScoreDf['Lead Source'].value_counts( )

Most of the categories for 'Lead Source' have almost negligible entries, thus we can combine them into 1 category 'Others'
Also there are 2 variations for google i.e. 'Google' and 'google'. We should change this and only keep 1 category.

In [None]:
leadScoreDf['Lead Source'] = leadScoreDf[ 'Lead Source'].replace(['Click2call', 'Live Chat', 'NC_EDM', 'Pay per Click Ads', 'Press_Release',
  'Social Media', 'WeLearn', 'bing', 'blog', 'testone', 'welearnblog_Home', 'youtubechannel'], 'Others')

leadScoreDf['Lead Source']  = leadScoreDf[ 'Lead Source' ].replace( 'google', 'Google' )

In [None]:
sns.countplot(x = "Lead Source", hue = "Converted", data = leadScoreDf )
xticks( rotation = 90 )

Based on the updated plot, we can observe the following -
- Most of the conversions are for 'Google' and 'Direct Traffic'
- Others, Facebook and Reference have a very low conversion rate

#### Do not Email and Do Not Call - 

In [None]:
# Side-by-side conversion counts for the 'Do Not Email' and 'Do Not Call' flags.
fig, axs = plt.subplots(1,2, figsize = (10,8))
plt1 = sns.countplot(x = "Do Not Email", hue = "Converted", data = leadScoreDf, ax = axs[ 0 ] )
plt2 = sns.countplot(x = "Do Not Call", hue = "Converted", data = leadScoreDf, ax = axs[ 1 ] )

# pyplot's xticks( ) only acts on the current axes (the last subplot created), so the
# first panel was never rotated. Rotate the tick labels on every axes explicitly.
for ax in axs:
    ax.tick_params( axis = 'x', labelrotation = 90 )

- Leads who select 'No', i.e. who want the email communications, tend to have a higher conversion rate than leads who don't want the email communications.
- Leads who don't want calls are never converted.

#### Last Activity -

In [None]:
sns.countplot(x = "Last Activity", hue = "Converted", data = leadScoreDf )
xticks( rotation = 90 )

categories like - 
- 'Had a Phone Conversation', 
- 'Visited Booth in Tradeshow', 
- 'Approached upfront', 
- 'Resubscribed to emails',
- 'Email Marked Spam' 

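have very low/negligible counts.
It would be best to merge these categories (along with 'View in browser link Clicked' and 'Email Received', which the next cell also folds in) into a single category 'Other Activity'.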

In [None]:
leadScoreDf['Last Activity'] = leadScoreDf['Last Activity'].replace(['Had a Phone Conversation', 'View in browser link Clicked', 
                                                       'Visited Booth in Tradeshow', 'Approached upfront',
                                                       'Resubscribed to emails','Email Received', 'Email Marked Spam'], 'Other Activity')

In [None]:
sns.countplot(x = "Last Activity", hue = "Converted", data = leadScoreDf )
xticks( rotation = 90 )

Based on the updated plot, we can see that the leads who 'Opened the email' have the highest conversion rates, followed by Leads who 'Sent SMS'.
The least conversions have happened for Leads 'Other Activity' and leads who 'Unsubscribed'

#### Country - 

In [None]:
fig, axs = plt.subplots(figsize = (15,5))
sns.countplot(x = "Country", hue = "Converted", data = leadScoreDf )
xticks( rotation = 90 )

Most of the values for Country are 'India', followed by 'Russia'; the rest are all negligible.
As the company is an online education system, the Country doesn't really matter and the column can be dropped.

In [None]:
leadScoreDf.drop( 'Country', inplace = True, axis = 1 )

#### Specialization -

In [None]:
fig, axs = plt.subplots(figsize = (15,5))
sns.countplot(x = "Specialization", hue = "Converted", data = leadScoreDf )
xticks( rotation = 90 )

We can determine that most of the conversions are for the 'Others' category of specialization.

Focus should be put on improving the conversion rates of categories like 'Retail Management', 'Services Excellence' and 'E-Business'.

#### What is your current occupation and  What matters most to you in choosing a course - 

In [None]:
# Stacked horizontal count plots: current occupation on top, course motivation below.
fig, axs = plt.subplots(2,1, figsize = (15,10))
plt1 = sns.countplot(y = "What is your current occupation", hue = "Converted", data = leadScoreDf, ax = axs[ 0 ] )
plt2 = sns.countplot(y = "What matters most to you in choosing a course", hue = "Converted", data = leadScoreDf, ax = axs[ 1 ] )

# pyplot's xticks( ) only acts on the current axes (the last subplot created), so the
# top panel was never rotated. Rotate the x tick labels on every axes explicitly.
for ax in axs:
    ax.tick_params( axis = 'x', labelrotation = 90 )

In [None]:
leadScoreDf['What matters most to you in choosing a course'].value_counts( )

In 'What is your current occupation', most of the entries are 'Unemployed' and they have a high conversion rate.


'What matters most to you in choosing a course' has only 1 entry each for 'Flexibility & Convenience' and 'Other', thus providing no real information. It would be better to drop this column.

In [None]:
leadScoreDf.drop( 'What matters most to you in choosing a course', inplace = True, axis = 1 )

In [None]:
# 'Others' category is already present in Specialization, it would be better to rename this category to 'Other Occupation'
leadScoreDf['What is your current occupation'] = leadScoreDf['What is your current occupation'].replace( 'Others', 'Other Occupation' )

#### Search - 

In [None]:
sns.countplot( x = 'Search', hue = 'Converted', data = leadScoreDf )

In [None]:
leadScoreDf['Search'].value_counts( )

Most of the values are 'No', no inference can be drawn from this column.
It would be best to drop this column

In [None]:
leadScoreDf.drop( 'Search', axis = 1, inplace = True )

#### Newspaper Article and Newspaper-

In [None]:
# Conversion counts for the three advertising-channel flags, one panel each.
fig, axs = plt.subplots(1,3, figsize = (15,5))
plt1 = sns.countplot(x = "Newspaper Article", hue = "Converted", data = leadScoreDf, ax = axs[ 0 ] )
plt2 = sns.countplot(x = "Newspaper", hue = "Converted", data = leadScoreDf, ax = axs[ 1 ] )
# Stored as plt3 so that it no longer overwrites the handle of the second plot.
plt3 = sns.countplot( x = 'Digital Advertisement', hue = 'Converted', data = leadScoreDf, ax = axs[ 2 ] )

# pyplot's xticks( ) only acts on the current axes (the last subplot created), so the
# first two panels were never rotated. Rotate the tick labels on every axes explicitly.
for ax in axs:
    ax.tick_params( axis = 'x', labelrotation = 90 )

In [None]:
leadScoreDf['Newspaper Article'].value_counts( )

In [None]:
leadScoreDf['Newspaper'].value_counts( )

In [None]:
leadScoreDf['Digital Advertisement'].value_counts( )

'Digital Advertisement', 'Newspaper' and 'Newspaper Article' have maximum values for 'No'.

There are hardly 2-3 rows with 'Yes'.

No Inference can be drawn from these columns, also they would not help in the model. Thus we should drop these columns

In [None]:
leadScoreDf.drop(['Newspaper', 'Newspaper Article', 'Digital Advertisement' ], axis = 1, inplace = True )

#### X Education Forums - 

In [None]:
sns.countplot( x = 'X Education Forums', hue = 'Converted', data = leadScoreDf )
xticks( rotation = 90 )

In [None]:
leadScoreDf[ 'X Education Forums' ].value_counts( )

As there is only 1 value for 'Yes', this column would not provide any insights.

Hence we should drop the column

In [None]:
leadScoreDf.drop( 'X Education Forums', axis = 1, inplace = True )

#### Through Recommendations - 

In [None]:
sns.countplot( x = 'Through Recommendations', hue = 'Converted', data = leadScoreDf )

In [None]:
leadScoreDf['Through Recommendations'].value_counts( )

There are very few values for 'Through Recommendations', it would be for the best to just drop the column 

In [None]:
leadScoreDf.drop( 'Through Recommendations', axis = 1, inplace = True )

#### Tags-

In [None]:
plt.subplots(figsize = (15,5))
sns.countplot( x = 'Tags', hue = 'Converted', data = leadScoreDf )
xticks( rotation = 90 )

In [None]:
leadScoreDf['Tags'].value_counts( )

We can see that there are certain categories which have very few values.

These categories can be clubbed together into a single category 'Other Tags'

In [None]:
# Club the low-count tags into a single 'Other Tags' category.
# Every entry needs its own comma: adjacent string literals are silently concatenated,
# which previously fused 'University not recognized' with the next tag, so
# 'In confusion whether part time or DLP' was never replaced.
rareTags = [ 'in touch with EINS', 'Lost to Others', 'Still Thinking',
             'Want to take admission but has financial problems', 'Interested in Next batch',
             'Shall take in the next coming month', 'University not recognized',
             'In confusion whether part time or DLP', 'Lateral student',
             'Recognition issue (DEC approval)' ]
leadScoreDf['Tags'] = leadScoreDf['Tags'].replace( rareTags, 'Other Tags' )

In [None]:
plt.subplots(figsize = (15,5))
sns.countplot( x = 'Tags', hue = 'Converted', data = leadScoreDf )
xticks( rotation = 90 )

Based on the new plot we can say that -
- leads with tags 'Will revert after reading the email' have the highest conversion rate. 
- tag 'Will revert after reading the email' also has a high non conversion rate.
- leads with tags 'Closed by Horizon' and 'Lost to EINS' do not get converted.
- tags 'Already a student' are usually converted

#### Lead Quality - 

In [None]:
sns.countplot( x = 'Lead Quality', hue = 'Converted', data = leadScoreDf )
xticks( rotation = 90 )

We can see that there are 2 variations of 'Not Sure' ('Not sure' and 'Not Sure'); we can combine them.
On the basis of the plot, we can infer that -
- Not Sure has a high conversion rate
- High in Relevance, Might be and Low in Relevance have a low conversion rate

In [None]:
leadScoreDf['Lead Quality'] = leadScoreDf['Lead Quality'].replace('Not sure', 'Not Sure')

#### City - 

In [None]:
plt.subplots(figsize = (15,5))
sns.countplot( x = 'City', hue = 'Converted', data = leadScoreDf )
xticks( rotation = 90 )

- Mumbai has highest lead conversion rate
- There are very few leads from Tier II cities


In [None]:
leadScoreDf.shape

Now that the data is cleaned and analysis has been done, we can move onto Preparing the data for model building

## Data Preparation -

In [None]:
leadScoreDf.head( )

### Converting Binary variables

We need to convert the 'Yes' and 'No' values into 1 and 0

In [None]:
# columns to be mapped -
colList =  ['Do Not Email', 'Do Not Call', 'A free copy of Mastering The Interview']

# Helper that recodes a Yes/No column as 1/0
def binary_map(x):
    """Return ``x.map`` applied with the lookup 'Yes' -> 1, 'No' -> 0."""
    yes_no_codes = {"No": 0, 'Yes': 1}
    encoded = x.map(yes_no_codes)
    return encoded

# Applying the function to the columns
leadScoreDf[colList] = leadScoreDf[colList].apply(binary_map)

### Creating Dummy Variables - 

The categorical variables with multiple levels, need to be converted to dummy variables.
For e.g. 'Lead Origin' can be broken down into 'Landing Page Submission', 'Lead Add form' and 'Lead Import'.
- 001 would mean its a 'Lead Import' 
- 100 would mean its a 'Landing Page Submission'
- 000 would mean its an 'API' Lead Origin.

In [None]:
dummyCols = [ 'Lead Origin', 'Lead Source', 'Last Activity', 'Specialization','What is your current occupation',
                              'Tags','Lead Quality','City','Last Notable Activity' ]

In [None]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
# dtype = int keeps the indicator columns numeric: pandas >= 2.0 returns bool columns by
# default, which can make statsmodels reject the design matrix when it is fitted later.
dummyDf = pd.get_dummies( leadScoreDf [ dummyCols ], drop_first=True, dtype = int )
dummyDf.head()

In [None]:
# Joining the dummy dataframe and the leadScore dataframe -

leadScoreDf = pd.concat( [ leadScoreDf, dummyDf ], axis=1 )
leadScoreDf.head()

Original Categorical Columns are not required as they have been converted to the dummy variables.
Thus we can drop them.

In [None]:
leadScoreDf.drop( dummyCols, axis = 1, inplace = True )

In [None]:
leadScoreDf.head()

## Splitting the data into Train and Test Data set - 

In [None]:
# Putting feature variable to X
X = leadScoreDf.drop( 'Converted', axis=1 )
y = leadScoreDf[ 'Converted' ]

In [None]:
y.head( )

In [None]:
X.head( )

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.7, test_size=0.3, random_state=100 )

## Feature Scaling - 

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the three numeric activity columns; the scaler is fitted on the training rows here.
scaler = StandardScaler()
numericCols = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
scaledTrain = scaler.fit_transform( X_train[ numericCols ] )
X_train[ numericCols ] = scaledTrain

X_train.head( )

#### Conversion Rate - 

In [None]:
# Sum of the 'Converted' column divided by the number of rows, expressed as a percentage
convertedTotal = leadScoreDf[ 'Converted' ].sum( )
rowCount = len( leadScoreDf[ 'Converted' ] )
Converted = ( convertedTotal / rowCount ) * 100
Converted

The current conversion rate is around 38%.

## Model Building - 

In [None]:
# Logistic regression model
logm1 = sm.GLM( y_train,( sm.add_constant( X_train ) ), family = sm.families.Binomial( ) )
logm1.fit( ).summary() 

### Feature Selection using RFE - 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

logreg = LogisticRegression()

# Running RFE with 15 variables as output.
# n_features_to_select is passed by keyword: scikit-learn deprecated the positional
# form in 0.23 and made the argument keyword-only in 1.0.
rfe = RFE( logreg, n_features_to_select = 15 )
rfe = rfe.fit( X_train, y_train )

In [None]:
rfe.support_

In [None]:
rfeCols = X_train.columns[rfe.support_]
rfeCols

In [None]:
X_train_sm = sm.add_constant( X_train[ rfeCols ] )
logm2 = sm.GLM( y_train,X_train_sm, family = sm.families.Binomial( ) )
res = logm2.fit( )
res.summary( )

In [None]:
# Remove the two tag dummies from the RFE selection before refitting.
# Index.drop( ) takes no axis argument - the stray positional 1 was being bound to `errors`.
rfeCol1 = rfeCols.drop( [ 'Tags_invalid number', 'Tags_number not provided' ] )

In [None]:
# Rebuilding the model - 
X_train_sm = sm.add_constant( X_train[ rfeCol1 ] )
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

#### Predicting values on the training data set - 

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred[ : 5 ]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[ : 5 ]

#### DataFrame with Converted Values and the Converted Probability - 

In [None]:
y_train_pred_final = pd.DataFrame( { 'Converted':y_train.values, 'Converted_prob':y_train_pred } )
y_train_pred_final['Prospect ID'] = y_train.index
y_train_pred_final.head()

#### Predicted value is set to 1 if the probability is greater than 0.5 - 

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Converted_prob.map(lambda x: 1 if x > 0.5 else 0)

y_train_pred_final.head()

#### Getting the Confusion matrix, Accuracy - 

In [None]:
from sklearn import metrics

# Confusion matrix -
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Overall accuracy -
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

#### Checking VIFs - 

In [None]:
# Feature variables and their respective VIFs.
# The VIFs must be computed on exactly the columns they are labelled with (rfeCol1, the
# features of the final model). Using X_train[ rfeCols ] mixed in the two dropped columns
# and shifted the positional index i away from the feature names.
vifFeatures = X_train[ rfeCol1 ]
vif = pd.DataFrame()
vif[ 'Features' ] = vifFeatures.columns
vif[ 'VIF' ] = [ variance_inflation_factor( vifFeatures.values, i ) for i in range( vifFeatures.shape[1] ) ]
vif[ 'VIF' ] = round( vif[ 'VIF' ], 2 )
vif = vif.sort_values( by = "VIF", ascending = False )
vif

As the VIF for all the features is below 5, we don't need to remove any feature and build the model again.

### Metrics beyond Accuracy - 

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

#### Sensitivity - 

In [None]:
TP / float(TP+FN)

#### Specificity - 

In [None]:
TN / float(TN+FP)

#### Positive Predictive Value - 

In [None]:
print (TP / float(TP+FP))

#### Negative Predictive Value - 

In [None]:
print (TN / float(TN+ FN))

## Plotting the ROC Curve - 

In [None]:
def draw_roc( actual, probs ):
    """Plot and show the ROC curve of `probs` against `actual`.

    The curve comes from metrics.roc_curve (all thresholds kept) and the legend
    reports the area computed by metrics.roc_auc_score. Returns None.
    """
    auc_score = metrics.roc_auc_score( actual, probs )
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve( actual, probs, drop_intermediate = False )

    plt.figure( figsize = (5, 5) )
    plt.plot( false_pos_rate, true_pos_rate, label = 'ROC curve (area = %0.2f)' % auc_score )
    # dashed diagonal from (0, 0) to (1, 1) as a visual reference
    plt.plot( [0, 1], [0, 1], 'k--' )
    plt.xlim( [0.0, 1.0] )
    plt.ylim( [0.0, 1.05] )
    plt.xlabel( 'False Positive Rate or [1 - True Negative Rate]' )
    plt.ylabel( 'True Positive Rate' )
    plt.title( 'Receiver operating characteristic example' )
    plt.legend( loc = "lower right" )
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Converted_prob, drop_intermediate = False )

In [None]:
draw_roc( y_train_pred_final.Converted, y_train_pred_final.Converted_prob )

Based on the ROC curve, we can see that the curve lies close to the top-left corner, indicating that the model separates converted and non-converted leads well.

## Finding the Cut-off Point - 

Optimal cut-off point would be the one where the specificity and sensitivity are balanced.

In [None]:
# Add one 0/1 column per probability cutoff 0.0, 0.1, ..., 0.9; the column label is the cutoff itself
numbers = [ step / 10 for step in range( 10 ) ]
for cutoff in numbers:
    y_train_pred_final[ cutoff ] = y_train_pred_final.Converted_prob.map(
        lambda prob, limit = cutoff: int( prob > limit ) )
y_train_pred_final.head()

In [None]:
from sklearn.metrics import confusion_matrix

# Build a table with accuracy, sensitivity and specificity for every probability cutoff.
# Confusion-matrix layout used below: [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'] )

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for cutoff in num:
    cm1 = metrics.confusion_matrix( y_train_pred_final.Converted, y_train_pred_final[ cutoff ] )
    trueNeg, falsePos = cm1[0,0], cm1[0,1]
    falseNeg, truePos = cm1[1,0], cm1[1,1]

    total1 = cm1.sum()
    accuracy = ( trueNeg + truePos ) / total1
    speci = trueNeg / ( trueNeg + falsePos )
    sensi = truePos / ( falseNeg + truePos )
    cutoff_df.loc[ cutoff ] = [ cutoff, accuracy, sensi, speci ]
print(cutoff_df)

In [None]:
# Plotting accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

On the basis of the plot, we can see that the point at which all 3 values, i.e. accuracy, sensitivity and specificity, are balanced is at a probability of about 0.2.
Thus we will take 0.2 as the cut-off probability.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Converted_prob.map( lambda x: 1 if x > 0.2 else 0)

y_train_pred_final.head()

## Assigning the Lead Score - 

In [None]:
y_train_pred_final[ 'Lead_Score' ] = y_train_pred_final.Converted_prob.map( lambda x: round(x*100))

y_train_pred_final.head()

### Checking the overall Accuracy - 

In [None]:

# Accuracy and confusion matrix on the training data at the chosen 0.2 cutoff.
# Both are printed explicitly: a notebook cell only echoes its last expression, and
# this cell ends with assignments, so nothing was being shown before.
print( metrics.accuracy_score( y_train_pred_final.Converted, y_train_pred_final.final_predicted ) )

confusion2 = metrics.confusion_matrix( y_train_pred_final.Converted, y_train_pred_final.final_predicted )
print( confusion2 )

TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

####  Sensitivity -

In [None]:
TP / float(TP+FN)

#### Specificity - 

In [None]:
TN / float(TN+FP)

### Precision and Recall - 

#### Precision - 

In [None]:

# Precision = TP / (TP + FP). The parentheses are required: `TP / TP + FP` evaluates
# to 1 + FP. Printed so that the value is visible even though it is not the last
# expression of the cell. TP and FP hold whatever the most recently run cell assigned.
print( TP / ( TP + FP ) )

# Same ratio taken directly from `confusion`, the matrix built from the 0.5-cutoff predictions
confusion[1,1]/(confusion[0,1]+confusion[1,1])

In [None]:
precision_score(y_train_pred_final.Converted , y_train_pred_final.predicted)

#### Recall - 

In [None]:
# Recall = TP / (TP + FN). The parentheses are required: `TP / TP + FN` evaluates
# to 1 + FN. Printed so that the value is visible even though it is not the last
# expression of the cell. TP and FN hold whatever the most recently run cell assigned.
print( TP / ( TP + FN ) )

# Same ratio taken directly from `confusion`, the matrix built from the 0.5-cutoff predictions
confusion[1,1]/(confusion[1,0]+confusion[1,1])

In [None]:
recall_score(y_train_pred_final.Converted, y_train_pred_final.predicted)

## Predictions on the Test Data Set - 

In [None]:
# Reuse the scaler that was fitted on the training data. Calling fit_transform here
# would re-estimate the mean and variance from the test rows, so the model would be
# scored on features scaled differently from the ones it was trained on.
X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.transform(X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])

# Preview the frame that was just transformed (the cell previously showed X_train)
X_test.head()

In [None]:
X_test = X_test[ rfeCol1 ]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

#### Making predictions - 

In [None]:
y_test_pred = res.predict( X_test_sm )

In [None]:
# Wrap the test-set predictions (y_test_pred) and the true test labels (y_test) in
# DataFrames so that they can be combined column-wise with pd.concat in a later cell
y_pred_1 = pd.DataFrame(y_test_pred)
y_test_df = pd.DataFrame(y_test)

In [None]:
y_test_df['Prospect ID'] = y_test_df.index

In [None]:
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Converted_prob'})

y_pred_final['final_predicted'] = y_pred_final.Converted_prob.map(lambda x: 1 if x > 0.2 else 0)

In [None]:
y_pred_final.head()

#### Checking the overall accuracy of the test data - 

In [None]:
metrics.accuracy_score(y_pred_final.Converted, y_pred_final.final_predicted)

In [None]:
confusion3 = metrics.confusion_matrix(y_pred_final.Converted, y_pred_final.final_predicted )
confusion3

In [None]:
# Take the counts from confusion3, the matrix built from the test-set predictions.
# Reading confusion2 here repeated the training-set figures, so the sensitivity and
# specificity cells that follow reported training results as test results.
TP = confusion3[1,1] # true positive 
TN = confusion3[0,0] # true negatives
FP = confusion3[0,1] # false positives
FN = confusion3[1,0] # false negatives

#### Sensitivity -

In [None]:
TP / float(TP+FN)

#### Specificity - 

In [None]:
TN / float(TN+FP)

In [None]:
res.params