In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import xticks
import seaborn as sns


# Data display coustomization
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 100)

In [None]:
leads_data_dict = pd.read_excel('/kaggle/input/leads-dataset/Leads Data Dictionary.xlsx', skiprows=2)
leads_data_dict.drop(leads_data_dict.columns[0], axis=1,inplace=True)
leads_data_dict

In [None]:
leadscore = pd.read_csv('/kaggle/input/leads-dataset/Leads.csv')

In [None]:
leadscore.head()

###### Many rows in the categorical columns contain the value `Select`. `Select` means that the user did not make any selection, so we can replace it with NaN.

In [None]:
leadscore = leadscore.replace('Select', np.nan)

###### Before moving ahead  with EDA, we will define some helper functions which will be used frequently in the rest of the analysis.

In [None]:
def findNullValuesPercentage(dataframe):
    """Summarise the missing data of every column in ``dataframe``.

    Returns a DataFrame indexed by column name with two columns:
    'Total Null Values' (count of nulls) and 'Percentage of Null Values'
    (fraction of rows that are null, rounded to two decimals). Both series
    are sorted in descending order before being combined.
    """
    null_flags = dataframe.isnull()
    summary_parts = {
        'Total Null Values': null_flags.sum().sort_values(ascending=False),
        'Percentage of Null Values': null_flags.mean().sort_values(ascending=False).round(2),
    }
    return pd.concat(summary_parts, axis=1)

In [None]:
### this function will create BarPlot for our visualization.

def createCountPlot(keyVariable, plotSize):
    """Draw a count plot of ``leadscore[keyVariable]`` on a new figure.

    Nulls are shown as their own 'Missing Values' bar. Every bar is annotated
    with its share of all rows, expressed in percent with two decimals.

    keyVariable: name of the column of the global ``leadscore`` frame to plot.
    plotSize: (width, height) passed to ``plt.subplots`` as ``figsize``.
    """
    plt.subplots(figsize=plotSize)
    plt.xticks(rotation=90)
    # Work on a copy so the placeholder label never leaks into leadscore.
    plot_data = leadscore.copy()
    plot_data[keyVariable] = plot_data[keyVariable].fillna('Missing Values')
    axis = sns.countplot(x=keyVariable, data=plot_data)
    total_rows = len(plot_data)
    for bar in axis.patches:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        share = bar_height / total_rows * 100
        axis.text(label_x, bar_height + 3, '{:1.2f}'.format(share), ha="center")

In [None]:

#This function will just drop the list of features(inplace) provided to it and print them.
def dropTheseFeatures(features):
    """Drop each column named in ``features`` from the global ``leadscore`` in place.

    Prints the frame's shape before and after, and one line per column removed.
    """
    separator = '*****------------------------------------------*****'
    print('Dataset shape before dropping the features {}'.format(leadscore.shape))
    print(separator)
    for column_name in features:
        print('Removing the column {}'.format(column_name))
        leadscore.drop(columns=column_name, inplace=True)
    print(separator)
    print('Dataset shape after dropping the features {}'.format(leadscore.shape))

In [None]:
#This function will genrate a table which is populated with feature name and the %age of count of unique values in it.
def genarateUniqueValuePercentagePlot(features):
    """Draw one count plot per feature on a four-column grid of subplots.

    Every bar is annotated with the percentage of ``leadscore`` rows it holds.

    features: list of column names of the global ``leadscore`` frame.
    """
    n_cols = 4
    n_rows = len(features) // n_cols + 1
    figure = plt.figure(figsize=(16, n_rows * 5))
    total_rows = len(leadscore)
    for position, feature in enumerate(features, start=1):
        figure.add_subplot(n_rows, n_cols, position)
        axis = sns.countplot(x=leadscore[feature], data=leadscore)
        for bar in axis.patches:
            bar_height = bar.get_height()
            label_x = bar.get_x() + bar.get_width() / 2.
            label = '{:1.2f}%'.format(bar_height / total_rows * 100)
            axis.text(label_x, bar_height + 3, label, ha="center")

In [None]:
### Function to generate heatmaps
def generateHeatmaps(df, figsize):
    """Plot an annotated heatmap of the pairwise correlations in ``df``.

    df: DataFrame; only its numeric and boolean columns are correlated.
    figsize: (width, height) of the figure.
    """
    plt.figure(figsize = figsize)        # Size of the figure
    # Correlate numeric/bool columns only: pandas >= 2.0 no longer drops
    # string columns silently in DataFrame.corr() and raises instead.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    sns.heatmap(numeric_df.corr(), annot = True, annot_kws={"fontsize":7})


In [None]:
#As the name suggests this function will plot AUC-ROC curve.
def plot_roc( actual, probs ):
    """Plot the ROC curve for ``probs`` against ``actual`` and show its AUC in the legend.

    actual: true binary labels.
    probs: scores for the positive class; pass predicted probabilities,
        not thresholded 0/1 predictions.
    Returns None; the figure is displayed with ``plt.show()``.
    """
    # Imported here so the helper does not depend on a later cell having
    # already run `from sklearn import metrics`.
    from sklearn import metrics

    fpr, tpr, _ = metrics.roc_curve( actual, probs)
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')   # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return None

In [None]:
def getRegressionMetrics(actual,predicted):
    """Print the confusion matrix and classification metrics for binary predictions.

    actual: true 0/1 labels.
    predicted: predicted 0/1 labels.
    Prints sensitivity, specificity, recall, precision, accuracy and the
    weighted F1-score, each rounded to two decimals. Returns None.
    """
    # `metrics` is imported here as well so the helper does not depend on a
    # later cell having already run `from sklearn import metrics`.
    from sklearn import metrics
    from sklearn.metrics import precision_score, recall_score
    m={}
    confusion = metrics.confusion_matrix(actual, predicted )
    TP = confusion[1,1] # true positive 
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    m['sensitivity']=TP / float(TP+FN)
    m['specificity']=TN / float(TN+FP)
    m['recall']=recall_score(actual, predicted)
    m['precision']=precision_score(actual, predicted)
    m['accuracy']=metrics.accuracy_score(actual, predicted)
    m['F1-score']=metrics.f1_score(actual, predicted, average='weighted')
    
    print(confusion)
    for metric in m:
        print(metric + ': ' + str(round(m[metric],2)))

### 1.  Performing EDA on the Lead Score Dataset

###### We will perform some exploratory data analysis and understand the data better by following the below steps:

- Checking the shape, columns, datatypes etc. of the dataset
- Assessing out of place values
- Checking for duplicate values
- Checking for null values
- Dropping unnecessary columns

In [None]:
leadscore.shape

In [None]:
leadscore.info()

###### We will check the 'Prospect ID' & for 'Lead Number' columns for each datapoint to check for any duplicates.

In [None]:
#Check if any duplicated value is present in the ID and Lead Number columns
# Each identifier column prints True when it holds no repeated value.
for id_column in ('Prospect ID', 'Lead Number'):
    print(not leadscore.duplicated(id_column).any())

###### The Prospect ID and Lead Number are all unique values and not required for the model building.
> We will drop one of them, since Lead Number seems be simply an index number so we will keep it and remove
the prospectID

In [None]:
dropTheseFeatures(['Prospect ID'])

###### A feature with only one unique value is not useful for model building because it has zero variance. So we will drop all such columns.

In [None]:
numUniquesInFeatures = leadscore.nunique().sort_values()
numUniquesInFeatures

In [None]:
# Features whose number of distinct values is exactly one, in the same
# (ascending nunique) order as numUniquesInFeatures.
dropFeaturesWithSingleVal = numUniquesInFeatures[numUniquesInFeatures == 1].index.tolist()
dropFeaturesWithSingleVal

In [None]:
dropTheseFeatures(dropFeaturesWithSingleVal)

###### Now we check the features with 2 unique values
- A free copy of Mastering The Interview              
- Newspaper Article                                   
- Search                                              
- Through Recommendations                             
- X Education Forums                                  
- Converted (Target Variable)                                     
- Do Not Call                                         
- Do Not Email                                        
- Newspaper                                           
- Digital Advertisement                               

In [None]:
genarateUniqueValuePercentagePlot(['A free copy of Mastering The Interview', 'Newspaper Article', 'Search','Through Recommendations',
             'X Education Forums', 'Converted', 'Do Not Call', 'Do Not Email', 'Newspaper', 'Digital Advertisement'])

###### From the above plots, we can see that, except for `A free copy of Mastering The Interview` and `Converted` (the target variable), all the other features are highly skewed. These variables will not help in model building, so we will drop them.

In [None]:
dropHighySkewedFeatures = ['Newspaper Article', 'Search','Through Recommendations',
             'X Education Forums', 'Do Not Call', 'Do Not Email', 'Newspaper', 'Digital Advertisement']
dropTheseFeatures(dropHighySkewedFeatures)

######  Now check the NULL values in remaining features.

In [None]:
findNullValuesPercentage(leadscore)

###### We can see from the above table that the `Lead Profile` and `How did you hear about X Education` columns have 74% or more missing values. Assuming a cut-off of 45%, the `Asymmetrique` score and index features are also on the higher side. Imputing these features is not a good idea, because it can bias the model. So we decide to drop these columns. The drop list below also includes `Lead Quality`.

In [None]:
dropHighMissingValuesFeatues = ['How did you hear about X Education', 'Lead Profile', 'Lead Quality', 
                                'Asymmetrique Profile Score','Asymmetrique Activity Score','Asymmetrique Profile Index',
                                'Asymmetrique Activity Index']
dropTheseFeatures(dropHighMissingValuesFeatues)

In [None]:
findNullValuesPercentage(leadscore)

###### Handling the `Score Variables`. These variables are the ones which were not present with the sales team before making the call. After making the calls the sales team assigned their values to each lead after their discussion. So, in the real scenario these will not be availabe at the time of model building. So we will drop these columns. The score variables in this datasets are:
- Tags
- Lead Quality
- Lead Profile
- Asymmetrique Activity Index
- Asymmetrique Profile Index
- Asymmetrique Activity Score
- Asymmetrique Profile Score 

###### From the above list, we have already dropped all the features in earlier steps except the `Tags` variable.

In [None]:
dropTheseFeatures(['Tags'])

In [None]:
findNullValuesPercentage(leadscore)

###### Next we will deal with the remaining columns with high NULL values. These are:

###### `City`
###### `Specialization`
###### `What matters most to you in choosing a course` 
###### `What is your current occupation`
###### `Country`

In [None]:
createCountPlot('City', (10,5))

###### Here we will impute the Missing Values with mode , ie, `Mumbai` in this case.

In [None]:
# Assign the filled column back: fillna(inplace=True) on leadscore['City']
# acts on an intermediate object and is not guaranteed to update leadscore
# (it is a no-op under pandas copy-on-write).
leadscore['City'] = leadscore['City'].fillna('Mumbai')

createCountPlot('City', (10,5))

###### So we can see from the above plot that most of the leads are coming from Mumbai city, how ever the conversion rate is considerably low.

###### Treating Specialization 

In [None]:
createCountPlot('Specialization', (15,5))


###### Here we observe that the specializations related to `Finance, HR and Marketing Management` have most of the leads, and the remaining categories share a comparably small percentage.
###### So we will take the following steps for the imputations.

- impute the `Missing Values` with `Finance, HR and  Marketing Management`, each of them equally.
- `Business Administration (4.36%)` and `Operations Management (5.44%)` share a considerable share. So we will keep them as it is.
- Combine the remaining Management specializations into a single category, `Other Management`
- The specializations with less than 4% share, into a single category as `Other Specializations`

###### This will  help us in reducing the complexity of the model later, by keeping the count of dummy variables low.

In [None]:
leadscore['Specialization'].value_counts(normalize=True, dropna=False).head()

In [None]:

# Impute the missing Specialization values by row position: the first 1000
# rows get 'Human Resource Management', positions 1000-1999 get
# 'Marketing Management' and every later row gets 'Finance Management'.
# The fill values are built as one index-aligned Series and the result is
# assigned back, because fillna(inplace=True) on an .iloc slice of a column
# works on an intermediate object and is not guaranteed to update leadscore.
# The middle slice starts at 1000 (it was 1001) so position 1000 is covered.
specialization_fill = pd.Series('Finance Management', index=leadscore.index)
specialization_fill.iloc[:1000] = 'Human Resource Management'
specialization_fill.iloc[1000:2000] = 'Marketing Management'
leadscore['Specialization'] = leadscore['Specialization'].fillna(specialization_fill)

In [None]:
leadscore['Specialization'].unique()

In [None]:
# Merge the smaller management specializations into 'Other Management'.
# The result is assigned back because replace(inplace=True) on an attribute
# selection is not guaranteed to update leadscore.
# NOTE(review): 'Marketing Management' is in this list although the text above
# names it as one of the three dominant specializations - confirm it is intended.
leadscore['Specialization'] = leadscore['Specialization'].replace(to_replace=['Supply Chain Management',
       'IT Projects Management', 
       'Marketing Management',
       'Retail Management',
       'Hospitality Management',
       'Healthcare Management'], value='Other Management')

In [None]:
# Merge the low-share, non-management specializations into 'Others'.
# The result is assigned back because replace(inplace=True) on an attribute
# selection is not guaranteed to update leadscore.
leadscore['Specialization'] = leadscore['Specialization'].replace(to_replace=[
       'Media and Advertising',
       'Travel and Tourism', 
       'Banking, Investment And Insurance', 'International Business',
       'E-COMMERCE',
       'Services Excellence',
       'Rural and Agribusiness',
       'E-Business'], value='Others')

In [None]:
leadscore['Specialization'].value_counts(normalize=True, dropna=False)

In [None]:
createCountPlot('Specialization', (10,5))

In [None]:
createCountPlot('What matters most to you in choosing a course', (15,4))

###### For the above feature the data also seems to be skewed, so we assume that almost all the leads want to join the academy for better career prospects. So we will remove this feature as well, later.

In [None]:
createCountPlot('What is your current occupation', (10,7.5))

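###### For the current occupation column we will label the missing values as `Not Specified` (instead of imputing the mode, `Unemployed`) and merge the small `Student`, `Housewife` and `Businessman` categories into `Other`.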

In [None]:
# Label missing occupations explicitly. The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to update leadscore.
leadscore['What is your current occupation'] = leadscore['What is your current occupation'].fillna('Not Specified')

# Merge the small categories into a single 'Other' bucket.
leadscore['What is your current occupation'] = leadscore['What is your current occupation'].replace(['Student', 'Housewife','Businessman'], 'Other')

leadscore['What is your current occupation'].value_counts()

In [None]:
createCountPlot('What is your current occupation', (10,7.5))

###### We see that the lead conversion rate is high in case of Working Profesionals. Also most of the leads are `Unemployed`. Other categories in occupation are negligible.

###### Now we take a look a Country column

In [None]:
fig, axs = plt.subplots(figsize = (20,4))
plt.xticks(rotation = 90)
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Country', data=leadscore)

###### The country plot also seems to be skewed. After imputing the Missing Values with mode, `India`, it will become 97%. So we will drop this as well. 

##### Dropping `What matters most to you in choosing a course` and `Country` in this step.

In [None]:
dropTheseFeatures(['What matters most to you in choosing a course','Country'])

In [None]:
findNullValuesPercentage(leadscore)


###### Now we have only few NULL values left. So we will drop those rows with NULLs

In [None]:
leadscore.dropna(inplace=True)

In [None]:
findNullValuesPercentage(leadscore)

In [None]:
leadscore.shape

###### Finally we get the all cleaned dataset with 9074 rows and 12 features.

### We will perform some univariate and bivariate analysis 

###### Starting with `Last Notable Activity` 

In [None]:
leads_data_dict[(leads_data_dict['Variables']=='Last Activity') | (leads_data_dict['Variables']=='Last Notable Activity')]

###### From the data dictionary  we see that `Last Notable Activity` is defined very similar to `Last Activity`

In [None]:
fig, axs = plt.subplots(figsize = (12,4))
plt.xticks(rotation = 90)
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Last Notable Activity', data=leadscore)

#####  The `Last Notable Activity` is  last  activity performed by the student. It is not relevant for our modelling purpose. So we will drop it here.

In [None]:
dropTheseFeatures(['Last Notable Activity'])

###### We will perform the some analysis and cleaning operation for `Last Activity`

In [None]:
round(leadscore['Last Activity'].value_counts(normalize=True, ascending=False), 2)

In [None]:
leadscore['Last Activity'] = leadscore['Last Activity'].replace([           
                                                                'Form Submitted on Website',       
                                                                'Unreachable',                     
                                                                'Unsubscribed',                    
                                                                'Had a Phone Conversation',        
                                                                'View in browser link Clicked',    
                                                                'Approached upfront',              
                                                                'Email Received',                  
                                                                'Email Marked Spam',               
                                                                'Resubscribed to emails',          
                                                                'Visited Booth in Tradeshow'], 'Miscellaneous')

In [None]:

createCountPlot('Last Activity', (7, 5))


###### A free copy of Mastering The Interview

In [None]:
leadscore['A free copy of Mastering The Interview'].value_counts()

###### For this column we will just replace Yes:1, No:0

In [None]:
# Encode Yes/No as 1/0. The result is assigned back because
# replace(inplace=True) on a column selection is not guaranteed to update
# leadscore. The explicit cast keeps the column int64 on pandas versions that
# no longer downcast object columns after replace, and fails loudly if a
# value other than Yes/No is present.
leadscore['A free copy of Mastering The Interview'] = (
    leadscore['A free copy of Mastering The Interview']
    .replace({'Yes':1, 'No':0})
    .astype('int64')
)
leadscore['A free copy of Mastering The Interview'].value_counts()

###### Analyzing City columns

In [None]:
createCountPlot('City', (7, 5))

In [None]:
leadscore['City'] = leadscore['City'].replace(['Thane & Outskirts', 'Other Metro Cities', 'Other Cities',
       'Other Cities of Maharashtra', 'Tier II Cities'], 'Not Mumbai Cities')

In [None]:
leadscore['City'].value_counts()

In [None]:
createCountPlot('City', (7, 5))

In [None]:
leadscore['What is your current occupation'].value_counts()

In [None]:
createCountPlot('What is your current occupation', (10, 5))

In [None]:
# Name the columns via x=/hue= with data=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series no longer becomes x.
sns.countplot(x='What is your current occupation', hue='Converted', data=leadscore)

plt.show()


###### `Working Professional` can be an important feature, since their conversion rate is very high.
###### However the datasets contains mostly Unemployed leads

In [None]:
fig, axs = plt.subplots(figsize = (15, 7.5))
# Name the columns via x=/hue= with data=: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series no longer becomes x.
sns.countplot(x='Specialization', hue='Converted', data=leadscore, ax=axs)
axs.set_xticklabels(axs.get_xticklabels(),rotation=90)
plt.show()


###### We see that leads who have specialized in `Management`, especially in `Finance Management`, account for the highest number of leads as well as converted leads. This variable is highly significant and will help in model building.

In [None]:
findNullValuesPercentage(leadscore)

###### Analyzing the trend of `Page Views Per Visit` using a boxplot

In [None]:
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x='Page Views Per Visit', data=leadscore)

###### We see that there are many outliers in the higher side of the data. 
###### We will handle these outliers by capping them, using soft-range capping.

In [None]:
q1 = leadscore['Page Views Per Visit'].quantile(0.05) #---- lower cap: 5th percentile
q4 = leadscore['Page Views Per Visit'].quantile(0.95) #----- upper cap: 95th percentile

# Cap both tails in one step. clip() replaces the chained-indexing assignments
# (leadscore[col][mask] = value), which are not guaranteed to write back into
# leadscore; values below q1 become q1 and values above q4 become q4.
leadscore['Page Views Per Visit'] = leadscore['Page Views Per Visit'].clip(lower=q1, upper=q4)

In [None]:
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x='Page Views Per Visit', data=leadscore)

In [None]:
sns.boxplot(x=leadscore.Converted,y=leadscore['Page Views Per Visit'])
plt.show()


###### From above plots we see that Median for converted and not converted leads are almost same. So We cannot say anything about the lead conversion based on Page Views 

###### Analyzing Total Time Spent on Website

In [None]:
leadscore['Total Time Spent on Website'].describe()

   ###### The above summary shows that the total time is given in minutes. We will convert these to Hours first

In [None]:
sns.distplot(leadscore['Total Time Spent on Website'])

In [None]:
leadscore['Total Time Spent on Website'] = leadscore['Total Time Spent on Website'].apply(lambda x: round((x/60), 2))
sns.distplot(leadscore['Total Time Spent on Website'], )

In [None]:
sns.boxplot(x=leadscore.Converted, y=leadscore['Total Time Spent on Website'])
plt.show()

###### The above plot tells that  Leads spending more time on the website are more likely to be converted.
###### Therefore, we can suggest to the company to make the website more reliable and attractable for the leads, so that they spend more time on the website.

In [None]:
sns.boxplot(y = 'TotalVisits', x = 'Converted', data = leadscore)
plt.show()

In [None]:
# Trim (drop rows, not cap) the TotalVisits outliers.
# Q3 is the 95th percentile of the current data; keep rows at or below it.
Q3 = leadscore.TotalVisits.quantile(0.95)
leadscore = leadscore[(leadscore.TotalVisits <= Q3)]
# Q1 is the 5th percentile of the already-trimmed data; keep rows at or above it.
Q1 = leadscore.TotalVisits.quantile(0.05)
leadscore = leadscore[(leadscore.TotalVisits >= Q1)]
sns.boxplot(y=leadscore['TotalVisits'])
plt.show()

In [None]:
sns.boxplot(y='TotalVisits', x='Converted', data=leadscore)

###### From above plots we see that Median for converted and not converted leads are almost same. So We cannot say anything about the lead conversion based on TotalVisits
###### Analyzing Lead Source and Lead Origin

In [None]:
createCountPlot('Lead Source', (10,5))

###### Replace the categories with a low count with an `Other` category to reduce the number of dummy variables. This avoids unnecessary model complexity.

In [None]:
leadscore['Lead Source'].unique()

In [None]:
leadscore['Lead Source'] = leadscore['Lead Source'].replace(['blog', 'Pay per Click Ads', 
                                                'bing', 'Social Media','WeLearn', 'Click2call', 'Live Chat', 
                                                'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release', 'NC_EDM'], 'Other')



In [None]:
leadscore['Lead Source'] = leadscore['Lead Source'].replace('google', 'Google')

In [None]:
fig, axs = plt.subplots(figsize = (10, 5))
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Lead Source', hue='Converted', data=leadscore)

###### From above plot we see that most of the leads are coming from sources like, `Olark Chat`, `Organic Search` , `Direct Traffic` and `Google` However their conversion rate is low. But the conversion rate for `Reference` and `Welingak WebSite` is quite high. So we should definitely consider these Lead Source in our model.

In [None]:
leadscore['Lead Origin'].describe()

In [None]:
fig, axs = plt.subplots(figsize = (10, 5))
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Lead Origin', hue='Converted', data=leadscore)

###### From the `Lead Origin` plot, we can see that most of the leads were identified from Landing Page submission and then by APIs. 
###### However the conversion rate of `Lead Add Form` is comparitably good. 
##### So we will drop only the `Lead Import` which is insignificant in this case, and it will unnecessarily create extra dummy variable.

In [None]:
#Drop all the rows with `Lead Import` as Lead Origin
leadscore.drop(leadscore[leadscore['Lead Origin'] == 'Lead Import'].index, inplace=True)

In [None]:
fig, axs = plt.subplots(figsize = (10, 5))
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Lead Origin', hue='Converted', data=leadscore)

###### Lead Add form have higest Conversion rate

In [None]:
fig, axs = plt.subplots(figsize = (15, 5))
# Pass the column as x=: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Last Activity', hue='Converted', data=leadscore)

###### SMS sent has highest conversion rate.

In [None]:


leadscore.columns

In [None]:
leadscore_corr = leadscore[['Lead Origin', 'Lead Source', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Specialization', 'What is your current occupation', 'City',
       'A free copy of Mastering The Interview']]

In [None]:
generateHeatmaps(leadscore_corr, (12,8))

###### Based on the above heatmap we can say that `Page Views Per Visit` and `TotalVisits` are correlated. Other than these, there aren't any highly correlated features.
###### Finally we have completed our analysis using visualization . Now we start the Data preparation with the cleansed dataset.

In [None]:
leadscore.shape

In [None]:
leadscore.head()

In [None]:
leadscore.info()

###### Creating dummy variables for all categorical columns

In [None]:
categorical_feature =  leadscore.select_dtypes(include=['object']).columns
categorical_feature

###### For the above 7 categorical columns, dummy encoding is required.
###### We will create the dummies and drop the catergory which has the least frequency in each column


In [None]:
# dtype='uint8' keeps the indicator columns numeric on every pandas version
# (pandas >= 2.0 defaults to bool, which statsmodels cannot fit alongside floats).
dummy = pd.get_dummies(leadscore['Specialization'], prefix  = 'Specialization', dtype='uint8')
# columns= replaces the positional axis argument, which pandas 2.0 removed.
dummy = dummy.drop(columns=['Specialization_Business Administration'])
leads_dummified = pd.concat([leadscore, dummy], axis = 1)

In [None]:
# dtype='uint8' keeps the indicator columns numeric on every pandas version
# (pandas >= 2.0 defaults to bool, which statsmodels cannot fit alongside floats).
dummy = pd.get_dummies(leadscore['Lead Source'], prefix  = 'Lead_Source', dtype='uint8')
# columns= replaces the positional axis argument, which pandas 2.0 removed.
dummy = dummy.drop(columns=['Lead_Source_Facebook'])
leads_dummified = pd.concat([leads_dummified, dummy], axis = 1)

In [None]:
# dtype='uint8' keeps the indicator columns numeric on every pandas version
# (pandas >= 2.0 defaults to bool, which statsmodels cannot fit alongside floats).
dummy = pd.get_dummies(leadscore['Last Activity'], prefix  = 'Last_Activity', dtype='uint8')
# columns= replaces the positional axis argument, which pandas 2.0 removed.
dummy = dummy.drop(columns=['Last_Activity_Email Link Clicked'])
leads_dummified = pd.concat([leads_dummified, dummy], axis = 1)

In [None]:
leadscore['Lead Origin'].value_counts()

In [None]:
# The first category is dropped via drop_first. dtype='uint8' keeps the
# indicator columns numeric on every pandas version (pandas >= 2.0 defaults
# to bool, which statsmodels cannot fit alongside floats).
dummy = pd.get_dummies(leadscore['Lead Origin'], prefix = 'Lead_Origin', drop_first=True, dtype='uint8')
# dummy = dummy.drop(['Lead_Origin_Lead Add Form'], 1)
leads_dummified = pd.concat([leads_dummified, dummy], axis = 1)

In [None]:
# dtype='uint8' keeps the indicator columns numeric on every pandas version
# (pandas >= 2.0 defaults to bool, which statsmodels cannot fit alongside floats).
dummy = pd.get_dummies(leadscore['What is your current occupation'], prefix = 'Occupation', dtype='uint8')
# columns= replaces the positional axis argument, which pandas 2.0 removed.
dummy = dummy.drop(columns=['Occupation_Other'])
leads_dummified = pd.concat([leads_dummified, dummy], axis = 1)

In [None]:
# dtype='uint8' keeps the indicator columns numeric on every pandas version
# (pandas >= 2.0 defaults to bool, which statsmodels cannot fit alongside floats).
dummy = pd.get_dummies(leadscore['City'], prefix = 'City', dtype='uint8')
# columns= replaces the positional axis argument, which pandas 2.0 removed.
dummy = dummy.drop(columns=['City_Not Mumbai Cities'])
leads_dummified = pd.concat([leads_dummified, dummy], axis = 1)

In [None]:
leads_dummified.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Specialization',
       'What is your current occupation', 'City'], axis=1, inplace=True)

In [None]:
leads_dummified.shape

In [None]:
leads_dummified.head(5)

###### Test and Train Split

In [None]:
from sklearn.model_selection import train_test_split
np.random.seed(0)
lead_df_train,lead_df_test=train_test_split(leads_dummified,train_size=0.7,random_state=100)

In [None]:
X_train = lead_df_train.drop(['Converted','Lead Number'], axis=1)
y_train = lead_df_train['Converted']

In [None]:
X_train.info()

###### Scaling the numerical columns

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_cols=X_train.select_dtypes(include=['float64', 'int64']).columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_train.head(5)

###### Now we will start with model building using the stats model and select top 15 significant features using the RFE

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

leads_reg = LogisticRegression()

# Keep the 15 top-ranked features. n_features_to_select must be passed by
# keyword: scikit-learn >= 1.0 no longer accepts it positionally.
rfe = RFE(leads_reg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

###### Check the columns reatained by rfe using rfe support and ranking

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
rfe_selected_features = X_train.columns[rfe.support_]
rfe_selected_features

In [None]:
X_train_sm = sm.add_constant(X_train[rfe_selected_features])
model1 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
result = model1.fit()
result.summary()

In [None]:
# rfe_selected_features is a pandas Index, so drop() takes no axis; the former
# second argument `1` was being passed as `errors`.
rfe_selected_features = rfe_selected_features.drop('Lead_Source_Organic Search')
rfe_selected_features

In [None]:
X_train_sm = sm.add_constant(X_train[rfe_selected_features])
model2 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
result = model2.fit()
result.summary()

In [None]:
# rfe_selected_features is a pandas Index, so drop() takes no axis; the former
# second argument `1` was being passed as `errors`.
rfe_selected_features = rfe_selected_features.drop('Lead_Source_Reference')
rfe_selected_features

In [None]:
X_train_sm = sm.add_constant(X_train[rfe_selected_features])
model3 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
result = model3.fit()
result.summary()

In [None]:

# rfe_selected_features is a pandas Index, so drop() takes no axis; the former
# second argument `1` was being passed as `errors`.
rfe_selected_features = rfe_selected_features.drop('Last_Activity_Miscellaneous')
rfe_selected_features

In [None]:
X_train_sm = sm.add_constant(X_train[rfe_selected_features])
model4 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
result = model4.fit()
result.summary()

###### Since all the p-values are low, as expected, we can now check the VIF for multicollinearity among the variables.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif['Features'] = X_train[rfe_selected_features].columns
vif['VIF'] = [variance_inflation_factor(X_train[rfe_selected_features].values, i) for i in range(X_train[rfe_selected_features].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

###### VIF for all the variables are also less than 3. SO we can go ahead with this model with these features. 

###### Now we can proceed to derive the predictions, probabilities and  LeadScore on the trainin data.

In [None]:
final_selected_features = rfe_selected_features

In [None]:
y_train_pred = result.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Lead_Score_Prob':y_train_pred})
# Take the lead numbers positionally from the training split. Assigning
# leadscore['Lead Number'] directly would align on index labels against this
# frame's fresh 0..n-1 index and attach lead numbers of unrelated rows.
y_train_pred_final['Lead Number'] = lead_df_train['Lead Number'].values
y_train_pred_final.head()

###### Find the lead score and add a `Lead Score` column to the above table

In [None]:
y_train_pred_final['Lead_Score'] = round((y_train_pred_final['Lead_Score_Prob'] * 100),0)

y_train_pred_final.head()

###### We will add a new column `Predicted Hot Lead` assuming  threshold value as 0.5

In [None]:
y_train_pred_final['Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

###### Calaculate the Lead score based on the `Lead_Score_Prob` and generate the confusion matrix.

In [None]:
y_train_pred_final['Lead_Score'] = round((y_train_pred_final['Lead_Score_Prob'] * 100),0)
y_train_pred_final['Lead_Score'] = y_train_pred_final['Lead_Score'].astype(int)
y_train_pred_final.head()

In [None]:
from sklearn import metrics
confusion_matrix = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted_Hot_Lead )
print(confusion_matrix)

###### With the help of the confusion matrix, we will calculate the below metrics, which will be used late for model evaluation later.


###### 1. Accuracy

In [None]:
print('Accuracy for the Model 4 is {}%'.format(round(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Hot_Lead),2)*100 ))

In [None]:
TP = confusion_matrix[1,1] # true positive 
TN = confusion_matrix[0,0] # true negatives
FP = confusion_matrix[0,1] # false positives
FN = confusion_matrix[1,0] # false negatives

In [None]:
sensitivity = round((TP / float(TP+FN)),2)
specificity = round((TN / float(TN+FP)),2)

print('Sensitivity is {}% and Specificity is {}%'.format(sensitivity*100, specificity*100))

###### This sensitivity does not fulfil our objective, as the CEO of the company has given a ballpark target lead conversion rate of around 80%. This implies that the threshold we chose earlier (`0.5`, i.e. 50%) is not suitable, so we need a better threshold.

In [None]:
# The ROC curve and AUC must be computed from the predicted probabilities.
# Passing the thresholded 0/1 labels collapses the curve to a single operating
# point, so the AUC printed here differs from one computed on hard labels.
plot_roc(y_train_pred_final.Converted, y_train_pred_final.Lead_Score_Prob)

###### The ROC AUC value we got is 0.80. This indicates that we have a good model that is capable of distinguishing the classes.

###### Now we will find the prediction on the train data using this model and also calculate the cut-off threshold values 

In [None]:
y_train_pred_final

In [None]:
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity, specificity and precision at each probability cutoff,
# one row of cutoff_df per cutoff (the row label is the cutoff itself).
cutoff_df = pd.DataFrame( columns = ['Probability','Accuracy','Sensitivity','Specificty', 'Precision'])

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
    # Row-major unpacking of the 2x2 matrix: [[TN, FP], [FN, TP]]
    tn, fp, fn, tp = cm1.ravel()
    total1 = cm1.sum()
    accuracy = (tn + tp) / total1

    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    preci = tp / (tp + fp)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci, preci]
print(cutoff_df)

In [None]:
sns.set_style('whitegrid')
sns.set_context('paper')

cutoff_df.plot.line(x='Probability', y=['Accuracy','Sensitivity','Specificty'], figsize=(10,6))
plt.xticks(np.arange(0,1,step=.05), size=8)
plt.yticks(size=12)
plt.show()

###### As you can see, at about a threshold of 0.36, the curves of accuracy, sensitivity and specificity intersect, and they all take a value of around 80%. With this threshold we will predict the Hot Lead again.

In [None]:
cut_off = 0.36

In [None]:
y_train_pred_final['Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map( lambda x: 1 if x > cut_off else 0)
y_train_pred_final.head()

In [None]:
getRegressionMetrics(y_train_pred_final.Converted,y_train_pred_final.Predicted_Hot_Lead)

###### The above confusion matrix shows better results than the previously calculated one, using 0.5 as cutoff.
###### Our  business requirement of getting the sensitvity value above 80% is also achieved in this model. Also the accuracy and F1 score is pretty good.

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision (p) and recall (r) for every distinct probability threshold on the training set.
# p and r are sliced with [:-1] when plotted against thresholds in the next cell.
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Lead_Score_Prob)

In [None]:

# Precision and recall against the probability threshold; the crossing point
# of the two curves is the precision/recall trade-off threshold.
# frameon expects a bool: the original passed the string 'True', which only
# worked because any non-empty string is truthy.
plt.figure(figsize=(8, 4), dpi=100, facecolor='w', edgecolor='k', frameon=True)
plt.title('Precision vs Recall')
# p and r carry one trailing element with no matching threshold, hence [:-1].
plt.plot(thresholds, p[:-1], "g-", label='Precision')
plt.plot(thresholds, r[:-1], "r-", label='Recall')
plt.xticks(np.arange(0, 1, step=0.05))
plt.xlabel('Probability threshold')
plt.ylabel('Score')
# Without a legend the green and red curves cannot be told apart.
plt.legend()
plt.show()

##### In the graph above, the precision and recall curves cross at a threshold of about 0.41.
#### The precision-recall graph therefore suggests an optimum threshold close to 0.41. However, our business requirement here is a Lead Conversion Rate of around 80%.

#### This is already achieved with our earlier threshold value of 0.36, so we will stick to this value.
###### Now making predictions on the test data

In [None]:
# Preview the held-out test split before it is scaled.
lead_df_test.head()

In [None]:
# Apply the already-fitted scaler to the numeric columns (transform only, no refit).
# NOTE: this overwrites the columns in place, so re-running the cell scales the
# already-scaled values a second time — run it exactly once per kernel session.
lead_df_test[num_cols] = scaler.transform(lead_df_test[num_cols])

###### The below features were selected for the model in the train step

In [None]:
# Feature names used to build the test design matrix in the next cell.
rfe_selected_features

In [None]:
# Predictors: the selected feature columns.
# Targets: the lead identifier together with the actual conversion flag.
target_columns = ['Lead Number', 'Converted']
X_test = lead_df_test[rfe_selected_features]
y_test = lead_df_test[target_columns]

In [None]:
# Shapes of the test predictors and targets, one per line.
print(X_test.shape, y_test.shape, sep='\n')

In [None]:
# Add the intercept column expected by the fitted statsmodels model.
# has_constant='add' forces the column to be added: the default ('skip') silently
# omits it when any existing column happens to be constant in this subset, which
# would make the design matrix disagree with the fitted parameters.
X_test_sm = sm.add_constant(X_test, has_constant='add')
# Predicted conversion probability for every test lead.
y_test_pred = result.predict(X_test_sm)
y_test_pred.head(10)

In [None]:
# Build the test-set result frame: lead id and actual label side by side with the
# predicted probability.
# Both pieces get a fresh 0..n-1 index so pd.concat pairs rows by position.
# reset_index is chained (no inplace=True) so neither y_test nor y_test_pred is
# mutated and the cell gives the same result when re-run.
y_pred_df = pd.DataFrame(y_test_pred).reset_index(drop=True)
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)
# The unnamed probability series becomes column 0; give it a meaningful name.
y_pred_final = (
    pd.concat([y_test_df, y_pred_df], axis=1)
    .rename(columns={0: 'Lead_Score_Prob'})
)
y_pred_final.head()

In [None]:
# Flag a test lead as hot (1) when its probability is strictly above cut_off, else 0.
y_pred_final['Predicted_Hot_Lead'] = (
    y_pred_final['Lead_Score_Prob'].gt(cut_off).astype('int64')
)
y_pred_final.head()

###### Find metrics of the test data results

In [None]:
# Same evaluation as on the training set, now for the held-out test predictions.
# getRegressionMetrics is a helper defined elsewhere in this notebook.
getRegressionMetrics(y_pred_final.Converted,y_pred_final.Predicted_Hot_Lead)

###### So we are getting almost the same results on the test data as well, with a deviation of about 1%.
###### Now we will add the Lead Score to each lead

In [None]:
# Lead score = predicted probability expressed as a whole-number percentage.
y_pred_final['Lead_Score'] = (
    y_pred_final['Lead_Score_Prob']
    .mul(100)
    .round(0)
    .astype(int)
)

In [None]:
# Preview the test results including the new Lead_Score column.
y_pred_final.head()

###### Finally, we get the result table for the test data, which contains the Lead Score for each lead; based on it we can determine whether a lead is a Hot Lead or a Cold Lead.
###### Next, generate the lead score for the whole dataset, which was created after the EDA, before train_test_split.

In [None]:
# Row/column count of the full dataset that is scored below.
leads_dummified.shape

In [None]:
# Preview the full dataset before its numeric columns are scaled.
leads_dummified.head()

###### Scaling the cleaned dataset again 

In [None]:
# Apply the already-fitted scaler to the numeric columns of the full dataset.
# NOTE: the columns are overwritten in place, so re-running this cell scales the
# data twice — run it exactly once per kernel session.
leads_dummified[num_cols] = scaler.transform(leads_dummified[num_cols])
leads_dummified.head()

In [None]:
# Design matrix for the full dataset: selected features plus the intercept column.
# has_constant='add' forces the intercept to be added; the default ('skip') would
# silently leave it out if any selected column were constant.
cleaned_lead_sm = sm.add_constant(leads_dummified[rfe_selected_features], has_constant='add')
# Predicted conversion probability for every lead in the full dataset.
cleaned_predicted = result.predict(cleaned_lead_sm)
cleaned_predicted

In [None]:
# Work on a copy so the scores can be attached without modifying leadscore.
final_lead_score_df = leadscore.copy()
final_lead_score_df.head()

In [None]:
# Lead score = predicted probability as a percentage with two decimals.
# The assignment aligns on index labels: rows of final_lead_score_df without a
# matching label in cleaned_predicted receive NaN.
# NOTE(review): this assumes leads_dummified kept leadscore's index labels — confirm.
lead_score_pct = cleaned_predicted.mul(100).round(2)
final_lead_score_df['Lead Score'] = lead_score_pct
final_lead_score_df.head()

In [None]:
# Score threshold derived from the probability cut-off (0.36 -> 36.0) instead of a
# hard-coded 36, so the listing stays correct if cut_off is re-tuned.
# Rounded to two decimals to match the precision of 'Lead Score'.
hot_lead_score_threshold = round(cut_off * 100, 2)

# Lead number and score for every lead, highest score first.
hot_leads = final_lead_score_df.sort_values(by='Lead Score', ascending=False)[['Lead Number', 'Lead Score']]
# Keep only the leads scoring strictly above the threshold.
hot_leads[hot_leads['Lead Score'] > hot_lead_score_threshold]

##### The above list shows the customers having a lead score of more than 36.

In [None]:
# Score threshold derived from the probability cut-off (0.36 -> 36.0) instead of a
# hard-coded 36, so the flag stays correct if cut_off is re-tuned.
hot_lead_score_threshold = round(cut_off * 100, 2)

# 1 when the lead score is strictly above the threshold, else 0.
# Leads with a missing score (NaN) compare as False and are therefore flagged 0.
final_lead_score_df['Is_Hot_Lead'] = (
    final_lead_score_df['Lead Score'].gt(hot_lead_score_threshold).astype('int64')
)
final_lead_score_df.sort_values(by='Lead Score', ascending=False).head(10)

#### The above are the top 10 leads by score; they have the highest chance of conversion
#### Now we will determine the importance of the selected features
###### Getting the coefficients from the final model summary

In [None]:
# All fitted parameters except the first entry
# (presumably the intercept added by sm.add_constant — confirm against the model summary).
coeff = result.params.iloc[1:]
coeff

###### To understand this better, we will compute the coefficients relative to the largest one. This will help us compare the features more easily.


In [None]:
# Express each coefficient as a percentage of the largest coefficient.
largest_coeff = coeff.max()
feature_relevance = coeff.div(largest_coeff).mul(100.0)
feature_relevance

In [None]:
# Integer positions that would sort feature_relevance in ascending order.
# The original passed order='list of str', which is placeholder text from the
# NumPy documentation; `order` only applies to structured arrays and pandas
# ignores it, so it is dropped. The result is unchanged.
sorted_idx = feature_relevance.argsort(kind='quicksort')
sorted_idx

###### Now we will plot the above data as a bar plot to visualize it better

In [None]:

# Horizontal bar chart of the relative feature importance, smallest at the bottom.
# np.asarray gives plain integer positions whether sorted_idx is an ndarray or a Series.
order = np.asarray(sorted_idx)
pos = np.arange(order.shape[0]) + .5

fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(1, 1, 1)
# .iloc makes the positional lookup explicit; feature_relevance[<int positions>] on a
# label-indexed Series relies on a deprecated positional fallback.
ax.barh(pos, feature_relevance.iloc[order], align='center', color='tab:blue', alpha=0.8)
ax.set_yticks(pos)
# Tick labels come from the plotted Series' own index, so each bar is always
# labelled with the feature whose coefficient it shows.
ax.set_yticklabels(feature_relevance.index[order], fontsize=12)
ax.set_xlabel('Relative Feature Importance', fontsize=14)

plt.tight_layout()
plt.show()

###### From the above plot and data, we can find the top 3 features using the feature_relevance series

In [None]:

# Top three features by relative coefficient, highest first.
# The unnamed relevance series becomes column 0 once converted to a frame.
relevance_table = pd.DataFrame(feature_relevance).reset_index()
relevance_table.sort_values(by=0, ascending=False).head(3)

###### After trying several models, we finally chose this model because it fulfilled the criteria below

-  <font color = blue> All variables have p-value < 0.05. </font>
-  <font color = blue> All the features have very low VIF values, meaning there is hardly any multicollinearity among the features. This is also evident from the heat map. </font>
-  <font color = blue> The overall accuracy of 79% at a probability threshold of <b>0.36</b> on the test dataset is also acceptable. </font>

###### <font color = blue> We can also tweak the probability threshold, which in turn will decrease or increase the Sensitivity and increase or decrease the Specificity of the model, based on the business requirements. </font>
    
###### <font color = blue> High Sensitivity ensures that the leads who are likely to be converted are correctly predicted, whereas high Specificity ensures that leads that are on the borderline of the conversion probability cut-off are not selected. </font>



### The top three variables which contributed the most in lead conversion are: 
1. Lead Origin
1. What is the Current Occupation? 
1. Lead Source


> The categories in each variable which are important are:
1. Working Professionals (in Current Occupation)
1. Lead Add Form (Lead Origin) and 
1. Welingak Website (in Lead Source).

###### X Education has a period of 2 months every year during which they hire some interns. The sales team has around 10 interns allotted to them. So, during this phase, they wish to make the lead conversion more aggressive. So, they want almost all of the potential leads (i.e. the customers who have been predicted as 1 by the model) to be converted and hence, want to make phone calls to as much of such people as possible. Suggest a good strategy they should employ at this stage.

> Since the resources in the sales team have increased, they can target as many leads as possible. The cutoff criterion for Hot Leads can be lowered. We can target 90% sensitivity, which is reached at a Lead Score of 20. At this point the accuracy is 75%, which is still reasonably good. With this cutoff the salespeople can first target the top Hot Leads, which have the highest chance of getting converted.

###### Similarly, at times, the company reaches its target for a quarter before the deadline. During this time, the company wants the sales team to focus on some new work as well. So, during this time, the company’s aim is to not make phone calls unless it’s extremely necessary, i.e. they want to minimize the rate of useless phone calls.

> In this case, as there is a restriction on calling the leads, the sales team should not call those leads which have a very low chance of conversion. They should try to reach the customers who have a higher chance.
Here, we should keep the false positive count as low as possible. That means we should aim for high precision. Precision is the positive predictive value, i.e. the fraction of the positive predictions that are actually positive. So, the sales team should focus on the leads having a score of more than 65 (as at lead score 60 the precision is 82% and at 70 the precision is ~82%; at this cutoff the accuracy is ~80%) to achieve more than 85% precision.

> Also, for the remaining leads with a low Lead Score, since the company doesn’t aim to make phone calls, they can use other methods of reaching the customers, such as automated emails and SMSes. With this strategy, we can reach most of the customers.