In [None]:
# Creating a reusable function that will help us in plotting our bar plots for analysis

def BarPlot(df, Variable, plotSize):
    """Draw a count plot of one column and label each bar with its share of rows.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    Variable : name of the column in ``df`` whose values are counted.
    plotSize : ``(width, height)`` tuple passed to ``plt.subplots`` as ``figsize``.

    Each bar is annotated with its height expressed as a percentage of
    ``len(df)`` (rows with a missing value in ``Variable`` still count in the
    denominator), formatted to two decimals.
    """
    # Draw on the axes we create instead of relying on pyplot's "current axes".
    fig, ax = plt.subplots(figsize=plotSize)
    sns.countplot(x=Variable, data=df, ax=ax)
    ax.tick_params(axis='x', labelrotation=45)

    total_rows = len(df)
    for p in ax.patches:
        height = p.get_height()
        if height != height:
            # NaN height (an empty bar): nothing meaningful to label.
            continue
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}'.format(height / total_rows * 100),
                ha="center")

In [None]:
# Creating a reusable function that will help us in bivariate analysis
def CountPlot(df,Variable,title,plotsize,hue=None):
    """Plot category counts for a column and print its conversion-rate table.

    Parameters
    ----------
    df : DataFrame containing ``Variable`` plus the 'Lead Number' and
        'Converted' columns used for the summary table.
    Variable : name of the column to plot on the x axis.
    title : title of the plot.
    plotsize : ``(width, height)`` tuple used as the figure size.
    hue : optional column name used to split the bars (default ``None``).

    The bars are ordered by descending frequency of ``Variable``. After the
    plot is shown, a table with the number of leads per ``Variable`` value and
    'Converted' class (0 / 1) is printed, together with a 'Conversion(%)'
    column, sorted by that percentage in descending order.

    Returns
    -------
    None
    """
    plt.figure(figsize=plotsize)
    plt.xticks(rotation=90)
    plt.title(title)
    sns.countplot(data=df, x=Variable, order=df[Variable].value_counts().index, hue=hue)
    plt.show()

    convertcount = df.pivot_table(values='Lead Number', index=Variable,
                                  columns='Converted', aggfunc='count').fillna(0)
    # Guarantee both outcome columns exist; a subset in which nobody (or
    # everybody) converted would otherwise raise KeyError below.
    convertcount = convertcount.reindex(columns=[0, 1], fill_value=0)
    convertcount["Conversion(%)"] = round(convertcount[1]/(convertcount[0]+convertcount[1]),2)*100
    print(convertcount.sort_values(ascending=False, by="Conversion(%)"))


## Step 1: Reading and Understanding the Data

In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import re
from scipy import stats 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Importing RFE and LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm  
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve

In [None]:
# Reading File
df= pd.read_csv("../input/lead-scoring-x-online-education/Leads X Education.csv")

In [None]:
df.head(20)

In [None]:
# Original conversion rate of the data set: the share of rows with
# Converted == 1, expressed as a percentage rounded to two decimals.
orgConversionRate = round(100 * df['Converted'].mean(), 2)
print("The conversion rate of leads is: ",orgConversionRate)

In [None]:
# Checking shape of dataframe
df.shape

In [None]:
# Checking columns name
df.columns

In [None]:
# Checking columns type in dataframe
df.info()

In [None]:
# checking attributes for continuous variables
df.describe()

#### Observation
The shape of the dataset is 9240x37

Original conversion rate of company X is 38.54%.

A large number of 'Select' values are present for Lead Profile and City in the dataset. These values correspond to the user not having made any selection.

There are 7 numerical columns and 30 categorical columns

In [None]:
# The value 'Select' means the user has not selected any option for that field,
# so it is converted to NaN and treated as a missing value
df = df.replace('Select', np.nan)
df.head(20)

In [None]:
# Print True for an identifier column if it contains any repeated value
# ('Lead Number' first, then 'Prospect ID').
for id_column in ('Lead Number', 'Prospect ID'):
    print(df.duplicated(subset=id_column).any())

#### Observation
No Duplicates value for Prospect ID and Lead Number

In [None]:
# Checking Null Values
print(df.isnull().sum(axis=0))

In [None]:
# Checking column-wise null percentages here
print(round(100*(df.isnull().sum()/len(df)).sort_values(ascending= False), 2))

#### Observation:
There are some columns with over 50% of null values.


## Step 2: Data Cleaning

In [None]:
# Dropping columns whose share of null values exceeds 70%.
# (Columns with a lower but still substantial share of nulls are reviewed
# one by one in the cells that follow.)
null_share = df.isnull().mean()
df = df.drop(columns=null_share[null_share > 0.70].index)
df.head()

In [None]:
## Checking number of unique values per column 
df.nunique()

#### Observation
There are a lot of columns with 1 or two unique values.

Below are the columns that have only one unique value, since they won't have anything to contribute to the model significantly,  we will remove these columns.

- Get updates on DM Content
-  Update me on Supply Chain Content
- I agree to pay the amount through cheque
- Receive More Updates About Our Courses
- Magazine

Note: There are no null values in these columns as seen from the null values table above.

In [None]:
# Remove every column that holds exactly one distinct (non-null) value.
single_valued = df.nunique() == 1
df = df.drop(columns=df.columns[single_valued])
df

In [None]:
### We can also drop the column 'Prospect ID' as we already have an identifying column with unique values: 'Lead Number'
df = df.drop('Prospect ID', axis=1)
df.head()

#### Checking all columns individually having null values in order to decide to impute/drop the column
As we can see some of the columns have substantial number of null or missing values. If we drop all these columns we will lose a lot of information so instead of dropping them, for some of the feature variables we will create a new value as 'Unknown' 

#### Asymmetrique Activity/Profile Index Activity and Score



Let us now look at the following columns: Asymmetrique Activity Index, Asymmetrique Profile Index,Asymmetrique Profile Score and Asymmetrique Activity Score.

We know from the data dictionary that these are scores assigned to a customer based on their activity and profile Via X- Education employee after calling to Lead.

In [None]:
# 2x2 grid: the two Index columns as count plots (left), the two Score
# columns as box plots (right). Data is passed via the `x=` keyword because
# positional data arguments are not accepted consistently across seaborn versions.
fig, axes = plt.subplots(2, 2, figsize=(20, 10))

sns.countplot(x=df['Asymmetrique Activity Index'], ax=axes[0, 0])
sns.boxplot(x=df['Asymmetrique Activity Score'], ax=axes[0, 1])
sns.countplot(x=df['Asymmetrique Profile Index'], ax=axes[1, 0])
sns.boxplot(x=df['Asymmetrique Profile Score'], ax=axes[1, 1])

plt.show()

Observation
There is a lot of variation in the data, and the share of null values is also very high at 45.65%. In addition, these values are assigned to a lead by an X Education employee only after calling them. Therefore we will drop these columns.

Let us now drop all the 4 columns: **Asymmetrique Activity Index, Asymmetrique Activity Score, Asymmetrique Profile Index and Asymmetrique Profile Score**.

In [None]:
colsToDrop = ['Asymmetrique Activity Index','Asymmetrique Activity Score','Asymmetrique Profile Index','Asymmetrique Profile Score']
df =df.drop(colsToDrop, axis=1)

#### Lead Quality

In [None]:
# Checking Lead Quality
df['Lead Quality'].value_counts()

In [None]:
# Since 'Lead Quality' is based on an employees intuition, let us inpute any NAN values with 'Not Sure'
df['Lead Quality'] = df['Lead Quality'].fillna('Not Sure')
df['Lead Quality'].value_counts()

In [None]:
BarPlot(df, 'Lead Quality', (10,10))

##### Observation
From the above bar plot we see that the share of 'Not Sure' values is considerably high at 63.14%. We will drop this column.

In [None]:
df =df.drop('Lead Quality', axis=1)
df

### City

In [None]:
df['City'].value_counts()

In [None]:
df.City.describe()

In [None]:
BarPlot(df, 'City', (10,10))

##### Observation
We will impute the missing values with 'Mumbai' since it has the highest count.

In [None]:
df.City= df.City.fillna('Mumbai')

In [None]:
df['City'].value_counts(normalize= True)

### Specialization

In [None]:
### Exploring 'Specialization' column which hs 36.58% null values
df.Specialization.describe()

In [None]:
BarPlot(df, 'Specialization', (10,10))

##### Note:
Since there are 36% null values that haven't yet been accounted for, we will replace those with 'Others'. This is being done because the NaN values have the highest percentage of values that haven't been shown above. It simply means that the user did not have any option relevant to them in this field.

In [None]:
df.Specialization= df['Specialization'].fillna('Others')
df.Specialization.value_counts(normalize=True)

### Tags

In [None]:
df.Tags.describe()

In [None]:
BarPlot(df,'Tags', (10,10))

##### Observation
From the above bar plot it looks like we have a lot of small categories within the tags section. Moreover these tags are added by the sales team. We can safely drop this column as this doesn't provide a lot of insight.

In [None]:
df = df.drop(['Tags'], axis = 1)

In [None]:
# Checking again the Null values
print(round(100*(df.isnull().sum(axis=0)/len(df.index)).sort_values(ascending=False),2))

##### Observation
We still have a few columns that have a high number of null values.

### What matters most to you in choosing a course

In [None]:
df['What matters most to you in choosing a course'].value_counts()

In [None]:
BarPlot(df, 'What matters most to you in choosing a course',(8,5))

##### Observation
We can clearly see that this column is heavily skewed towards better career prospects. Since it doesn't really provide any more information, we can drop this column and keep note that all candidates that take this course are looking to have a better career.

In [None]:
# Dropping 'What matters most to you in choosing a course'
df= df.drop(['What matters most to you in choosing a course'], axis =1)

### What is your current occupation

In [None]:
df['What is your current occupation'].value_counts(normalize= True)

In [None]:
BarPlot(df, 'What is your current occupation',(10,10))

##### Observation
The data in this column looks skewed; however, it also defines the potential target market for company X. We will impute the missing values with 'Unemployed' and drop the column if the analysis further down deems it necessary.

In [None]:
df['What is your current occupation']=df['What is your current occupation'].fillna('Unemployed')

In [None]:
BarPlot(df, 'What is your current occupation',(10,10))

### Country

In [None]:
df['Country'].value_counts()

In [None]:
df['Country'].describe()

##### Note
Since a mode value for this column is India, we can replace the missing values with India. Since this will potentially skew the data heavily in the model we will drop this column.

In [None]:
df['Country']= df['Country'].fillna('India')
df['Country'].value_counts()

In [None]:
df=df.drop(['Country'], axis=1)

In [None]:
df.head()

In [None]:
# Checking Null Values Again

print(round(100*(df.isnull().sum()/len(df)).sort_values(ascending= False), 2))

##### Observation
We have reduced a lot of columns and imputed missing values in a few of them. In the remaining columns we can safely impute the missing value with the mode value since it has less than 5% missing values.

In [None]:
### Imputing missing values in Lead Source
df['Lead Source'].value_counts()

In [None]:
### Imputing missing values with 'Google'

df['Lead Source'] = df['Lead Source'].fillna('Google')
df['Lead Source'].value_counts()

In [None]:
# Normalise the casing of lead-source labels (first letter upper-case, the
# rest lower-case) so that differently-cased spellings collapse into one value.
df['Lead Source'] = df['Lead Source'].str.capitalize()
df['Lead Source'].value_counts()

### Page Views Per Visit

In [None]:
df['Page Views Per Visit'].value_counts()

In [None]:
df['Page Views Per Visit'].describe()

In [None]:
df['Page Views Per Visit'].median()

In [None]:
#### Imputing the missing values with the column median (2.0 for this data set),
#### computed from the data instead of being hard-coded
df['Page Views Per Visit'] = df['Page Views Per Visit'].fillna(df['Page Views Per Visit'].median())

In [None]:
df['Page Views Per Visit']

### TotalVisits

In [None]:
df.TotalVisits.describe()

In [None]:
### We will impute the missing values with the median (3.0 for this data set) since the
### mean and the median values are relatively close to each other.
### The median is computed from the column rather than hard-coded.
df['TotalVisits'] = df['TotalVisits'].fillna(df['TotalVisits'].median())
df['TotalVisits'].value_counts()

### Last Activity

In [None]:
df['Last Activity'].value_counts()

In [None]:
#### Imputing the missing values with 'Email Opened'
df['Last Activity'] = df['Last Activity'].fillna('Email Opened')
df['Last Activity'].value_counts()

### Finally Checking Dataset

In [None]:
df.shape

In [None]:
print(round(100*(df.isnull().sum()/len(df)).sort_values(ascending= False), 2))

### Observation
Now that our dataset is clear of all the null values we can begin performing analysis on the remaining columns

In [None]:
# Getting shape of dataframe after cleanup
df.shape

## Step 3: EDA
Steps performed in this section:
- Outlier Treatment
- Univariate Analysis
- Bivariate Analysis

In [None]:
df.info()

In [None]:
features = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
# Box plot for each continuous column, laid out on a 2x2 grid.
# The data is passed through `x=` so the call means the same thing on every
# seaborn version (positional data arguments are not handled consistently).
plt.figure(figsize = (20,12))
for position, column in enumerate(features, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x=df[column])

##### Observation
There are lot of outliers in **TotalVisits** and **Page Views per Visit**

In [None]:
### Capping 'Page Views Per Visit' at its 1st and 95th percentiles so that
### outliers are pulled in without losing any values or dropping rows.
### `clip` replaces the chained-indexing assignment (df[col][mask] = value),
### which pandas does not guarantee to write back to `df`.
q1 = df['Page Views Per Visit'].quantile(0.01)
q3 = df['Page Views Per Visit'].quantile(0.95)
df['Page Views Per Visit'] = df['Page Views Per Visit'].clip(lower=q1, upper=q3)

In [None]:
### Capping 'TotalVisits' at its 1st and 95th percentiles so that outliers
### are pulled in without losing any values or dropping rows.
### `clip` replaces the chained-indexing assignment (df[col][mask] = value),
### which pandas does not guarantee to write back to `df`.
q1 = df['TotalVisits'].quantile(0.01)
q3 = df['TotalVisits'].quantile(0.95)
df['TotalVisits'] = df['TotalVisits'].clip(lower=q1, upper=q3)

In [None]:
# Box plot for each continuous column (names taken from `features`) to check
# the distributions after capping the outliers. The data is passed through
# `x=` so the call means the same thing on every seaborn version.
plt.figure(figsize = (20,12))
for position, column in enumerate(features, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x=df[column])

### Checking Categorical Variables

#### Lead Origin

In [None]:
df['Lead Origin'].value_counts()

In [None]:
BarPlot(df,'Lead Origin', (15,10))

In [None]:
CountPlot(df,'Lead Origin','Conversion based on Lead Origin',(15,10),hue='Converted')

### Observation
From the barplot above we can infer that:

- Lead Add Form has the highest conversion rate at 92%
- Quick Add Form has 100% conversion rate but it has only 1 entry, so it might not be that reliable as a lead to go on
- API has the least amount of conversions

#### Lead Source

In [None]:
df['Lead Source'].value_counts()

In [None]:
BarPlot(df,'Lead Source', (15,4))

In [None]:
plt.figure(figsize=(20,30))
sns.countplot(data = df, x= 'Lead Source', order=df['Lead Source'].value_counts().index,hue = 'Converted')                      
plt.xticks(rotation=45)
plt.show()


In [None]:
## Printing % of converted Lead with respect to Lead Source
convertcount=df.pivot_table(values='Lead Number',index='Lead Source',columns='Converted', aggfunc='count').fillna(0)
convertcount["Conversion(%)"] =round(convertcount[1]/(convertcount[0]+convertcount[1]),2)*100
print(convertcount.sort_values(ascending=False,by="Conversion(%)"))

#### Observations
Based on the plots above we observe that:

- Most students found X education via 'Google' search
- However, most of the google search leads weren't converted to actual students of the platform
- References had the highest number of conversions at 92%
- Welingak website also had a significantly high number of conversions at 99%
- Welearn & Nc_edm had 100% conversion but due to their low numbers overall it might not be a correct picture of the situation
- No conversions were made through the youtube channel, blog, press releases, pay per click ads or Welearnblog_home

**Note:** Let us merge the columns with low numbers into a common category: 'Others'

In [None]:
cols=['Click2call', 'Live chat', 'Nc_edm', 'Pay per click ads', 'Press_release',
  'Social media', 'Welearn', 'Bing', 'Blog', 'Testone', 'Welearnblog_home', 'Youtubechannel']
df['Lead Source'] = df['Lead Source'].replace(cols, 'Others')

In [None]:
BarPlot(df,'Lead Source', (15,10))

In [None]:
plt.figure(figsize=(20,30))
sns.countplot(data = df, x= 'Lead Source', order=df['Lead Source'].value_counts().index,hue = 'Converted')                      
plt.xticks(rotation=45)
plt.show()

convertcount=df.pivot_table(values='Lead Number',index='Lead Source',columns='Converted', aggfunc='count').fillna(0)
convertcount["Conversion(%)"] =round(convertcount[1]/(convertcount[0]+convertcount[1]),2)*100
print(convertcount.sort_values(ascending=False,by="Conversion(%)"))

#### Observation
To improve the conversion rate, X Education should focus on providing incentives for referrals, improve lead conversion through Olark chat, organic search, direct traffic and Google leads, and generate more leads from references and the Welingak website.

#### Do Not Email

In [None]:
df['Do Not Email'].value_counts()

In [None]:
BarPlot(df,'Do Not Email', (15,10))

In [None]:
CountPlot(df,'Do Not Email','Do Not Email',(15,10),hue='Converted')

#### Observation
Based on the above we can see that the data is highly skewed. Therefore we will be dropping this column eventually

#### Do Not Call

In [None]:
df['Do Not Call'].value_counts()

In [None]:
BarPlot(df,'Do Not Call', (15,10))

In [None]:
CountPlot(df,'Do Not Call','Conversion based on Do Not Call',(15,10),hue='Converted')

### Observations
Based on the above we can see that the data is highly skewed. Therefore we will be dropping this column eventually

Note
We will be dropping the columns Do Not Email & Do Not Call as the data is highly skewed towards 'No'. A 'No' in these columns means the lead has not opted out of being contacted, so company X can safely assume that the vast majority of its prospective customers can be reached by call or email.

In [None]:
df = df.drop(['Do Not Call', 'Do Not Email'], axis=1)
df.shape

In [None]:
df.info()

#### Last Activity

In [None]:
df['Last Activity'].value_counts()

In [None]:
BarPlot(df,'Last Activity', (15,5))

In [None]:
CountPlot(df,'Last Activity','Conversion based on Last Activity',(15,10),hue='Converted')

#### Observation
Based on the plots above we can infer that:

- People interacting with the portal usually send sms the most
- Only 24% of people who visit the website convert to actual students

#### Note:
**Search, Newspaper Article, Education Forums, Newspaper , Digital Advertisement and Through Recommendations**, are already represented in the 'Lead Source' column.

We will carry out basic univariate analysis on them and make a decision on if we need to drop them or not.

In [None]:
features = ['Search', 'Newspaper Article', 'X Education Forums', 'Newspaper' , 'Digital Advertisement','Through Recommendations']
# Print the value distribution of each of the listed columns in turn.
for column in features:
    print(df[column].value_counts())

In [None]:
features = ['Search', 'Newspaper Article', 'X Education Forums', 'Newspaper' , 'Digital Advertisement','Through Recommendations']
# One count plot per listed column, arranged on a 3x2 grid.
plt.figure(figsize = (20,20))
for slot, column in enumerate(features, start=1):
    plt.subplot(3, 2, slot)
    sns.countplot(x = df[column], data = df)
plt.show()

#### Observation
The data feels highly skewed in these columns and confirms our assumption that it is correctly represented in the Lead Source Column

Since the data for these columns is already correctly represented in the 'Lead Source' column, we will drop these columns

In [None]:
Cols = ['Search', 'Newspaper', 'X Education Forums', 'Newspaper Article' , 'Digital Advertisement','Through Recommendations']
df = df.drop(Cols,axis=1)
df.head()

#### A free copy of Mastering The Interview

In [None]:
df['A free copy of Mastering The Interview'].value_counts()

In [None]:
BarPlot(df,'A free copy of Mastering The Interview', (15,10))

In [None]:
CountPlot(df,'A free copy of Mastering The Interview','Conversion based on A free copy of Mastering The Interview',(15,10),hue='Converted')

#### Observation
A large number of candidates, 64%, didn't opt for any course even though they would like a free copy. 60% of the candidates didn't opt for any course or the free book.

### Last Notable Activity 

In [None]:
df['Last Notable Activity'].value_counts()

In [None]:
BarPlot(df,'Last Notable Activity', (15,10))

#### Observation
From our analysis we see that this column holds similar data represented in the **Last Activity column**. We will drop this one and keep the Last Activity column.

In [None]:
## Droping Last Notable Activity 
df= df.drop(['Last Notable Activity'], axis=1)
df.head()

In [None]:
df.shape

### Specialization

In [None]:
BarPlot(df,'Specialization', (15,10))

In [None]:
CountPlot(df,'Specialization','Conversion based on Specialization',(15,10),hue='Converted')

#### Observation
- Healthcare Management specialization is having 50% conversion rate
- Finance Management specialization is having 45% conversion rate

### City

In [None]:
BarPlot(df,'City', (15,10))

In [None]:
CountPlot(df,'City','Conversion based on City',(15,10),hue='Converted')

#### Observation
- Mumbai is having 37% Conversion rate

### What is your current occupation

In [None]:
df['What is your current occupation'].value_counts()

In [None]:
BarPlot(df,'What is your current occupation', (15,10))

In [None]:
CountPlot(df,'What is your current occupation','Conversion based on What is your current occupation',(15,10),hue='Converted')

#### Observation
- Housewife are having 100% Conversion rate but their count are very less
- Unemployed are having 34% Conversion rate
- Student are having 37% Conversion rate

### Checking Continuous Variables

#### Total  Visit

In [None]:
BarPlot(df,'TotalVisits', (15,10))

In [None]:
CountPlot(df,'TotalVisits','Conversion based on Total Visit',(15,10),hue='Converted')

### Observations
From the above analysis we can conclude that:

- People who make more than 10 visits are almost 50% likely to be converted
- Only 15% of people who visited the website once converted to student. This could imply that people weren't able to gather all the information they needed easily. Hence, decided not to opt for any course.

#### Total Time Spent on Website

#### Page Views Per Visit  

In [None]:
BarPlot(df,'Page Views Per Visit', (25,20))

In [None]:
CountPlot(df,'Page Views Per Visit','Page Views Per Visit vs Conversion',(20,20),hue='Converted')

### Observation
From the above plots it is safe to infer that:

- People who dont visit any pages have the highest count of conversion overall
- Less than half the people who visit 2 pages on average convert to students

#### Total Time Spent on Website

In [None]:
df['Total Time Spent on Website'].value_counts()

#### Observation
The above looks like time spent was recorded in minutes. We will convert the entire column into hours for ease of analysis

In [None]:
df['Total Time Spent on Website'] = df['Total Time Spent on Website'].apply(lambda x: round((x/60), 2))
df.head()

In [None]:
sns.distplot(df['Total Time Spent on Website'])
plt.show()

In [None]:
# Let us split our dataframe to perform better analysis: keep only leads with
# at least 1.0 in 'Total Time Spent on Website' and add the value truncated to
# an integer as 'Hours Spent'.
# `.copy()` makes df1 an independent frame, so adding a column neither
# triggers SettingWithCopyWarning nor depends on whether the filter
# returned a view of `df`.
df1 = df[df['Total Time Spent on Website'] >= 1.0].copy()
df1["Hours Spent"] = df1["Total Time Spent on Website"].astype(int)

df1.head()

In [None]:
# Conversion split by whole hours spent on the website (title now matches the plotted column)
CountPlot(df1,'Hours Spent','Conversion based on Hours Spent',(15,10),hue='Converted')

In [None]:
plt.figure(figsize=(20,20))
plt.xticks(rotation=45)
plt.yscale('log')
sns.boxplot(data =df1, x='TotalVisits',y='Total Time Spent on Website', hue ='Converted',orient='v')
plt.title('Total Time Spent Vs Total Visits based on Conversion')
plt.show()

### Observation
From the above bar plot we can infer that:

- The highest number of conversions happen when people are spending around 18 hours or above on the website
- People who spent around 3 hours on the website didn't opt for any courses.
- From the boxplot we can see better that the longer you stay on the website, the higher your chances of conversion as well.
 
 Overall more time the user spends on the website, the better their chances of becoming a student.

### Checking  Corelation b/w Variables

In [None]:
# Correlation heatmap of the numeric (and boolean) columns only.
# Selecting them explicitly keeps the cell working on pandas versions where
# DataFrame.corr() no longer silently skips object columns.
plt.figure(figsize=(20, 20))
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), cmap='YlGnBu', annot=True)

#### Observation
Based on the above heatmap we can see that we don't have any highly correlated features. Therefore there is no multicollinearity in the dataset.

In [None]:
# Final Dataframe
df.head()

## Step 4 Data Preparation

### Creating Dummy Values

In [None]:
### First we will convert the Yes/No values in the 'A free copy of Mastering The Interview' column to 1/0

df['A free copy of Mastering The Interview'] = df['A free copy of Mastering The Interview'].map(dict(Yes=1, No=0))
df.head()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
# to avoid a redundant (perfectly collinear) indicator.
# dtype=int forces 0/1 integer indicators on every pandas version; newer
# versions default to bool, which the statsmodels GLM fits below may not accept.
dummy_Cols = ['Lead Origin', 'Lead Source', 'Last Activity', 'Specialization','What is your current occupation','City']
dummy = pd.get_dummies(df[dummy_Cols], drop_first=True, dtype=int)
dummy.head()

In [None]:
combined = df.copy()
combined.shape

In [None]:
combined = pd.concat([combined, dummy], axis=1)
combined.head()

In [None]:
### We will now drop the original columns and the columns that have 'Others' as a sub heading since we had 
### combined various values to create those columns

cols = ['Lead Origin', 'Lead Source', 'Last Activity', 'Specialization','What is your current occupation','City',
                     'Lead Source_Others','Specialization_Others']
combined = combined.drop(cols, axis=1)
combined.head()

In [None]:
combined.shape

In [None]:
combined.info()

### Performing Train - Test Split

In [None]:
### First we will drop the Converted & Lead Number columns 
X = combined.drop(['Converted','Lead Number'], axis=1)
X.head()

In [None]:
X.shape

In [None]:
### Adding the target variable 'Converted' to y
y = combined['Converted']

y.head()

In [None]:
# Splitting the data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
X_train.head()

In [None]:
X_train.shape

### Scaling

In [None]:
## Scaling numeric Variables
scaler = StandardScaler()

X_train[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.fit_transform(X_train[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])


X_train.head()

In [None]:
X_train.shape

In [None]:
combined.corr()

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(combined.corr(),cmap='YlGnBu',annot=True)

In [None]:
### Dropping highly correlated variables
X_train = X_train.drop(['Lead Origin_Lead Add Form', 'Lead Source_Facebook'], axis=1)
X_test = X_test.drop(['Lead Origin_Lead Add Form', 'Lead Source_Facebook'], axis=1)

In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
plt.figure(figsize=(30,20))
sns.heatmap(combined[X_train.columns].corr(),cmap='YlGnBu',annot=True)

#### Observation
We have successfully removed the highly correlated variables from the training and test datasets.

In [None]:
X_train.info()

## Step 5 Modeling

In [None]:
## Creating Logistic Regression Model
logisticRegressionModel = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logisticRegressionModel.fit().summary()

### Feature Selection Using RFE

In [None]:
# Recursive feature elimination down to 15 features, using a logistic
# regression as the ranking estimator.
logreg = LogisticRegression()

# n_features_to_select is passed by keyword: recent scikit-learn releases no
# longer accept it positionally.
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)
rfe.support_

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]

In [None]:
X_train.columns[~rfe.support_]

##### Assessing the model with StatsModels

In [None]:
X_train1= X_train[col]
X_train1

### Model 2

In [None]:
X_train_sm = sm.add_constant(X_train1)
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Lead_Score_Prob':y_train_pred})
y_train_pred_final['Lead'] = y_train.index
y_train_pred_final.head()

#### Creating new column 'Final_Predicted_Hot_Lead' with 1 if Lead_Score_Prob > 0.5 else 0

In [None]:
y_train_pred_final['Final_Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead)
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead))

In [None]:
### Checking VIF values
vif = pd.DataFrame()
vif['Features'] = X_train1.columns
vif['VIF'] = [variance_inflation_factor(X_train1.values, i) for i in range(X_train1.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Observation
- The VIF values for all the variables in the model look to be under control
- The p value for **Last Activity_Resubscribed to emails is very high** at 1 & above the threshold 0.05
- We will be dropping Last Activity_Resubscribed to emails in the next model
- Model's accuracy is 79%

### Model 3

In [None]:
X_train2 = X_train1.drop('Last Activity_Resubscribed to emails', axis=1)
X_train2

In [None]:
X_train_sm = sm.add_constant(X_train2)
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Lead_Score_Prob'] = y_train_pred

In [None]:
y_train_pred_final['Final_Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead)
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead))

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train2.columns
vif['VIF'] = [variance_inflation_factor(X_train2.values, i) for i in range(X_train2.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Observation
- The VIF values for all the variables in the model look to be under control
- The p value for What is your current occupation_Housewife is very high at 0.999 & above the threshold 0.05
- We will be dropping What is your current occupation_Housewife in the next model
- Accuracy is same as previous model 79.80

### Model 4

In [None]:
X_train3 = X_train2.drop('What is your current occupation_Housewife', axis=1)
X_train3.columns

In [None]:
X_train_sm = sm.add_constant(X_train3)
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Lead_Score_Prob'] = y_train_pred

In [None]:
y_train_pred_final['Final_Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead)
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead))

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train3.columns
vif['VIF'] = [variance_inflation_factor(X_train3.values, i) for i in range(X_train3.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Observation
The p value for What is your current occupation_Working Professional is above the threshold at 0.440

Accuracy is same as previous model 79.80

We will be dropping this variable in the next model

### Model 5

In [None]:
X_train4 = X_train3.drop('What is your current occupation_Working Professional', axis=1)
X_train4.columns 

In [None]:
# Model 5: binomial GLM (logistic regression) fitted on X_train4 plus an
# added constant column.
X_train_sm = sm.add_constant(X_train4)
# NOTE(review): this rebinds the name `logm4`, which the Model 4 cell also
# uses, so the Model 4 object is no longer reachable under that name.
# Consider renaming to `logm5` after checking that no other cell relies on it.
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
# `res` holds the fitted result; the prediction cells below call res.predict().
res = logm4.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Lead_Score_Prob'] = y_train_pred

In [None]:
y_train_pred_final['Final_Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead)
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead))

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train4.columns
vif['VIF'] = [variance_inflation_factor(X_train4.values, i) for i in range(X_train4.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

### Observation
Model 5 meets all our criteria:

- The VIF values are under 3
- The p values are under 0.05
- The 12 selected features look significant

Let us generate a heatmap to confirm that there is no multicollinearity

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(X_train_sm.corr(),cmap='YlGnBu',annot=True)

#### Observation
As we can confirm with our heatmap, there is no multicollinearity in the model.

#### Generating predicted values on the training set

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred.head()

In [None]:
y_train_pred_final['Final_Predicted_Hot_Lead'] = y_train_pred_final.Lead_Score_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead)
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(round(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Final_Predicted_Hot_Lead),2))

#### Creating new column 'Lead_Score'
Lead_Score would be equal to (Lead_Score_Prob * 100)

In [None]:
y_train_pred_final['Lead_Score'] = round((y_train_pred_final['Lead_Score_Prob'] * 100),0)

y_train_pred_final['Lead_Score'] = y_train_pred_final['Lead_Score'].astype('int')

# Let's see the head
y_train_pred_final.head()

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
round((TP / float(TP+FN)),2)

In [None]:
# Let us calculate specificity
round((TN / float(TN+FP)),2)

#### Observation
Based on the above statistics for Accuracy(80%), Sensitivity(64%) and Specificity(89%) we can say that our trained model is currently highly specific but not very sensitive. Our objective is to create a highly sensitive model with 80% sensitivity. Let us find cut-off values using ROC curves to improve this.

#### Plotting the ROC Curve

In [None]:
def draw_roc( actual, probs ):
    """Plot a ROC curve for the given labels and scores and show it.

    Parameters
    ----------
    actual : array-like
        True labels, passed as the first argument to ``metrics.roc_curve``
        and ``metrics.roc_auc_score``.
    probs : array-like
        Predicted scores/probabilities, passed as the second argument to the
        same two functions.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``; nothing is returned.
    """
    # Every threshold is kept (drop_intermediate=False) so the curve is not simplified
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area_under_curve = metrics.roc_auc_score(actual, probs)
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(false_pos_rate, true_pos_rate, label=curve_label)
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# ROC operating points (false positive rate, true positive rate, thresholds) on the training set
fpr, tpr, thresholds = metrics.roc_curve(
    y_train_pred_final['Converted'],
    y_train_pred_final['Lead_Score_Prob'],
    drop_intermediate=False,
)

In [None]:
# Draw the ROC curve for the training predictions
draw_roc(
    actual=y_train_pred_final['Converted'],
    probs=y_train_pred_final['Lead_Score_Prob'],
)

#### Observation
From the ROC curve we can say that the model will be able to provide us with a good result overall.

### Finding Optimal Cutoff Point
The optimal cutoff probability is the probability at which sensitivity and specificity are balanced.

In [None]:
# Add one 0/1 indicator column per candidate cutoff (0.0, 0.1, ..., 0.9);
# each column is labelled with the cutoff value itself
numbers = [step / 10 for step in range(10)]
for cutoff in numbers:
    exceeds_cutoff = y_train_pred_final['Lead_Score_Prob'].gt(cutoff)
    y_train_pred_final[cutoff] = exceeds_cutoff.astype('int64')
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity of the training predictions at each candidate cutoff
cutoff_df = pd.DataFrame(columns=['Probability', 'Accuracy', 'Sensitivity', 'Specificty'])

num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in num:
    # Confusion matrix of the actual labels against the indicator column for cutoff i
    cm1 = metrics.confusion_matrix(y_train_pred_final['Converted'], y_train_pred_final[i])
    tn, fp, fn, tp = cm1.ravel()

    accuracy = (tn + tp) / cm1.sum()
    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Line chart of the three metrics against the probability cutoff
metric_columns = ['Accuracy', 'Sensitivity', 'Specificty']
cutoff_df.plot(kind='line', x='Probability', y=metric_columns)
plt.show()

#### Observation
From the above curve we can see that the optimal cutoff is at 0.33. This is the point where all the parameters are equally balanced

In [None]:
# Re-flag hot leads on the training set using the chosen 0.33 cutoff
above_cutoff = y_train_pred_final['Lead_Score_Prob'].gt(0.33)
y_train_pred_final['Final_Predicted_Hot_Lead'] = above_cutoff.astype('int64')

y_train_pred_final.head()

In [None]:
# Training accuracy at the 0.33 cutoff, rounded to two decimals
train_accuracy_033 = metrics.accuracy_score(
    y_train_pred_final['Converted'],
    y_train_pred_final['Final_Predicted_Hot_Lead'],
)
round(train_accuracy_033, 2)

In [None]:
# Confusion matrix of actual conversions vs. the 0.33-cutoff predictions (training set)
confusion2 = metrics.confusion_matrix(
    y_train_pred_final['Converted'],
    y_train_pred_final['Final_Predicted_Hot_Lead'],
)
confusion2

In [None]:
# Flatten the 2x2 matrix row by row: [0,0], [0,1], [1,0], [1,1]
# i.e. true negatives, false positives, false negatives, true positives
TN, FP, FN, TP = confusion2.ravel()

In [None]:
# Sensitivity: true positives as a share of all actual positives (TP + FN)
actual_positives = float(TP + FN)
round(TP / actual_positives, 2)

In [None]:
# Specificity: true negatives as a share of all actual negatives (TN + FP)
actual_negatives = float(TN + FP)
round(TN / actual_negatives, 2)

#### Observation
As we can see above, when we are selecting the optimal cutoff = 0.33, the various performance parameters Accuracy, Sensitivity & Specificity are all around 80%

This meets our objective of getting a highly sensitive model with 80% sensitivity

### Precision and Recall Curve

In [None]:
# Precision: true positives as a share of everything predicted positive (TP + FP)
predicted_positives = float(TP + FP)
precision = round(TP / predicted_positives, 2)
precision

In [None]:
# Recall: true positives as a share of all actual positives (TP + FN)
actual_positives = float(TP + FN)
recall = round(TP / actual_positives, 2)
recall

#### Observation
From the above scores we note that our model has a good overall relevancy, defined by Precision, at 70% & a great return of relevant results, defined by Recall, at 80%.

For the purposes of our model we will focus on the Recall result as we would not like to miss out on any hot leads that are willing to be converted.

In [None]:
# Precision vs. recall trade-off across all probability thresholds (training set).
# precision_recall_curve returns one more precision/recall value than thresholds,
# hence the [:-1] slices when plotting against `thresholds`.
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final['Lead_Score_Prob'])

fig, ax = plt.subplots()
ax.plot(thresholds, p[:-1], "g-", label="Precision")
ax.plot(thresholds, r[:-1], "r-", label="Recall")
ax.set_title('Precision vs Recall tradeoff')
ax.set_xlabel('Probability threshold')
ax.set_ylabel('Score')
ax.legend()  # without a legend the two curves can only be told apart by reading the code
plt.show()

#### Observation
From the above graph, the precision and recall curves intersect at a probability threshold of about 0.4, which is the precision-recall tradeoff point.

In [None]:
# The F1 score is the harmonic mean of precision and recall:
#     F1 = 2 * (precision * recall) / (precision + recall) = 2*TP / (2*TP + FP + FN)
# It balances precision and recall in a single number, which gives a more
# realistic picture of performance than either metric alone.
# It is computed from the raw confusion-matrix counts rather than from the
# `precision` and `recall` variables, because those were already rounded to two
# decimals and reusing them would compound the rounding error.
F1 = 2 * TP / float(2 * TP + FP + FN)
round(F1, 2)

#### Observation
Based on the F1 score we can say that our model is fairly accurate. Let us test this accuracy on the test set.

### Making Predictions on test set

In [None]:
# Apply the already-fitted scaler to the numeric columns of the test set
# (transform only; NOTE(review): presumably fitted on the training data - confirm)
numeric_features = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [None]:
# Shape of X_train4, whose columns are used in the next cell to select the test-set features
X_train4.shape

In [None]:
# Restrict the test set to the columns of X_train4, in the same order
final_features = X_train4.columns
X_test = X_test[final_features]

X_test.shape

In [None]:
# Preview the test features after scaling and column selection
X_test.head()

In [None]:
# Add the intercept column to the test features before predicting with `res`.
# has_constant='add' forces the column to be added: with the default ('skip'),
# add_constant returns the data unchanged when it detects an existing constant,
# which would silently happen if any test column held a single non-zero value,
# leaving the design matrix one column short for res.predict.
X_test_sm = sm.add_constant(X_test, has_constant='add')

##### Making predictions on the test set

In [None]:
# Predictions of the fitted model `res` for the test design matrix
y_test_pred = res.predict(X_test_sm)

In [None]:
# First ten test-set predictions (by position)
y_test_pred.iloc[:10]

In [None]:
# Wrap the test-set predictions in a DataFrame so they can be joined with the actual labels
y_pred_1 = pd.DataFrame(data=y_test_pred)
y_pred_1.head()

In [None]:
# Wrap the actual test labels in a DataFrame
y_test_df = pd.DataFrame(data=y_test)

In [None]:
# Keep the original row label as a 'Lead' column; the index itself is reset in the next step
lead_labels = y_test_df.index
y_test_df['Lead'] = lead_labels

In [None]:
# Give both frames a fresh 0..n-1 index so they line up row by row when concatenated
y_pred_1 = y_pred_1.reset_index(drop=True)
y_test_df = y_test_df.reset_index(drop=True)

In [None]:
# Place the actual labels and the predictions side by side (column-wise)
y_pred_final = pd.concat([y_test_df, y_pred_1], axis='columns')

In [None]:
# Shape of the combined test-set frame (actual labels plus predictions)
y_pred_final.shape

In [None]:
# The prediction column is labelled 0; give it a descriptive name
column_names = {0: 'Lead_Score_Prob'}
y_pred_final = y_pred_final.rename(columns=column_names)

In [None]:
# Put the columns in the order: lead label, actual outcome, predicted probability
column_order = ['Lead', 'Converted', 'Lead_Score_Prob']
y_pred_final = y_pred_final.reindex(columns=column_order)

In [None]:
# Lead_Score = predicted probability scaled by 100, rounded and stored as an integer
scaled_prob = y_pred_final['Lead_Score_Prob'].mul(100).round(0)
y_pred_final['Lead_Score'] = scaled_prob.astype(int)

In [None]:
# Preview y_pred_final after the Lead_Score column has been added
y_pred_final.head()

In [None]:
# Flag hot leads on the test set with the same 0.33 cutoff chosen on the training set
above_cutoff = y_pred_final['Lead_Score_Prob'].gt(0.33)
y_pred_final['Final_Predicted_Hot_Lead'] = above_cutoff.astype('int64')

In [None]:
# Preview y_pred_final including the Final_Predicted_Hot_Lead flag
y_pred_final.head()

In [None]:
# Overall test-set accuracy at the 0.33 cutoff, rounded to two decimals
test_accuracy = metrics.accuracy_score(
    y_pred_final['Converted'],
    y_pred_final['Final_Predicted_Hot_Lead'],
)
round(test_accuracy, 2)

In [None]:
# Confusion matrix of actual conversions vs. the 0.33-cutoff predictions (test set)
confusion3 = metrics.confusion_matrix(
    y_pred_final['Converted'],
    y_pred_final['Final_Predicted_Hot_Lead'],
)
confusion3

In [None]:
# Flatten the 2x2 matrix row by row: [0,0], [0,1], [1,0], [1,1]
# i.e. true negatives, false positives, false negatives, true positives
TN, FP, FN, TP = confusion3.ravel()

In [None]:
# Sensitivity: true positives as a share of all actual positives (TP + FN)
actual_positives = float(TP + FN)
round(TP / actual_positives, 2)

In [None]:
# Specificity: true negatives as a share of all actual negatives (TN + FP)
actual_negatives = float(TN + FP)
round(TN / actual_negatives, 2)

#### Observation And Conclusion (on Test Set)
As we can see above when cut-off = 0.33, the various Model Performance parameters on test set are as per below
- Sensitivity = 80%
- Specificity = 80%
- Accuracy = 80%

All three performance parameters on the test set are almost the same, without much variation, so we are satisfied with the model.

### Final Model Reporting & Equation

**log odds = 1.3837 + (1.0659 × Total Time Spent on Website) + (1.1280 × Lead Source_Olark chat) + (3.5984 × Lead Source_Reference) + (5.4963 × Lead Source_Welingak website) + (-1.2127 × Last Activity_Converted to Lead) + (-1.7984 × Last Activity_Email Bounced) + (2.1604 × Last Activity_Had a Phone Conversation) + (-1.4009 × Last Activity_Olark Chat Conversation) + (1.1884 × Last Activity_SMS Sent) + (-2.8435 × What is your current occupation_Other) + (-2.3752 × What is your current occupation_Student) + (-2.7984 × What is your current occupation_Unemployed)**

### Insights
- Hot Leads are identified as 'Customers having lead score of 33 or above'
- Sales Team of the company should first focus on the 'Hot Leads'
- Higher the Lead Score, higher the chances of conversion of 'Hot Leads' into 'Paying Customers'
- The 'Cold Leads' (customers having a lead score < 33) should be focused on only after the Sales Team is done with the 'Hot Leads'

### Generating Leads Table
Assigning Lead score to the respective lead numbers present in our original dataset.

### For training set

In [None]:
# Keep only the reporting columns, in a fixed order (the per-cutoff indicator columns are dropped)
report_columns = ['Lead', 'Converted', 'Lead_Score_Prob', 'Lead_Score', 'Final_Predicted_Hot_Lead']
y_train_pred_final = y_train_pred_final.reindex(columns=report_columns)
y_train_pred_final

In [None]:
# Join the training-set scores to the original data: 'Lead' holds the row label of df
resultingTable1 = y_train_pred_final.merge(
    df,
    how='inner',
    left_on='Lead',
    right_index=True,
)
resultingTable1[['Lead Number', 'Lead_Score']].head()

### For testing set

In [None]:
# Join the test-set scores to the original data: 'Lead' holds the row label of df
resultingTable2 = y_pred_final.merge(
    df,
    how='inner',
    left_on='Lead',
    right_index=True,
)
resultingTable2[['Lead Number', 'Lead_Score']].head()

### Merging both resulting tables

In [None]:
# Stack the training and test result tables row-wise into one table
tables_to_stack = [resultingTable1, resultingTable2]
result_df = pd.concat(tables_to_stack, axis=0)

In [None]:
# Display the combined table (train + test rows)
result_df

In [None]:
# The merge left two copies of the target, 'Converted_x' and 'Converted_y':
# keep the first under its original name and drop the duplicate
result_df = (
    result_df
    .rename(columns={'Converted_x': 'Converted'})
    .drop(['Converted_y'], axis=1)
)

In [None]:
# Display the final table; its shape is referred to in the observation below
result_df

### Observation
From the resultant shape we can confirm that the number of rows in the final dataset is the same as in the original. Therefore, we can use the values from the 'result_df' dataset to pursue the leads, based on the key insights identified above.

In [None]:
# Coefficients of the final model, shown with two decimals.
# The first entry of res.params is skipped (presumably the intercept added by add_constant).
pd.set_option('display.float_format', '{:.2f}'.format)
new_params = res.params.iloc[1:]
new_params

In [None]:
# Express every coefficient as a percentage of the largest coefficient
largest_coefficient = new_params.max()
feature_importance = 100.0 * (new_params / largest_coefficient)
feature_importance