# LEAD SCORING CASE STUDY - (PGDDS - C28)

## BY SEYED JAVIDH & VIVEK CHOWDHURY

### Importing Libraries

In [None]:
#Importing all the necessary libraries.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve, f1_score, accuracy_score

pd.set_option("display.max_columns", 50)

### Reading the DataFrame

In [None]:
#Reading the leads CSV (relative input path) into a dataframe and previewing the first rows.

leads=pd.read_csv("../input/lead-score/Leads.csv")
leads.head()

### DataFrame Inspections

In [None]:
#Checking the number of rows and columns.

leads.shape

In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.

leads.describe()

In [None]:
#Checking the data type and non-null count of every column.

leads.info()

In [None]:
#Counting the null values in each column.

leads.isnull().sum()

In [None]:
#Percentage of null values in each column, rounded to two decimals.

null_fraction = leads.isnull().sum() / len(leads.index)
(100 * null_fraction).round(2)

In [None]:
#Counting the distinct values in each column (columns with a single value carry no information).

leads.nunique()

## Data Cleaning and Visualization

In [None]:
#Each lead should appear only once, so rows repeating a 'Prospect ID' are removed
#(drop_duplicates keeps the first occurrence by default).

leads = leads.drop_duplicates(subset=['Prospect ID'])

#If the row count matches the earlier shape check, no duplicates were present.

leads.shape

In [None]:
#Columns that hold only one unique value cannot help the model, so they are dropped.

single_value_columns = [
    'Magazine',
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
]

leads = leads.drop(columns=single_value_columns)

In [None]:
#The value "Select" is equivalent to no value provided, so it is converted to NaN everywhere.

leads = leads.replace(to_replace='Select', value=np.nan)

In [None]:
#Columns in which 45% or more of the values are null are dropped.

col=leads.columns

#Null percentage per column, computed once for the whole frame instead of column by column.
null_percentage = 100 * (leads[col].isnull().sum() / len(leads.index))
sparse_columns = null_percentage[null_percentage >= 45].index

#`columns=` is used because recent pandas releases (2.0+) no longer accept `axis` as a
#positional argument of DataFrame.drop.
leads = leads.drop(columns=sparse_columns)

In [None]:
#Checking for null values after dropping the columns with >= 45% nulls.

leads.isnull().sum()

In [None]:
#Prospect ID and Lead Number are identifiers, not predictors, so they are dropped.
#`columns=` replaces the positional `axis` argument, which recent pandas releases (2.0+) reject.

leads = leads.drop(columns=['Prospect ID', 'Lead Number'])

In [None]:
#Checking the values of the column: Lead Source (nulls included in the counts).

leads['Lead Source'].value_counts(dropna=False)

In [None]:
#Many Lead Source values occur only rarely, so they are clubbed together as 'Others'.
#The lower-case 'google' is merged into 'Google' and nulls are also labelled 'Others'.

rare_lead_sources = ['bing','Click2call','Press_Release','Live Chat','NC_EDM','testone','youtubechannel',
                     'Pay per Click Ads','welearnblog_Home','WeLearn','blog','Facebook','Social Media']

leads['Lead Source'] = (
    leads['Lead Source']
    .replace('google', 'Google')
    .replace(np.nan, 'Others')
    .replace(rare_lead_sources, 'Others')
)

In [None]:
#Bar chart of the number of leads per Lead Source.

lead_source_counts = leads['Lead Source'].value_counts()

plt.figure(figsize=[20,5])

ax = sns.barplot(x=lead_source_counts.index, y=lead_source_counts.values)
ax.set_title("Lead Source", fontsize=30, color='Teal', pad = 20)

plt.xlabel('Lead Source', fontsize= 20, color='Brown')
plt.xticks(rotation=45, size = 12)
plt.yticks(size = 12)

plt.show()

##### **`INFERENCE`** 

- From the above graph we can see that most of the leads came from Google or from Direct Traffic (directly typing in the URL).
- It is also worth noting that third place is occupied by leads coming from Olark Chat rather than Organic Search.

In [None]:
#Checking the values of the column: Last Activity (nulls included in the counts).

leads['Last Activity'].value_counts(dropna=False)

In [None]:
#Many Last Activity values occur only rarely, so they (and the nulls) are clubbed together as 'Others'.

rare_last_activities = ['Resubscribed to emails','Visited Booth in Tradeshow','Email Received',
                        'Email Marked Spam','View in browser link Clicked',
                        'Approached upfront','Had a Phone Conversation','Unsubscribed','Unreachable']

leads['Last Activity'] = (
    leads['Last Activity']
    .replace(np.nan, 'Others')
    .replace(rare_last_activities, 'Others')
)

In [None]:
#Bar chart of the number of leads per Last Activity.

last_activity_counts = leads['Last Activity'].value_counts()

plt.figure(figsize=[22,5])

ax = sns.barplot(x=last_activity_counts.index, y=last_activity_counts.values)
ax.set_title("Last Activity", fontsize=30, color='Teal', pad = 20)

plt.xticks(rotation=45, size = 12)
plt.yticks(size = 12)

plt.show()

##### **`INFERENCE`**

- Here, we can notice that most of the leads had opened their email to check the news about X Education.
- This is followed by SMS Sent and Olark Chat Conversation.
- We can also see that very few leads had either clicked on the email link or submitted the form on the website.

In [None]:
# Checking how skewed the data in the column (TotalVisits) is.
# sns.histplot replaces the deprecated sns.distplot; stat="density" with a KDE overlay
# keeps the same kind of view (normalized histogram plus density curve).

plt.figure(figsize=[15,5])

sns.histplot(x=leads['TotalVisits'], kde=True, stat="density",
             kde_kws=dict(cut=3)).set_title("TotalVisits", fontsize=20, color='Teal', pad=20)
plt.show()

In [None]:
#Further checking the share of null values in the column: TotalVisits.
#The null fraction is multiplied by 100 so the printed number really is a percentage.

total_visits_null_pct = 100 * leads.TotalVisits.isnull().sum() / len(leads.TotalVisits)

print("Percentage of nulls: " + str(round(total_visits_null_pct, 2)) + "%")

In [None]:
#Checking the TotalVisits column: boxplot (top) and distribution (bottom).

plt.figure(figsize=[15,8])

sns.set_style('darkgrid')

#The series is passed as `x=` explicitly: a positional series is interpreted differently
#across seaborn versions, which can flip the orientation of the box.
plt.subplot(2,1,1)
sns.boxplot(x=leads['TotalVisits']).set_title("TotalVisits", fontsize=20, color='Teal', pad=20)

#sns.histplot with a KDE overlay replaces the deprecated sns.distplot.
plt.subplot(2,1,2)
sns.histplot(x=leads['TotalVisits'], color='g', kde=True, stat="density", kde_kws=dict(cut=3))

plt.show()

##### **`INFERENCE`**

- If we observe the boxplot, we can see that there are definitely some outliers in the range of 250. It shows that some people have visited the page around 250 times.
- It can also be observed from the histogram that most of the visits are in the range of 0 to 25. There are very few leads who have visited the page more than 25 times.

In [None]:
#Checking the percentile values (the summary covers every numerical column; TotalVisits is the one of interest here).

leads.describe(percentiles = [0.05, 0.5, 0.75, 0.90, 0.95, 0.98, 0.99])

In [None]:
#TotalVisits jumps sharply between the 99th percentile and the maximum, so rows above the
#99th percentile are removed; rows below the 5th percentile of the remaining data are removed too.
#Rows whose TotalVisits is null fail both comparisons and are therefore dropped as well.

plt.figure(figsize=[8,5])

Q3 = leads['TotalVisits'].quantile(0.99)
leads = leads.loc[leads['TotalVisits'].le(Q3)]

Q1 = leads['TotalVisits'].quantile(0.05)
leads = leads.loc[leads['TotalVisits'].ge(Q1)]

ax = sns.boxplot(y=leads['TotalVisits'])
ax.set_title("TotalVisits", fontsize=20, color='Teal', pad=20)
plt.ylabel("Total Visits",size = 15)
plt.show()

In [None]:
# Checking how skewed the data in the column (Page Views Per Visit) is.
# sns.histplot replaces the deprecated sns.distplot; stat="density" with a KDE overlay
# keeps the same kind of view (normalized histogram plus density curve).

plt.figure(figsize=[15,5])

sns.histplot(x=leads['Page Views Per Visit'], kde=True, stat="density",
             kde_kws=dict(cut=3)).set_title("Page Views Per Visit", fontsize=20, color='Teal', pad=20)
plt.show()

In [None]:
#Further checking the share of null values in the column: Page Views Per Visit.
#The null fraction is multiplied by 100 so the printed number really is a percentage.

page_views_null_pct = 100 * leads['Page Views Per Visit'].isnull().sum() / len(leads['Page Views Per Visit'])

print("Percentage of nulls: " + str(round(page_views_null_pct, 2)) + "%")

In [None]:
# Checking the Page Views Per Visit column: boxplot (top) and distribution (bottom).

plt.figure(figsize=[15,8])

#The series is passed as `x=` explicitly: a positional series is interpreted differently
#across seaborn versions, which can flip the orientation of the box.
plt.subplot(2,1,1)
sns.boxplot(x=leads['Page Views Per Visit']).set_title("Page Views Per Visit", fontsize=20, color='Teal', pad=20)

#sns.histplot with a KDE overlay replaces the deprecated sns.distplot.
plt.subplot(2,1,2)
sns.histplot(x=leads['Page Views Per Visit'], color='g', kde=True, stat="density", kde_kws=dict(cut=3))

plt.show()

##### **`INFERENCE`** 

- From the boxplot we can definitely see that there are outliers in the data.
- On the other hand, from the histogram we can see that the data is definitely skewed, with most of the data falling in the 0 to 10 range.

In [None]:
#Checking the percentile values (the summary covers every numerical column; Page Views Per Visit is the one of interest here).

leads.describe(percentiles = [0.05, 0.5, 0.75, 0.90, 0.95, 0.98, 0.99])

In [None]:
#Page Views Per Visit jumps sharply between the 99th percentile and the maximum, so rows above
#the 99th percentile are removed; rows below the 5th percentile of the remaining data are removed too.

plt.figure(figsize=[8,5])

Q3 = leads["Page Views Per Visit"].quantile(0.99)
leads = leads[(leads["Page Views Per Visit"] <= Q3)]

#The lower bound must come from the same column that is being filtered
#(it was previously taken from TotalVisits by mistake).
Q1 = leads["Page Views Per Visit"].quantile(0.05)
leads = leads[(leads["Page Views Per Visit"] >= Q1)]

sns.boxplot(y=leads["Page Views Per Visit"]).set_title("Page Views Per Visit", fontsize=20, color='Teal', pad=20)
plt.ylabel("Page Views Per Visit",size = 15)
plt.show()

In [None]:
#Checking all the values and their occurrences in the column: Country (nulls included).

leads['Country'].value_counts(dropna=False)

In [None]:
#Bar chart of the number of leads per Country.

country_counts = leads['Country'].value_counts()

plt.figure(figsize=[20,5])

ax = sns.barplot(x=country_counts.index, y=country_counts.values)
ax.set_title("Country", fontsize=30, color='Teal', pad = 20)

plt.xlabel('Country', fontsize= 20, color='Brown')
plt.xticks(rotation=45, size = 12)
plt.yticks(size = 12)

plt.show()

##### **`INFERENCE`**

- Since the column, country is highly dominated by the value of India, it is best to leave it out of our model.
- Dropping this column will be best.

In [None]:
#Country is dominated by a single value, so the column is removed.

leads = leads.drop(columns="Country")

In [None]:
#Checking all the values and their occurrences in the column: Specialization (nulls included).

leads['Specialization'].value_counts(dropna=False)

In [None]:
#NaN occurs about 3x as often as the 2nd most frequent value in this column, so instead of
#imputing it, the missing values get their own category: 'Unknown'.

leads['Specialization'] = leads['Specialization'].replace(np.nan, 'Unknown')

In [None]:
# Several specializations are management related, so they are merged into a single bin.

management_specializations = ['Finance Management','Human Resource Management','Marketing Management',
                              'Operations Management','IT Projects Management','Supply Chain Management',
                              'Healthcare Management','Hospitality Management','Retail Management']

leads['Specialization'] = leads['Specialization'].replace(management_specializations, 'Management Specialization')

In [None]:
#Bar chart of the number of leads per Specialization.

specialization_counts = leads['Specialization'].value_counts()

plt.figure(figsize=[25,5])

ax = sns.barplot(x=specialization_counts.index, y=specialization_counts.values)
ax.set_title("Specialization", fontsize=30, color='Teal', pad = 20)

plt.xticks(rotation=45, size = 13)
plt.yticks(size = 13)

plt.show()

##### **`INFERENCE`** 

- From the above bar graph, we can see that people with a specialization in the Management domains are the most common visitors.
- However, people from Rural and Agribusiness, E-Business and Services Excellence are among the least frequent visitors.

In [None]:
#Checking all the values and their occurrences in the column: "What is your current occupation" (nulls included).

leads['What is your current occupation'].value_counts(dropna=False)

In [None]:
#Missing occupations are imputed as 'Unemployed'; the infrequent 'Housewife' and 'Businessman'
#values are binned together with 'Other' under the label 'Others'.

occupation_col = 'What is your current occupation'

leads[occupation_col] = (
    leads[occupation_col]
    .replace(np.nan, 'Unemployed')
    .replace(['Housewife','Businessman','Other'], 'Others')
)

In [None]:
#Bar chart of the number of leads per current occupation.

occupation_counts = leads['What is your current occupation'].value_counts()

plt.figure(figsize=[20,5])

ax = sns.barplot(x=occupation_counts.index, y=occupation_counts.values)
ax.set_title("What is your current occupation", fontsize=30, color='Teal', pad = 20)

plt.xticks(rotation=45, size = 12)
plt.yticks(size = 12)

plt.show()

##### **`INFERENCE`**

- Here we can observe that most of the audience are unemployed, with only a very small proportion of people who are working or studying.

In [None]:
#Checking all the values and their occurrences in the column: "What matters most to you in choosing a course" (nulls included).

leads['What matters most to you in choosing a course'].value_counts(dropna=False)

In [None]:
#Imputing the NaN values with 'Better Career Prospects'.

leads['What matters most to you in choosing a course'] = leads['What matters most to you in choosing a course'].replace(np.nan,'Better Career Prospects')

In [None]:
#Horizontal bar chart of the share of leads per "What matters most to you in choosing a course" value.

course_priority_share = leads['What matters most to you in choosing a course'].value_counts(normalize=True)

plt.figure(figsize=[20,5])

ax = course_priority_share.plot.barh(color='c')
ax.set_title("What matters most to you in choosing a course", fontsize=20, color='Teal', pad=20)

plt.xticks(rotation=45, size = 12)
plt.yticks(size = 12)

plt.show()

##### **`INFERENCE`**

- The above Bar graph concludes that most of the people are looking for Better Career Prospects.
- But since this value is highly dominated by just one value, we can drop this column.

In [None]:
#The column is dominated by a single value, so it is removed.

leads = leads.drop(columns='What matters most to you in choosing a course')

In [None]:
#Checking all the values and their occurrences in the column: "Tags" (nulls included).

leads['Tags'].value_counts(dropna=False)

In [None]:
#Missing tags get their own 'Unknown' category; tags occurring fewer than 100 times are
#binned together as 'Other_Reasons'.

rare_tags = ['In confusion whether part time or DLP', 'in touch with EINS','Diploma holder (Not Eligible)',
             'Approached upfront','number not provided', 'opp hangup','Still Thinking',
             'Lost to Others','Shall take in the next coming month','Lateral student','Interested in Next batch',
             'Recognition issue (DEC approval)','Want to take admission but has financial problems',
             'University not recognized','switched off','Already a student','Not doing further education',
             'invalid number','wrong number given','Interested  in full time MBA']

leads['Tags'] = (
    leads['Tags']
    .replace(np.nan, 'Unknown')
    .replace(rare_tags, 'Other_Reasons')
)

In [None]:
#Bar chart of the number of leads per Tag.

tag_counts = leads['Tags'].value_counts()

plt.figure(figsize=[20,5])

ax = sns.barplot(x=tag_counts.index, y=tag_counts.values)
ax.set_title("Tags", fontsize=30, color='Teal', pad = 20)

plt.xticks(rotation=45, size=13)
plt.yticks(size=13)

plt.show()

In [None]:
#Checking all the values and their occurrences in the column: "City" (nulls included).

leads['City'].value_counts(dropna=False)

In [None]:
#Imputing the NaN values in City as 'Mumbai'.

leads['City'] = leads['City'].replace(np.nan,'Mumbai')

In [None]:
#Horizontal bar chart of the share of leads per City.

city_share = leads['City'].value_counts(normalize=True)

plt.figure(figsize=[20,5])

ax = city_share.plot.barh(color='orchid')
ax.set_title("City", fontsize=20, color='Teal', pad=20)

plt.xticks(rotation=45, size=13)
plt.yticks(size=13)

plt.show()

In [None]:
#City is also dominated by one single value, so the column is removed.

leads = leads.drop(columns='City')

In [None]:
#Checking whether any null values are left in the dataframe.

leads.isnull().sum()

In [None]:
#Checking all the values and their occurrences in the column: "Lead Origin".

leads['Lead Origin'].value_counts()

In [None]:
#Horizontal bar chart of the share of leads per Lead Origin.

plt.figure(figsize=[20,5])

leads['Lead Origin'].value_counts(normalize=True).plot.barh(color='darkorchid').set_title("Lead Origin", fontsize=20, 
                                                                                          color='Teal', pad=20)

#On a horizontal bar chart the categories sit on the y-axis and the bar length (the normalized
#share) runs along the x-axis, so the labels are assigned accordingly.
plt.xlabel('Proportion of leads', fontsize= 20, color='Brown')
plt.ylabel('Lead Origin', fontsize= 20, color='Brown')
plt.xticks(rotation=45, size=12)
plt.yticks(size=12)

plt.show()

##### **`INFERENCE`** 

- We can observe that most of the leads are from the landing page and the API.

In [None]:
#Checking all the values and their occurrences in the column: "Do Not Email".

leads['Do Not Email'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "Do Not Call".

leads['Do Not Call'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "Search".

leads['Search'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "Newspaper Article".

leads['Newspaper Article'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "X Education Forums".

leads['X Education Forums'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "Newspaper".

leads['Newspaper'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "Digital Advertisement".

leads['Digital Advertisement'].value_counts()

In [None]:
#Checking all the values and their occurrences in the column: "Through Recommendations".

leads['Through Recommendations'].value_counts()

In [None]:
#Each of the columns inspected above is dominated by a single value, so they are all removed.

cols = [
    'Do Not Email',
    'Do Not Call',
    'Search',
    'Newspaper Article',
    'Digital Advertisement',
    'Through Recommendations',
    'X Education Forums',
    'Newspaper',
]

leads = leads.drop(columns=cols)

In [None]:
#Checking all the values and their occurrences in the target column: "Converted".

leads['Converted'].value_counts()

In [None]:
#Horizontal bar chart of the share of converted vs. non-converted leads.

converted_share = leads['Converted'].value_counts(normalize=True)

plt.figure(figsize=[20,5])

ax = converted_share.plot.barh(color='skyblue')
ax.set_title("Converted", fontsize=20, color='Teal', pad=20)

plt.xticks(rotation=45, size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`** 

- Here we can see that there are almost 38% of candidates getting converted.
- But a majority of more than 60% are not converted

In [None]:
#Checking for any imbalance in the Converted column.

converted = leads[leads['Converted'] == 1].shape[0]
non_converted = leads[leads['Converted'] != 1].shape[0]

#Ratio of converted to non_converted leads; 1.0 means perfectly balanced classes.
#The guard avoids a ZeroDivisionError when there are no non-converted leads.
ratio = float(converted / non_converted) if non_converted else float('inf')

#Rule of thumb (tunable): the classes count as balanced while the smaller class is at least
#half the size of the larger one, i.e. the ratio lies within [0.5, 2].
#A ratio close to 1 must NOT be reported as imbalanced, which the previous check did.
BALANCE_THRESHOLD = 0.5

if BALANCE_THRESHOLD <= ratio <= 1 / BALANCE_THRESHOLD:
    print("The data is balanced.")
else:
    print("The data is imbalanced.")

In [None]:
#Checking the five most frequent values in the column: "Total Time Spent on Website".

leads['Total Time Spent on Website'].value_counts().head()

In [None]:
# Visualizing the column "Total Time Spent on Website": boxplot (top) and distribution (bottom).

plt.figure(figsize=[15,8])

#The series is passed as `x=` explicitly: a positional series is interpreted differently
#across seaborn versions, which can flip the orientation of the box.
plt.subplot(2,1,1)
sns.boxplot(x=leads['Total Time Spent on Website']).set_title("Total Time Spent on Website", fontsize=20, color='Teal', pad=20)

#sns.histplot with a KDE overlay replaces the deprecated sns.distplot.
plt.subplot(2,1,2)
sns.histplot(x=leads['Total Time Spent on Website'], color='g', kde=True, stat="density", kde_kws=dict(cut=3))

plt.show()

##### **`INFERENCE`** 

- Here, we can see from the boxplot that most people spend about 1000 seconds on the website.
- Also, we can see from the histogram that the data is skewed and that most people spend around 500 seconds on the website.

In [None]:
#Checking the percentile values (the summary covers every numerical column; Total Time Spent on Website is the one of interest here).

leads.describe(percentiles = [0.05, 0.5, 0.75, 0.90, 0.95, 0.98, 0.99])

In [None]:
#Rows above the 99th percentile of Total Time Spent on Website are removed, and then rows below
#the 5th percentile of the remaining data are removed too.

plt.figure(figsize=[8,5])

Q3 = leads["Total Time Spent on Website"].quantile(0.99)
leads = leads[(leads["Total Time Spent on Website"] <= Q3)]

#The lower bound must come from the same column that is being filtered
#(it was previously taken from TotalVisits by mistake).
Q1 = leads["Total Time Spent on Website"].quantile(0.05)
leads = leads[(leads["Total Time Spent on Website"] >= Q1)]

sns.boxplot(y=leads["Total Time Spent on Website"]).set_title("Total Time Spent on Website", fontsize=20, color='Teal', pad=20)
plt.ylabel("Total Time Spent on Website",size = 15)
plt.show()

In [None]:
#Checking all the values and their occurrences in the column: "A free copy of Mastering The Interview".

leads['A free copy of Mastering The Interview'].value_counts()

In [None]:
#Encoding the Yes/No values as 1/0 (a binary label mapping, not one-hot encoding).
#Any value other than "No"/"Yes" would become NaN, because it is missing from the mapping.

leads['A free copy of Mastering The Interview']=leads['A free copy of Mastering The Interview'].map({"No":0,"Yes":1})

In [None]:
#Horizontal bar chart of the share of leads who asked for a free copy of Mastering The Interview.

free_copy_share = leads['A free copy of Mastering The Interview'].value_counts(normalize=True)

plt.figure(figsize=[20,5])

ax = free_copy_share.plot.barh(color='wheat')
ax.set_title("A free copy of Mastering The Interview", fontsize=20, color='Teal', pad=20)

plt.xticks(rotation=45, size=13)
plt.yticks(size=13)

plt.show()

In [None]:
#Checking all the values and their occurrences in the column: "Last Notable Activity".

leads['Last Notable Activity'].value_counts()

In [None]:
#Activities occurring fewer than 100 times are clubbed together as "Other_Activities".

rare_notable_activities = [
    'Email Bounced',
    'Unsubscribed',
    'Unreachable',
    'Had a Phone Conversation',
    'Email Marked Spam',
    'Form Submitted on Website',
    'Resubscribed to emails',
    'Email Received',
    'Approached upfront',
    'View in browser link Clicked',
]

leads['Last Notable Activity'] = leads['Last Notable Activity'].replace(rare_notable_activities, 'Other_Activities')

In [None]:
#Bar chart of the number of leads per Last Notable Activity.

notable_activity_counts = leads['Last Notable Activity'].value_counts()

plt.figure(figsize=[20,5])

ax = sns.barplot(x=notable_activity_counts.index, y=notable_activity_counts.values)
ax.set_title("Last Notable Activity", fontsize=30, color='Teal', pad = 20)

plt.xticks(rotation=45, size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`**

- We can observe from the above bar chart that the last activity of the leads are usually- modification, email opened or sending sms.

In [None]:
#Lead Origin vs Converted: lead counts per origin, split by conversion outcome
#(categories ordered from most to least frequent).

lead_origin_order = leads['Lead Origin'].value_counts().index
axis_label_font = {'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'}

plt.figure(figsize=[20,10])

ax = sns.countplot(data=leads, y='Lead Origin', hue='Converted',
                   order=lead_origin_order, palette='magma')
ax.set_title("Lead Origin Vs Converted ", fontsize=30, color='Teal', pad=20)

plt.ylabel("Lead Origin", fontdict=axis_label_font)
plt.xlabel("Count", fontdict=axis_label_font)
plt.xticks(size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`** 

- Here, we can see that leads whose origin is the Lead Add Form are more likely to get converted later on.
- The ratio of leads converted from the Landing Page Submission and API origins looks reasonable, though not as good as that of the Lead Add Form.

In [None]:
# Lead Source vs Converted: lead counts per source, split by conversion outcome
# (categories ordered from most to least frequent).

lead_source_order = leads['Lead Source'].value_counts().index
axis_label_font = {'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'}

plt.figure(figsize=[20,10])

ax = sns.countplot(data=leads, y='Lead Source', hue='Converted', order=lead_source_order)
ax.set_title("Lead Source Vs Converted ", fontsize=30, color='Teal', pad=20)

plt.ylabel("Lead Source", fontdict=axis_label_font)
plt.xlabel("Count", fontdict=axis_label_font)
plt.xticks(size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`**

- From the above graph we can see that leads who come through a reference, from the Welingak website, or from any of the other sources are more likely to get converted.
- Leads from Google are also quite likely to get converted. 

In [None]:
#Last Activity vs Converted: lead counts per last activity, split by conversion outcome
#(categories ordered from most to least frequent).

last_activity_order = leads['Last Activity'].value_counts().index
axis_label_font = {'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'}

plt.figure(figsize=[20,10])

ax = sns.countplot(data=leads, y='Last Activity', hue='Converted',
                   order=last_activity_order, palette='mako')
ax.set_title("Last Activity Vs Converted ", fontsize=30, color='Teal', pad=20)

plt.ylabel("Last Activity", fontdict=axis_label_font)
plt.xlabel("Count", fontdict=axis_label_font)
plt.xticks(size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`**

- Here, we can notice that the leads whose last activity is sending an SMS are really good to target, as they are more likely to get converted.
- However, we should avoid leads whose last activities are Olark Chat Conversation or Email Bounced, as well as leads that are already converted.

In [None]:
#Specialization vs Converted: lead counts per specialization, split by conversion outcome
#(categories ordered from most to least frequent).

specialization_order = leads['Specialization'].value_counts().index
axis_label_font = {'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'}

plt.figure(figsize=[20,15])

ax = sns.countplot(data=leads, y='Specialization', hue='Converted',
                   order=specialization_order, palette='icefire')
ax.set_title("Specialization Vs Converted ", fontsize=30, color='Teal', pad=20)

plt.ylabel("Specialization", fontdict=axis_label_font)
plt.xlabel("Count", fontdict=axis_label_font)
plt.xticks(size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`** 

- From the graph we can understand that leads from Management, Business Administration, and Banking, Investment and Insurance are more likely to get converted.
- However, people who do not mention their specialization are less likely to be converted.

In [None]:
#Current occupation vs Converted: lead counts per occupation, split by conversion outcome
#(categories ordered from most to least frequent).

occupation_order = leads['What is your current occupation'].value_counts().index
axis_label_font = {'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'}

plt.figure(figsize=[20,10])

ax = sns.countplot(data=leads, y='What is your current occupation', hue='Converted',
                   order=occupation_order, palette='cubehelix')
ax.set_title("What is your current occupation Vs Converted ", fontsize=30, color='Teal', pad=20)

plt.ylabel("What is your current occupation", fontdict=axis_label_font)
plt.xlabel("Count", fontdict=axis_label_font)
plt.xticks(size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`**

- From the bar graph we can clearly see that Leads who are working should be targetted, followed by students.
- Unemployed leads are the worst category to target.

In [None]:
#Tags vs Converted: lead counts per tag, split by conversion outcome
#(categories ordered from most to least frequent).

tags_order = leads['Tags'].value_counts().index
axis_label_font = {'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'}

plt.figure(figsize=[20,15])

ax = sns.countplot(data=leads, y='Tags', hue='Converted',
                   order=tags_order, palette='viridis')
ax.set_title("Tags Vs Converted ", fontsize=30, color='Teal', pad=20)

plt.ylabel("Tags", fontdict=axis_label_font)
plt.xlabel("Count", fontdict=axis_label_font)
plt.xticks(size=13)
plt.yticks(size=13)

plt.show()

##### **`INFERENCE`**

- It can be observed from the plot above that Leads who are tagged as "Will revert back after reading the email" are more likely to be converted followed by "Closed by Horizzon" and "Lost to EINS".
- Leads who are still graduating, interested in other course or their phone ringing and not picking up are less likely to get converted.

In [None]:
#Last Notable Activity vs Converted: lead counts per activity, split by conversion outcome
#(categories ordered from most to least frequent).

notable_activity_order = leads['Last Notable Activity'].value_counts().index

plt.figure(figsize=[20,15])

ax = sns.countplot(data=leads, y='Last Notable Activity', hue='Converted',
                   order=notable_activity_order, palette='flare')
ax.set_title("Last Notable Activity Vs Converted ", fontsize=30, color='Teal', pad=20)

#The y label uses a larger font (25) than the x label (20), plus trailing newlines for padding.
plt.ylabel("Last Notable Activity\n\n", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})
plt.xlabel("Count", fontdict={'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(size=13)
plt.yticks(size=13, rotation=0)

plt.show()

##### **`INFERENCE`**

- The result here is very similar to that of Last Activity performed by the Lead.

In [None]:
#Listing all the columns that are left after the cleaning steps.

print(list(leads.columns), end="")

In [None]:
#Percentile summary of all the numerical columns, rounded to two decimals.

summary_percentiles = [0.10,0.25,0.50,0.60,0.75,0.90,0.95,0.99,0.999]

leads.describe(percentiles=summary_percentiles).round(2)

In [None]:
#Correlation heatmap of the numerical columns.

plt.figure(figsize=[12,12])

#Only numeric columns are correlated: the frame still holds object (categorical) columns at
#this point, and DataFrame.corr() raises on them in pandas 2.0+ instead of skipping them.
numeric_leads = leads.select_dtypes(include='number')

sns.heatmap(numeric_leads.corr(), cmap = "YlGnBu", annot=True).set_title('Correlation between variables\n', fontsize = 25, color='Teal')
plt.yticks(rotation=0)
plt.show()

##### **`INFERENCE`**

- The heatmap clearly shows us that there is a strong correlation between the "Page Views Per Visit" and "TotalVisits" columns.
- A similar positive correlation can be identified between "Total Time Spent on Website" and the "Converted" value.
- There is also a positive correlation between "Total Time Spent on Website" and both "TotalVisits" and "Page Views Per Visit".

### Preparing the Dataset for Modelling

In [None]:
#Before we proceed with the modelling, we need to create dummies for the categorical columns.

#Finding all the categorical (object dtype) columns. They are stored as `cols_to_drop`
#because the original columns are dropped once their dummy columns have been created.

cols_to_drop = leads.select_dtypes(include=[object]).columns
cols_to_drop

In [None]:
#Creating dummy (indicator) columns for all the categorical columns.

#"Lead Origin" and "What is your current occupation": the first level of each is dropped.

dummy = pd.get_dummies(leads[['Lead Origin','What is your current occupation']], drop_first=True)

leads = pd.concat([leads,dummy], axis=1)

#For the remaining columns a specific catch-all level is used as the reference category,
#so its dummy column is the one that gets dropped.

reference_levels = {
    'Specialization': 'Unknown',
    'Lead Source': 'Others',
    'Last Activity': 'Others',
    'Tags': 'Other_Reasons',
    'Last Notable Activity': 'Other_Activities',
}

for column, reference in reference_levels.items():
    dummy = pd.get_dummies(leads[column], prefix=column)
    dummy = dummy.drop(columns=[f'{column}_{reference}'])
    leads = pd.concat([leads, dummy], axis=1)

In [None]:
#The original categorical columns are no longer needed now that their dummies exist.

leads = leads.drop(columns=cols_to_drop)

In [None]:
leads.info() #Checking the columns and dtypes that exist after the dummy creation

In [None]:
#Resetting the index of the dataframe after the row filtering done above.
#reset_index returns a new frame, so the result is assigned back; drop=True stops the old
#index from being added as an extra 'index' column (which would otherwise become a feature).

leads = leads.reset_index(drop=True)

leads.head()

### Test Train Split

In [None]:
#Separating the predictors (X) from the target (y).

target_col = "Converted"

X = leads.drop(columns=[target_col])

y = leads[target_col]

In [None]:
X.head() #Verifying that the target column is no longer among the predictors

In [None]:
y.head() #Verifying that y holds the Converted target values

In [None]:
#Performing the train_test_split: 70% of the rows go to the training set;
#random_state is fixed so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 30)

In [None]:
X_train.shape #Verifying the size of the training split

In [None]:
X_test.shape #Verifying the size of the test split

### Scaling the Data

In [None]:
#Standardizing the numerical columns of the training set.
#The scaler is fitted on the training data only.

scale = StandardScaler() #creating an object of the class

#Finding the numerical columns.
#NOTE(review): the int64 filter presumably also selects the 0/1 column
#'A free copy of Mastering The Interview' - confirm that scaling it is intended.
numerical_cols = X_train.select_dtypes(include = ['float64','int64']).columns #Finding the numerical columns

#Now we use the Standardization on the numerical columns:

X_train[numerical_cols] = scale.fit_transform(X_train[numerical_cols])

X_train.head() #Verifying if the scaling happened correctly

## Building the Logistic Regression Model

### USING RECURSIVE FEATURE ELIMINATION (RFE) 
#### TO SELECT TOP 15 PREDICTOR VARIABLES 

In [None]:
#Creating an object of the Logistic Regression class

regression_ = LogisticRegression()

#Using RFE to eliminate the less important columns, keeping 15 predictors.
#n_features_to_select is passed by keyword: recent scikit-learn releases no longer accept
#it as a positional argument.

rfe = RFE(estimator=regression_, n_features_to_select=15)

rfe = rfe.fit(X_train, y_train)

In [None]:
# Boolean mask aligned with X_train.columns: True where RFE kept the feature
rfe.support_

In [None]:
# One row per candidate predictor with its RFE selection flag and rank,
# displayed in ascending rank order.
lead_rfe = pd.DataFrame({
    'PREDICTORS': X_train.columns,
    'SELECTED ': rfe.support_,
    'RANKS': rfe.ranking_,
})
lead_rfe.sort_values(by='RANKS')

In [None]:
# Names of the columns RFE selected; `cols` drives every model built below.

selected_mask = rfe.support_
cols = X_train.columns[selected_mask]
cols

In [None]:
# Columns that RFE did not select (inverse of the support mask)
X_train.columns[~rfe.support_]

### MODEL NUMBER - 1

In [None]:
# Model 1: binomial GLM (logistic regression) on the RFE-selected predictors.
# add_constant appends the intercept column that the GLM is fitted with.

X_train_sm = sm.add_constant(X_train[cols])

binomial_family = sm.families.Binomial()

model1 = sm.GLM(y_train, X_train_sm, family = binomial_family)

result = model1.fit()

result.summary()

##### **`INFERENCE`**

- From the results given by StatsModel, we can see that none of the columns have a very high P value.
- Hence, we will not drop anything here. Instead we will go ahead and find the Variance Inflation Factor to make further decisions.

In [None]:
# Variance Inflation Factor of every predictor currently in cols,
# rounded to 2 decimals and listed from highest to lowest.

X_vif = X_train[cols]
vif_values = X_vif.values

vif = pd.DataFrame()
vif['Features'] = X_vif.columns
vif['VIF'] = [
    variance_inflation_factor(vif_values, position)
    for position in range(X_vif.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

##### **`INFERENCE`**

- Right off the bat, we can see that the Variance Inflation Factors of "Last Activity_SMS Sent" and "Last Notable Activity_SMS Sent" are high.
- This goes to show that these two features are highly correlated with each other.
- In this case, we will drop the feature "Last Activity_SMS Sent" and build the model again.

In [None]:
# Dropping the "Last Activity_SMS Sent" column from the list of columns from before.
# `cols` is a pandas Index, whose drop() has no axis argument; the second
# positional parameter is `errors`, so the stray `1` is removed.

cols = cols.drop('Last Activity_SMS Sent')

In [None]:
cols  # Verify "Last Activity_SMS Sent" is no longer in the predictor list

### MODEL NUMBER - 2

In [None]:
# Model 2: the same binomial GLM, refitted on the reduced predictor list.
# `result` is overwritten here and is the fit used for all later predictions.

X_train_sm = sm.add_constant(X_train[cols])

binomial_family = sm.families.Binomial()

model2 = sm.GLM(y_train, X_train_sm, family = binomial_family)

result = model2.fit()

result.summary()

In [None]:
# No high p-values in model 2, so re-check the Variance Inflation Factors
# for the reduced predictor list (rounded, highest first).

X_vif = X_train[cols]
vif_values = X_vif.values

vif = pd.DataFrame()
vif['Features'] = X_vif.columns
vif['VIF'] = [
    variance_inflation_factor(vif_values, position)
    for position in range(X_vif.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

##### **`INFERENCE`**

- After building the second model, we can see that all the Variance Inflation Factors are now sensible.
- We will use this model to derive our probabilities etc on the Training dataset.

In [None]:
# Predicted conversion probabilities for the training rows, from the latest fit.

y_train_pred = result.predict(X_train_sm)

y_train_pred.iloc[:10]

In [None]:
# Flatten the predictions into a 1-D NumPy array.
# np.asarray accepts both a Series and an ndarray, so re-running this cell
# no longer fails with AttributeError once y_train_pred is already an array.
y_train_pred = np.asarray(y_train_pred).reshape(-1)

In [None]:
# Put the actual outcome next to the predicted probability for each training row.
# Note: the "Prospect ID" column holds y_train's index labels (the row labels
# carried over from leads), which is what identifies each lead here.

y_train_pred_final = pd.DataFrame({
    'Converted': y_train.values,
    'Converted_prob': y_train_pred,
}).assign(**{'Prospect ID': y_train.index})

y_train_pred_final.head()

##### **`INFERENCE`**

- What we get to see here is that, the higher the value, the more likely the Lead is going to get converted.

In [None]:
# Start from an arbitrary cut-off of 0.5: a lead is labelled 1 (converted)
# when its probability is strictly greater than 0.5, otherwise 0.

y_train_pred_final['Predicted'] = [
    1 if prob > 0.5 else 0
    for prob in y_train_pred_final['Converted_prob']
]

y_train_pred_final.head()

### Evaluating the Model

In [None]:
# Confusion matrix of actual vs. predicted labels at the 0.5 cut-off
# (rows are actual classes, columns are predicted classes).

actual_labels = y_train_pred_final['Converted']
predicted_labels = y_train_pred_final['Predicted']

con_matrix = metrics.confusion_matrix(actual_labels, predicted_labels)

print(con_matrix)

In [None]:
# Share of training rows whose label at the 0.5 cut-off matches the actual outcome.

acc = metrics.accuracy_score(y_train_pred_final['Converted'], y_train_pred_final['Predicted'])

print(f"Accuracy in the model is: {round(acc, 2)}")

In [None]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives.

TN, FP, FN, TP = con_matrix.ravel()

In [None]:
# Sensitivity: true positives as a fraction of all actual positives.

actual_positives = float(TP + FN)
round(TP / actual_positives, 2)

In [None]:
# Specificity: true negatives as a fraction of all actual negatives.

actual_negatives = float(TN + FP)
round(TN / actual_negatives, 2)

In [None]:
# False positive rate: false positives as a fraction of all actual negatives.

negatives_total = float(TN + FP)
round(FP / negatives_total, 2)

In [None]:
# Positive predictive value: true positives as a fraction of all predicted positives.

predicted_positives = float(TP + FP)
round(TP / predicted_positives, 2)

In [None]:
# Negative predictive value: true negatives as a fraction of all predicted negatives.

predicted_negatives = float(TN + FN)
round(TN / predicted_negatives, 2)

In [None]:
#Now to check the ROC curve:

def draw_roc( actual, probs ):
    """Plot the ROC curve for a set of predicted probabilities.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure is shown via plt.show(); the AUC appears in the legend.
    """
    # drop_intermediate=False keeps every threshold point on the curve.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate = False
    )
    area_under_curve = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(10, 7))
    plt.plot(false_pos_rate, true_pos_rate,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.title('Receiver operating characteristic example', size = 20, pad = 20, color = "Teal")
    plt.xlabel('\nFalse Positive Rate or [1 - True Negative Rate]', size = 13, color = "Brown")
    plt.ylabel('True Positive Rate\n', size = 13, color = "Brown")

    plt.xticks(size = 13)
    plt.yticks(size = 13)
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# False positive rates, true positive rates and the thresholds that produce them,
# for the training predictions (drop_intermediate=False keeps every threshold).
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Converted_prob, drop_intermediate = False )

In [None]:
# Plot the ROC curve of the training predictions
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Converted_prob)

##### **`INFERENCE`**

- From the ROC curve, we can see that the area under the curve is very high (0.97).
- A high area under the curve indicates that the model is very good.

### Finding the best Cut-Off

In [None]:
# Add one 0/1 label column per candidate cut-off (0.0, 0.1, ..., 0.9).
# Each column is keyed by the cut-off value itself and is 1 where the
# predicted probability is strictly greater than that cut-off.

numbers = [step / 10 for step in range(10)]

for level in numbers:
    y_train_pred_final[level] = [
        1 if prob > level else 0
        for prob in y_train_pred_final.Converted_prob
    ]
y_train_pred_final.head()

- But with this chart alone, we cannot decide.
- A better way to decide will be to create a graph and plot- accuracy sensitivity and specificity for different probabilities.

In [None]:
# Accuracy, sensitivity and specificity at each candidate cut-off.
# Each cut-off value is also the name of a label column in y_train_pred_final.
cutoff = pd.DataFrame( columns = ['Prob','Accuracy','Sensitivity','Specificity'])

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for threshold in num:
    matrix = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[threshold])
    (tn, fp), (fn, tp) = matrix      # rows: actual class, columns: predicted class
    n_rows = matrix.sum()

    accuracy_at = (tn + tp) / n_rows
    sensitivity_at = tp / (fn + tp)
    specificity_at = tn / (tn + fp)
    cutoff.loc[threshold] = [threshold, accuracy_at, sensitivity_at, specificity_at]

print(cutoff)

In [None]:
# Plot the three metrics against the cut-off; the crossing point of the
# curves is read off as the cut-off to use.

metric_columns = ['Accuracy','Sensitivity','Specificity']

cutoff.plot.line(x='Prob', y=metric_columns)
plt.title('Probabilities of Accuracy, Sensitivity and Specificity', size = 15, pad = 20, color = "Teal")
plt.xlabel('\nProbabilities', size = 13, color = "Brown")
plt.yticks(size = 13)
plt.xticks(size = 13)

plt.show()

##### **`INFERENCE`**

- From the above graph, we can make out that the optimal cut-off for our model will be 0.28.
- This is the point where the sensitivity, accuracy and specificity co-exist.

In [None]:
# Final training labels: 1 when the probability is strictly above the
# chosen 0.28 cut-off, otherwise 0.

y_train_pred_final['Final_Predicted'] = [
    1 if prob > 0.28 else 0
    for prob in y_train_pred_final.Converted_prob
]

y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs. final labels at the 0.28 cut-off.

final_labels = y_train_pred_final['Final_Predicted']
con_matrix1 = metrics.confusion_matrix(y_train_pred_final['Converted'], final_labels)
print(con_matrix1)

In [None]:
# Training accuracy using the labels from the 0.28 cut-off.

final_accuracy = metrics.accuracy_score(y_train_pred_final['Converted'], y_train_pred_final['Final_Predicted'])
round(final_accuracy, 2)

In [None]:
# Sensitivity of the model after selecting the optimal cut-off.
# Computed directly from con_matrix1 (the 0.28 cut-off matrix): the TP/FN
# variables still hold the counts from the earlier 0.5 cut-off matrix, so
# reusing them would report the old sensitivity.

round(con_matrix1[1,1] / float(con_matrix1[1,1] + con_matrix1[1,0]), 2)

In [None]:
# Specificity of the model after selecting the optimal cut-off.
# Computed directly from con_matrix1 (the 0.28 cut-off matrix): the TN/FP
# variables still hold the counts from the earlier 0.5 cut-off matrix, so
# reusing them would report the old specificity.

round(con_matrix1[0,0] / float(con_matrix1[0,0] + con_matrix1[0,1]), 2)

##### **`INFERENCE`**

- We have seen that our model is producing a ROC curve with AUC of 0.97.
- This is really good. The higher the value, the better the model.
- Also, the Accuracy, Sensitivity and Specificity of the model are 92%, 88% and 96% respectively. 

In [None]:
# Calculating the Precision at the optimal cut-off:
#   precision = TP / (TP + FP)
# The formula is kept as a comment; as a bare expression `TP / TP + FP` it
# evaluated to 1 + FP (operator precedence) and its value was discarded.

round(con_matrix1[1,1]/(con_matrix1[0,1]+con_matrix1[1,1]),2)

In [None]:
# Calculating the Recall at the optimal cut-off:
#   recall = TP / (TP + FN)
# The formula is kept as a comment; as a bare expression `TP / TP + FN` it
# evaluated to 1 + FN (operator precedence) and its value was discarded.

round(con_matrix1[1,1]/(con_matrix1[1,0]+con_matrix1[1,1]),2)

In [None]:
# Precision on the training set at the 0.28 cut-off, via scikit-learn.

train_precision = precision_score(y_train_pred_final['Converted'], y_train_pred_final['Final_Predicted'])
round(train_precision, 2)

In [None]:
# Recall on the training set at the 0.28 cut-off, via scikit-learn.

train_recall = recall_score(y_train_pred_final['Converted'], y_train_pred_final['Final_Predicted'])
round(train_recall, 2)

In [None]:
# Precision and recall as functions of the probability threshold.
# precision_recall_curve returns one more precision/recall value than
# thresholds, hence the [:-1] slices when plotting against thresholds.
plt.figure(figsize=[10,10])

p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Converted_prob)

plt.plot(thresholds, p[:-1], "y-", label="Precision")
plt.plot(thresholds, r[:-1], "b-", label="Recall")
plt.xlabel("Probability threshold")
plt.ylabel("Score")
plt.title("Precision and Recall vs. Probability Threshold")
plt.legend(loc="best")
plt.show()

### Preparing the Test Set

In [None]:
# Scale the test set with the scaler already fitted on the training data
# (transform only, no refit), so both splits share the same scaling.

numeric_cols=X_test.select_dtypes(include=['float64', 'int64']).columns

scaled_test_values = scale.transform(X_test[numeric_cols])
X_test[numeric_cols] = scaled_test_values

X_test.head()

### Prediction on Testing Dataset

In [None]:
# Keep only the predictors used by the final model, add the intercept column
# the statsmodels fit was trained with, and predict conversion probabilities.
X_test = X_test.loc[:, cols]

X_test_sm = sm.add_constant(X_test)

y_test_pred = result.predict(X_test_sm)

In [None]:
# Preview the first 10 predicted probabilities for the test set
y_test_pred.head(10)

In [None]:
# Wrap the test predictions in a DataFrame so they can be concatenated with y_test
y_pred_ = pd.DataFrame(y_test_pred)

In [None]:
# Preview the prediction frame
y_pred_.head()

In [None]:
# Turn the y_test series into a one-column DataFrame (column "Converted").

y_test_df = y_test.to_frame()

In [None]:
# Copy the index labels into a "Prospect ID" column so that each lead can
# still be identified after the index is reset in the next step.

y_test_df['Prospect ID'] = y_test_df.index

In [None]:
# Replace both indexes with a fresh 0..n-1 range so the two frames line up
# row by row when they are concatenated.

y_pred_ = y_pred_.reset_index(drop=True)
y_test_df = y_test_df.reset_index(drop=True)

In [None]:
# Place y_test_df and y_pred_ side by side (column-wise concatenation).
frames_to_join = [y_test_df, y_pred_]
y_pred_final = pd.concat(frames_to_join, axis=1)

In [None]:
# Preview the combined actual/predicted frame
y_pred_final.head()

In [None]:
# Give the prediction column (currently labelled 0) a descriptive name.

new_names = { 0 : 'Converted_probability'}
y_pred_final = y_pred_final.rename(columns=new_names)

y_pred_final.head()

In [None]:
# Reorder the columns, then derive the lead score: the conversion
# probability expressed on a 0-100 scale and rounded to a whole number.

column_order = ['Prospect ID','Converted','Converted_probability']
y_pred_final = y_pred_final[column_order]
y_pred_final['Lead_Score'] = y_pred_final['Converted_probability'].map(lambda prob: round(prob*100))

y_pred_final.head()

In [None]:
# Label the test leads with the same 0.28 cut-off chosen on the training set:
# 1 when the probability is strictly above 0.28, otherwise 0.
y_pred_final['Final_Predicted'] = [
    1 if prob > 0.28 else 0
    for prob in y_pred_final.Converted_probability
]

y_pred_final.head()

In [None]:
# Confusion matrix of actual vs. predicted labels on the test set.

test_actual = y_pred_final['Converted']
test_predicted = y_pred_final['Final_Predicted']
con_mat_pred = metrics.confusion_matrix(test_actual, test_predicted)
print(con_mat_pred)

In [None]:
# Accuracy of the model on the test set.

test_accuracy = metrics.accuracy_score(y_pred_final['Converted'], y_pred_final['Final_Predicted'])
round(test_accuracy, 2)

In [None]:
# Sensitivity on the test set.
# TP/TN/FP/FN are refreshed from the test confusion matrix (row-major order:
# true negatives, false positives, false negatives, true positives).

TN, FP, FN, TP = con_mat_pred.ravel()

test_actual_positives = float(TP + FN)
round(TP / test_actual_positives, 2)

In [None]:
# Specificity on the test set, from the TN/FP counts of the test confusion matrix.

test_actual_negatives = float(TN + FP)
round(TN / test_actual_negatives, 2)

In [None]:
# Precision on the test set, via scikit-learn.

test_precision = precision_score(y_pred_final['Converted'], y_pred_final['Final_Predicted'])
round(test_precision, 2)

In [None]:
# Recall on the test set, via scikit-learn.

test_recall = recall_score(y_pred_final['Converted'], y_pred_final['Final_Predicted'])
round(test_recall, 2)

##### **`INFERENCE ON THE TRAIN DATASET`**

- When the Logistic Regression was applied to the Train dataset, we can see that the Accuracy, Sensitivity and Specificity are 92%, 88% and 96% respectively.
- The precision score is 88%.
- The recall score is 91%.

##### **`INFERENCE ON THE TEST DATASET`**

- When the Logistic Regression was applied to the Test dataset, we can see that the Accuracy, Sensitivity and Specificity are 93%, 93% and 92% respectively.
- The precision score is 88%.
- The recall score is 93%.



- The model is doing a great job in prediction! 
- This model can, hence, be used to make sound business decisions.

# CONCLUSION:

#### FINAL MODEL EQUATION

**`Log-odds of conversion, ln( p / (1 - p) ) = -4.3841 + ( 1.1248 * Total Time Spent on Website ) + ( 3.0066 * Lead Origin_Lead Add Form ) + ( 1.5997 * Lead Source_Olark Chat ) + ( 2.6224 * Lead Source_Welingak Website ) + ( -1.7582 * Last Activity_Email Bounced) + ( -1.2890 * Last Activity_Olark Chat Conversation ) + ( 3.1471 * Tags_Busy ) + ( 9.1264 * Tags_Closed by Horizzon ) + ( 8.6147 * Tags_Lost to EINS ) + ( -0.9941 * Tags_Ringing ) + ( 2.6034 * Tags_Unknown ) + ( 6.9711 * Tags_Will revert after reading the email ) + ( -0.7132 * Last Notable Activity_Modified ) + ( 2.2050 * Last Notable Activity_SMS Sent )`**

 - **Tags_Closed by Horizzon has the highest coefficient of 9.1264**, which means that, keeping the other variables constant, a unit increase in it raises the log-odds of conversion by 9.1264 (a logistic regression is linear in the log-odds, not in the probability itself).
 - **Tags_Closed by Horizzon**, **Tags_Lost to EINS** and **Tags_Will revert after reading the email** are the **top 3 variables** having strong coefficients.
 - Last Activity_Email Bounced, Last Activity_Olark Chat Conversation, Tags_Ringing and Last Notable Activity_Modified have **negative coefficients**, which means an increase in the values of these variables results in a decrease in the Probability of conversion.
 - Probability of conversion increases if Tags_Busy, Lead Origin_Lead Add Form, Lead Source_Welingak Website, Tags_Unknown, Last Notable Activity_SMS Sent, Lead Source_Olark Chat, Total Time Spent on Website increases as these variables have **positive coefficients**.
 - **Constant value** - when all other variables are zero, the log-odds of conversion equal **-4.3841**, which corresponds to a Probability of conversion of roughly 1.2%.


 - Comparing Precision, Recall and other metrics value for both train and test. Our model performs well on test set as well.
 - This model explains how exactly the Probability of conversion vary with different features. The management can accordingly manipulate the business strategy to meet the conversion target and meet the business expectations.
 - In business terms, this model can be deployed in the upcoming future to meet the X education's requirements.
 - Focusing on the features of the model will increase their chances of contacting most of the potential buyers for the course.
 - The Marketing team can evaluate the leads based on the top 3 variables and make sound business decisions.
 - The Marketing team can also chase after leads who spend a longer time on the website or originate from the Lead Add Form.
 - The team can also come with interesting courses and offers that attract people with specialization in banking, investment and insurance.
 - They can also keep a close watch on Leads originating from Olark Chat.