In [None]:
# Importing useful libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

pd.set_option("display.max_columns",150)
pd.set_option("display.max_rows",150)

In [None]:
# Importing the data
df = pd.read_csv("../input/lead-scoring-x-online-education/Leads X Education.csv")
df.head()

In [None]:
# Checking the shape of data
df.shape

In [None]:
#Checking info of variables in dateframe like data type and null count
df.info()

In [None]:
# Let's check statistical details of numerical variables
df.describe()

## EDA

### Data Cleaning and preparation
### Univariate analysis

In [None]:
# checking null values percentage in descending order
null_values = round(df.isnull().sum().sort_values(ascending = False)/len(df)*100,2)
null_values

- We can drop Asymmetrique Profile Index & Asymmetrique Activity Index, since Asymmetrique Profile Score & Asymmetrique Activity Score carry the same information.

In [None]:
# Dropping 'Asymmetrique Profile Index' &'Asymmetrique Activity Index'
df = df.drop(['Asymmetrique Profile Index','Asymmetrique Activity Index'],axis=1)

### Asymmetrique Profile Score

In [None]:
df['Asymmetrique Profile Score'].describe()

In [None]:
# Data visualization
sns.distplot(df['Asymmetrique Profile Score'])
plt.show()

- Multimodal distribution is observed.
- Asymmetrique Profile Score of 16 has highest count.

In [None]:
# Treating the missing values with median value as per observation from above stats
df['Asymmetrique Profile Score']= df['Asymmetrique Profile Score'].fillna(df['Asymmetrique Profile Score'].median())

### Asymmetrique Activity Score

In [None]:
df['Asymmetrique Activity Score'].describe()

In [None]:
# Data visualization
sns.distplot(df['Asymmetrique Activity Score'])
plt.show()

In [None]:
# Treating the missing values with median value as per observation from above stats
df['Asymmetrique Activity Score'] = df['Asymmetrique Activity Score'].fillna(df['Asymmetrique Activity Score'].median())

- Multimodal distribution is observed.
- Asymmetrique Activity Score of 14 has the highest count.

### Let's check Other columns having missing values before dropping them
### Tags

In [None]:
df['Tags'].value_counts()

In [None]:
df['Tags'].value_counts().index

- Let's replace the values that occur in less than 1% of rows, as well as the null values, with 'Others'.
- This may be a good predictor for modeling, since it is feedback given by the customer.

In [None]:
# Tag values listed here are pooled into a single 'Others' bucket
# (the low-frequency tags identified from the value counts above).
Others = ['invalid number',
       'Diploma holder (Not Eligible)', 'wrong number given', 'opp hangup',
       'number not provided', 'in touch with EINS', 'Lost to Others',
       'Want to take admission but has financial problems', 'Still Thinking',
       'Interested in Next batch', 'In confusion whether part time or DLP',
       'Lateral student', 'University not recognized',
       'Shall take in the next coming month',
       'Recognition issue (DEC approval)']
# Missing tags are treated as 'Others' as well. The result is assigned back to the column:
# calling fillna(..., inplace=True) on a column selection may act on a temporary copy and
# has no effect under pandas copy-on-write.
df['Tags'] = df['Tags'].replace(Others, 'Others').fillna('Others')

In [None]:
df['Tags'].value_counts(normalize=True)*100

In [None]:
# Converted vs. not-converted lead counts per tag bucket.
plt.figure(figsize=(14,6))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['Tags'], hue=df['Converted'])
plt.xticks(rotation=90)
plt.show()

### What matters most to you in choosing a course

In [None]:
df['What matters most to you in choosing a course'].value_counts()

- In general, people upskill in search of better career prospects, so this column adds little information and can be dropped.

In [None]:
df = df.drop('What matters most to you in choosing a course',axis=1)

### Lead Profile

In [None]:
df['Lead Profile'].value_counts()/len(df)

- From above data we can observe that 29% of data is missing and 45% of the data is not defined. In total unavailable data for this column will be ~74%. So, we can drop this variable

In [None]:
df = df.drop('Lead Profile',axis=1)

### What is your current occupation

In [None]:
df['What is your current occupation'].value_counts()

In [None]:
# Impute missing occupations with 'Unemployed'. Assigning the result back (rather than
# fillna(..., inplace=True) on a column selection) guarantees df itself is updated.
df['What is your current occupation'] = df['What is your current occupation'].fillna('Unemployed')

- Missing values can be imputed with Unemployed, since it most occuring value and a reasonable value

In [None]:
# Data visualization: converted vs. not-converted lead counts per occupation.
plt.figure(figsize=(12,5))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['What is your current occupation'], hue=df['Converted'])
plt.xticks(rotation=45)
plt.show()

- From the above plot we can observe and in terms of business aspect below observations are made
    - Unemployed are the one who are willing to upskill for getting better jobs
    - Working professional who are looking for professional growth
    - So, Working professionals and unemployed are good to target

### Country

In [None]:
# Checking for Percentage of missing values
df['Country'].value_counts()

In [None]:
other_countires = df['Country'].value_counts().index[1:]

In [None]:
# Collapse every country except the most frequent one into 'Other_Country'.
df['Country'] = df['Country'].replace(other_countires,'Other_Country')
# Series.mode() returns a Series; passing it to fillna aligns on the index and fills at most
# row 0. Take the scalar mode with [0] so every missing value is imputed.
df['Country'] = df['Country'].fillna(df['Country'].mode()[0])

In [None]:
df['Country'].value_counts()

- About 27% of the data is missing in the variable Country. Most of the leads are from India, and all other countries together contribute only around 2%, so we can create a new value 'Other_Country' to proceed further with modelling.
- For missing value treatment we can use the mode to impute the missing values.
- This results in data heavily skewed towards India, so it is better to drop this column.

In [None]:
df = df.drop('Country',axis=1)

### City

In [None]:
df['City'].value_counts(normalize=True)

In [None]:
# 'Select' is the form placeholder for "no answer", so mark it as missing explicitly.
# Calling replace('Select') without a value does not do that: older pandas forward-fills
# from the previous row instead, and newer versions deprecate or reject the call.
df['City'] = df['City'].replace('Select', np.nan)
# mode() returns a Series; take its first element so fillna receives a scalar and
# imputes every missing row rather than index-aligning.
df['City'] = df['City'].fillna(df['City'].mode()[0])

In [None]:
df['City'].value_counts(normalize=True)

In [None]:
df = df.drop('City', axis=1)

- About 15% of values are missing and 28% are undefined ('Select'), so in total about 43% of City values are unavailable; these are treated by imputing the most frequently occurring value.
- Thane and Outskirts also belong to Mumbai.
- City is biased towards Mumbai, i.e., about 71%, so we should drop this column.

### How did you hear about X Education

In [None]:
# Checking for various responses given by leads
df['How did you hear about X Education'].value_counts()/len(df)*100

In [None]:
df = df.drop('How did you hear about X Education',axis=1)

- From the above data we can observe that 24% of the data is missing and 54% of the data is not defined. In total, unavailable data for this column is ~78%.
- Nowadays most people hear about such things through the internet, i.e., online search.
- If we impute the undefined value 'Select' with 'Online Search', the data will skew towards online search, so we can drop this column.

### Lead Source

In [None]:
df['Lead Source'].value_counts()

In [None]:
# Based on above observations correcting the values of Lead Source
df['Lead Source'] = df['Lead Source'].replace({'google':'Google','welearnblog_Home':'WeLearn'})
df['Lead Source'].value_counts()

In [None]:
# Lead sources with very few records (< 20 each, per the counts above) are not dropped as
# rows; their value is blanked to NaN and then imputed with the most frequent source.
rare_sources = ['bing','Click2call','Press_Release','Social Media','WeLearn',
                'Live Chat','Pay per Click Ads','NC_EDM','testone','youtubechannel',
                'blog']
# An explicit np.nan is required: replace(list) without a value forward-fills from the
# previous row in older pandas, which silently assigns an arbitrary neighbouring source.
df['Lead Source'] = df['Lead Source'].replace(rare_sources, np.nan)
# mode() returns a Series; [0] gives the scalar so that all missing rows are filled.
df['Lead Source'] = df['Lead Source'].fillna(df['Lead Source'].mode()[0])

df['Lead Source'].value_counts()

In [None]:
# Data visualization: converted vs. not-converted lead counts per lead source.
plt.figure(figsize=(12,5))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['Lead Source'], hue=df['Converted'])
plt.xticks(rotation=90)
plt.show()

- Leads generated from Google, direct traffic to the webpage, Olark Chat, Reference and Organic Search can be a good target audience.
- People from Reference and Wellingak website are more likely to convert

### Lead Quality

In [None]:
# Lead quality has highest missing values of 52%.
# Since lead Quality can be a good intepreting variable let's see how it is distrubuted
df['Lead Quality'].value_counts()

In [None]:
# Missing values are about 50% and if we replace the null values by 'Not Sure' in the perspective of business 
#this could be an important variable
df['Lead Quality'] = df['Lead Quality'].fillna('Not Sure')
df['Lead Quality'].value_counts(normalize=True)*100

In [None]:
# Data visualization: converted vs. not-converted lead counts per lead-quality level.
plt.figure(figsize=(12,5))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['Lead Quality'], hue=df['Converted'])
plt.show()

- Lead quality is a hard-to-interpret variable, because 'Not Sure' leads are the largest group and are only moderately likely to convert.
- Leads marked 'Might be', 'High in Relevance' and 'Low in Relevance' are the most likely to convert and can be targeted.

### TotalVisits

In [None]:
df['TotalVisits'].describe()

In [None]:
# Data visualization: distribution (left) and box plot (right) of TotalVisits.
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
sns.distplot(df['TotalVisits'], color='Green')
plt.subplot(1,2,2)
# x= keeps the box horizontal on every seaborn version; a positional Series is treated
# as `data` from seaborn 0.12 on.
sns.boxplot(x=df['TotalVisits'])
plt.show()

In [None]:
# Outliers are present, so soft-cap the variable at its 1st and 99th percentiles.
percentiles = df['TotalVisits'].quantile([0.01,0.99]).values
# clip replaces the chained assignment df[col][mask] = value, which pandas may apply to a
# temporary copy (SettingWithCopyWarning) and which is a no-op under copy-on-write.
# Missing values pass through clip unchanged, exactly as with the masked assignment.
df['TotalVisits'] = df['TotalVisits'].clip(lower=percentiles[0], upper=percentiles[1])

In [None]:
percentiles

In [None]:
# A few missing values are treated by mean value. The result is assigned back because
# fillna(..., inplace=True) on a column selection is not guaranteed to modify df.
df['TotalVisits'] = df['TotalVisits'].fillna(df['TotalVisits'].mean())

- The data shows a bi-modal distribution, which suggests that leads come from different backgrounds, i.e., occupation, education level, gender, etc.
- Since outliers are present, we have treated them by soft capping so that the model is not unduly affected by them.
- Missing values are about 1.5%, and since it is a continuous variable we have imputed them with the mean.

#### Total Time Spent on Website

In [None]:
df['Total Time Spent on Website'].describe()

In [None]:
# Univariate Analysis
# Data visualization: distribution (left) and box plot (right) of time spent on the website.
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
sns.distplot(df['Total Time Spent on Website'], color='Green')
plt.subplot(1,2,2)
# x= keeps the box horizontal on every seaborn version; a positional Series is treated
# as `data` from seaborn 0.12 on.
sns.boxplot(x=df['Total Time Spent on Website'])
plt.show()

- Data is showing Bi-modal distrubution. Clearly indicates leads are from different background i.e., by occupation, Education level, gender etc are spending different time on website.
- No outliers are observed

#### Page Views Per Visit

In [None]:
# Summary statistics for the variable analysed in this section (the original cell
# described 'Total Time Spent on Website' again by mistake).
df['Page Views Per Visit'].describe()

In [None]:
# Data visualization: distribution (left) and box plot (right) of Page Views Per Visit.
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
sns.distplot(df['Page Views Per Visit'], color='Green')
plt.subplot(1,2,2)
# x= keeps the box horizontal on every seaborn version; a positional Series is treated
# as `data` from seaborn 0.12 on.
sns.boxplot(x=df['Page Views Per Visit'])
plt.show()

In [None]:
# Treating the missing values with the column mean. The result is assigned back because
# fillna(..., inplace=True) on a column selection is not guaranteed to modify df.
df['Page Views Per Visit'] = df['Page Views Per Visit'].fillna(df['Page Views Per Visit'].mean())

In [None]:
# Outliers are present, so soft-cap the variable at its 1st and 99th percentiles.
percentiles = df['Page Views Per Visit'].quantile([0.01,0.99]).values
# clip replaces the chained assignment df[col][mask] = value, which pandas may apply to a
# temporary copy (SettingWithCopyWarning) and which is a no-op under copy-on-write.
df['Page Views Per Visit'] = df['Page Views Per Visit'].clip(lower=percentiles[0], upper=percentiles[1])

- Data is showing Bi-modal distrubution. Clearly indicates leads are from different background i.e., by occupation, Education level, gender etc.
- Missing values percentage is 1.5% and are imputed with mean
- Since outliers are present, in order to reduce the computation and model get unaffected, we have treated by soft capping

### Last Activity

In [None]:
df['Last Activity'].value_counts() # Check for values in the data for cleansing

In [None]:
df['Last Activity'].value_counts().index

In [None]:
# Grouping the rarely occurring activities into a single 'Others' category to keep the model simpler
# (they are NOT replaced by the most frequent value; the list below maps to the literal 'Others').
Others = ['Email Bounced',
       'Email Link Clicked', 'Form Submitted on Website', 'Unreachable',
       'Unsubscribed', 'Had a Phone Conversation', 'Approached upfront',
       'View in browser link Clicked', 'Email Received', 'Email Marked Spam',
       'Visited Booth in Tradeshow', 'Resubscribed to emails']
df['Last Activity']=df['Last Activity'].replace(Others,'Others')
df['Last Activity'].value_counts()

In [None]:
# Data visualization: converted vs. not-converted lead counts per last activity.
plt.figure(figsize=(12,5))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['Last Activity'], hue=df['Converted'])
plt.xticks(rotation=90)
plt.show()

- Leads with status of last activity as SMS sent can be most targettable
- Leads with Email opened are moderatly likely to target for conversion.

### Specialization

In [None]:
# Share of all rows (in %) per specialization. The original divided already-normalised
# fractions by len(df) again, producing meaningless tiny numbers. Dividing raw counts by
# len(df) keeps missing rows in the denominator, as done for the other columns.
df['Specialization'].value_counts()/len(df)*100

In [None]:
df['Specialization'].value_counts().index

In [None]:
# Customers less than 4 % can be considered as other category, which make ease for further model analysis
others = ['Travel and Tourism', 'Media and Advertising', 'International Business',
       'Healthcare Management', 'Hospitality Management', 'E-COMMERCE',
       'Retail Management', 'Rural and Agribusiness', 'E-Business',
       'Services Excellence','Select']

In [None]:
# Pool the low-frequency specializations (and the 'Select' placeholder) into 'Others', and
# treat missing values the same way. Assigned back instead of fillna(..., inplace=True) on
# a column selection, which is not guaranteed to modify df.
df['Specialization'] = df['Specialization'].replace(others,'Others').fillna('Others')

In [None]:
df['Specialization'].value_counts(normalize=True)*100

In [None]:
# Data visualization: converted vs. not-converted lead counts per specialization.
plt.figure(figsize=(12,5))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['Specialization'], hue=df['Converted'])
plt.xticks(rotation=90)
plt.show()

- From above data we can observe that 16% of data is missing and 24% of the data is not defined. In total unavailable data for this column will be ~40%. 
- So we can consider this could be an important variable, we shall treat them by assigning as Others

### Let's check for unique values in the variables

In [None]:
df.nunique()

In [None]:
# Dropping columns flagged by the nunique() check above as carrying no information for modelling.
# NOTE(review): presumably 'Lead Number' is a per-row identifier and the remaining columns hold a
# single distinct value - confirm against the nunique() output.
df = df.drop(['Lead Number','Magazine','Receive More Updates About Our Courses','Update me on Supply Chain Content',
        'Get updates on DM Content','I agree to pay the amount through cheque'],axis=1)

In [None]:
df.shape

### Lets check the value counts of Yes/No variable from above data to decide the vaiable for modeling

### Do Not Call

In [None]:
df['Do Not Call'].value_counts(normalize=True)*100

In [None]:
df.drop('Do Not Call',axis=1,inplace=True)

- All the leads are likely to receive calls about the course, very less number who ignore.
- And we can drop the column

### Do Not Email

In [None]:
df['Do Not Email'].value_counts(normalize=True)*100

- All the leads are likely to receive mails about the course, there are about 8% people who ignore.

### Converted

In [None]:
df['Converted'].value_counts(normalize=True)*100

- About 39% of the leads are converted into business

### Search

In [None]:
df['Search'].value_counts(normalize=True)*100

In [None]:
# dropping the Search variable, since it is skewed
df = df.drop('Search',axis=1)

### Newspaper Article

In [None]:
df['Newspaper Article'].value_counts(normalize=True) # Check for values in the data for cleansing

In [None]:
# dropping the newspaper article variable, since it is skewed
df = df.drop('Newspaper Article',axis=1)

### Newspaper

In [None]:
df['Newspaper'].value_counts()

In [None]:
# dropping the newspaper variable, since it is skewed
df = df.drop('Newspaper',axis=1)

### X Education Forums

In [None]:
df['X Education Forums'].value_counts()

In [None]:
# dropping the X Education Forums variable, since it is skewed
df = df.drop('X Education Forums',axis=1)

### Digital Advertisement

In [None]:
df['Digital Advertisement'].value_counts()

In [None]:
# dropping the Digital Advertisement variable, since it is skewed
df = df.drop('Digital Advertisement',axis=1)

### Through Recommendations

In [None]:
df['Through Recommendations'].value_counts()

In [None]:
# dropping the Through Recommendations variable, since it is skewed
df = df.drop('Through Recommendations',axis=1)

### A free copy of Mastering The Interview

In [None]:
df['A free copy of Mastering The Interview'].value_counts()

In [None]:
# Data visualization: converted vs. not-converted counts by free-copy opt-in.
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['A free copy of Mastering The Interview'], hue=df['Converted'])
plt.show()

- As per business aspect, most of the people who are looking for upskilling will be either internal promotion or upgrading for current trends.
- A few category of people will be looking for mastering the interview. 
- That is well understandable from above plot

### Checking Other variables

In [None]:
df['Last Notable Activity'].value_counts()

In [None]:
# Let's categorize the low value counts less than 1% i.e, 90 to others to make modeling easier
df['Last Notable Activity'].value_counts().index

In [None]:
Others = ['Email Bounced',
       'Unsubscribed', 'Unreachable', 'Had a Phone Conversation',
       'Email Marked Spam', 'Form Submitted on Website', 'Email Received',
       'Resubscribed to emails', 'Approached upfront',
       'View in browser link Clicked']
df['Last Notable Activity'] = df['Last Notable Activity'].replace(Others,'Others')
df['Last Notable Activity'].value_counts()

In [None]:
# Data Visualization: converted vs. not-converted lead counts per last notable activity.
plt.figure(figsize=(12,5))
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=df['Last Notable Activity'], hue=df['Converted'])
plt.xticks(rotation=90)
plt.show()

- Leads with status of last activity as SMS sent can be most targettable
- Leads with Email opened are moderatly likely to target for conversion.

## Bivariate Analysis

### Total Visits V/s Time spent on website

In [None]:
# Total visits against time spent on the website, coloured by conversion outcome.
plt.figure(figsize=(8,6))
# x/y are passed by keyword: seaborn >= 0.12 raises TypeError on positional data vectors.
sns.scatterplot(x=df['TotalVisits'], y=df['Total Time Spent on Website'], hue=df['Converted'])
plt.legend(loc='best')
plt.show()

- Leads with spending more time on website with 3 or more visits are more likely to get converted

### Page Views Per Visit v/s Total Time Spent on Website

In [None]:
# Page views per visit against time spent on the website, coloured by conversion outcome.
plt.figure(figsize=(8,6))
# x/y are passed by keyword: seaborn >= 0.12 raises TypeError on positional data vectors.
sns.scatterplot(x=df['Page Views Per Visit'], y=df['Total Time Spent on Website'], hue=df['Converted'])
plt.legend(loc='best')
plt.show()

- Leads with spending more time are more likely to get converted

### Page Views Per Visit v/s TotalVisits

In [None]:
# Page views per visit against total visits, coloured by conversion outcome.
plt.figure(figsize=(8,6))
# x/y are passed by keyword: seaborn >= 0.12 raises TypeError on positional data vectors.
sns.scatterplot(x=df['Page Views Per Visit'], y=df['TotalVisits'], hue=df['Converted'])
plt.legend(loc='best')
plt.show()

- Total visits about 3 are more likely to convert

### Creating dummy variables for the remaining categorical variables and dropping originals

In [None]:
# One-hot encode the multi-level categorical columns in a single pass.
# Keys are the source columns, values the prefix used for the generated dummy columns
# (e.g. 'Lead Source' -> 'LS_Google'). The first level of each column is dropped to
# avoid the dummy-variable trap, and the original column is removed from the result.
DUMMY_PREFIXES = {
    'Lead Origin': 'LO',
    'Lead Source': 'LS',
    'Last Activity': 'LA',
    'Tags': 'T',
    'Lead Quality': 'LQ',
    'Specialization': 'Sp',
    'What is your current occupation': 'Occupation',
    'Last Notable Activity': 'LNA',
}

# get_dummies returns a new frame, so df stays untouched. Untouched columns come first,
# followed by the dummy groups in the order listed above.
# dtype is pinned to uint8 (the default before pandas 2.0) so the encoded columns stay
# numeric for the correlation, scaling and statsmodels steps on every pandas version.
df1 = pd.get_dummies(df, columns=list(DUMMY_PREFIXES), prefix=DUMMY_PREFIXES,
                     drop_first=True, dtype=np.uint8)

df1.head()

In [None]:
df.columns

In [None]:
# One hot encoding for 'Do Not Email' & 'A free copy of Mastering The Interview'
binary_cols = ['Do Not Email', 'A free copy of Mastering The Interview']
# dtype pinned to uint8 (the pre-pandas-2.0 default) so the flags stay numeric for modelling.
dummy = pd.get_dummies(df1[binary_cols], drop_first=True, dtype=np.uint8)
# Append the encoded flags and remove the original Yes/No columns without mutating in place.
df1 = pd.concat([df1,dummy], axis=1).drop(binary_cols, axis=1)
df1.head()

In [None]:
# Pairwise correlations between the numeric/encoded columns. Object columns (such as the
# 'Prospect ID' key, presumably a string) are excluded explicitly, because DataFrame.corr()
# no longer drops them silently in pandas >= 2.0.
df1_corr = df1.select_dtypes(exclude='object').corr()
# Keep only the strict upper triangle so every pair appears once and the diagonal is removed.
# The builtin bool replaces np.bool, an alias that was removed in NumPy 1.24.
corr_final = df1_corr.where(np.triu(np.ones(df1_corr.shape),k=1).astype(bool))
corr_final

In [None]:
corr_final = corr_final.unstack()
corr_final = corr_final.sort_values(ascending=False).drop_duplicates()
corr_final.head(10)

In [None]:
# Defining X & y
X = df1.drop(['Prospect ID','Converted'],axis=1)
y = df1['Converted']

## Splitting of train-test data

In [None]:
X_train, X_test,y_train , y_test = train_test_split(X, y, train_size=0.7, test_size = 0.3, random_state = 0)

## Feature Scaling

In [None]:
scaler = StandardScaler()

# Continuous features to standardise; the dummy columns are left as 0/1.
num_cols = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit',
            'Asymmetrique Activity Score','Asymmetrique Profile Score']

# Work on an explicit copy: X_train comes out of train_test_split as a slice of X, and
# assigning into it directly triggers SettingWithCopyWarning.
X_train = X_train.copy()
# Fit the scaler on the training split only, so no test-set statistics leak into it.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

In [None]:
# Logistic regression

lr = LogisticRegression()
# Recursive feature elimination down to 15 features. n_features_to_select is passed by
# keyword because scikit-learn >= 1.0 rejects it as a positional argument.
rfe = RFE(lr, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
X_train.columns[~rfe.support_]

In [None]:
df1_corr = df1[['Converted','TotalVisits', 'Page Views Per Visit', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score', 'LO_Landing Page Submission',
       'LO_Lead Import', 'LO_Quick Add Form', 'LS_Google', 'LS_Organic Search',
       'LS_Reference', 'LS_Referral Sites', 'LA_Email Opened', 'LA_Others',
       'LA_Page Visited on Website', 'T_Graduation in progress',
       'T_Interested  in full time MBA', 'T_Others', 'LQ_Low in Relevance',
       'LQ_Might be', 'LQ_Not Sure', 'Sp_Business Administration',
       'Sp_Finance Management', 'Sp_Human Resource Management',
       'Sp_IT Projects Management', 'Sp_Marketing Management',
       'Sp_Operations Management', 'Sp_Others', 'Sp_Supply Chain Management',
       'Occupation_Housewife', 'Occupation_Other', 'Occupation_Student',
       'Occupation_Unemployed', 'Occupation_Working Professional',
       'LNA_Modified', 'LNA_Olark Chat Conversation',
       'LNA_Page Visited on Website',
       'A free copy of Mastering The Interview_Yes']].corr()
df1_corr

In [None]:
# Model building
X_train_sm = sm.add_constant(X_train[col])
log1 = sm.GLM(y_train,X_train_sm, sm.families.Binomial())
res = log1.fit()
res.summary()

- Model accuracy is good but there are some features that are having p-value > 0.05. So, proceeding with VIF score and feature elimination.

### VIF

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping high p-value variable LS_Olark Chat
# col is a pandas Index, whose drop() signature is (labels, errors); the stray positional 1
# was being bound to `errors`, not to an axis, so it is removed.
col = col.drop('LS_Olark Chat')

In [None]:
X_train_sm = sm.add_constant(X_train[col])
log1 = sm.GLM(y_train,X_train_sm, sm.families.Binomial())
res = log1.fit()
res.summary()

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping high p-value variable LO_Lead Add Form
# col is a pandas Index, whose drop() signature is (labels, errors); the stray positional 1
# was being bound to `errors`, not to an axis, so it is removed.
col = col.drop('LO_Lead Add Form')

In [None]:
X_train_sm = sm.add_constant(X_train[col])
log1 = sm.GLM(y_train,X_train_sm, sm.families.Binomial())
res = log1.fit()
res.summary()

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

- All p-values are less than or equal to 0.05, so we can finalize the model and compute the conversion probabilities.

In [None]:
# Prediction of y value
y_train_pred = res.predict(X_train_sm)

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conv_Prob':y_train_pred})
y_train_pred_final.head()

### Plotting the ROC Curve

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for a binary classifier and show it.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure is rendered with plt.show(); nothing is returned.
    """
    # Thresholds are not needed for the plot, so they are discarded.
    fpr, tpr, _ = metrics.roc_curve( actual, probs,
                                     drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )

    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    # Diagonal reference line: the ROC of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

An ROC curve demonstrates several things:

- It shows the tradeoff between sensitivity and specificity (any increase in sensitivity will be accompanied by a decrease in specificity).
- The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.
- The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the test.

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Conv_Prob, 
                                             drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)

### Finding Optimal Cutoff Point

Optimal cutoff probability is that prob where we get balanced sensitivity and specificity

In [None]:
# Let's create columns with different probability cutoffs 
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final.Conv_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

In [None]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
# Confusion-matrix layout (rows = actual, columns = predicted):
#   cm[0,0] = true negatives,  cm[0,1] = false positives
#   cm[1,0] = false negatives, cm[1,1] = true positives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
rows = []
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
    accuracy = (cm1[0,0]+cm1[1,1])/cm1.sum()

    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    rows.append([i, accuracy, sensi, speci])

# Build the frame once from the collected rows (indexed by cutoff, as before) instead of
# growing it row by row with .loc, which is slow and leaves object-typed columns.
cutoff_df = pd.DataFrame(rows, columns=['prob','accuracy','sensi','speci'], index=num)
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.grid()
plt.show()

- From the curve above, 0.20 is the optimum point to take it as a cutoff probability.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.20 else 0)

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted))

In [None]:
# Let's take a look at the confusion matrix again 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Calculate false positive rate - predicting conversion when the lead has not actually converted
FP/ float(TN+FP)

In [None]:
# positive predictive value 
TP / float(TP+FP)

In [None]:
# Negative predictive value
TN / float(TN+ FN)

- We have achieved very good scores for specificity (0.91), sensitivity (0.92), false positive rate (0.09), positive predictive value (0.87) and negative predictive value (0.94), which indicates that the model fits well.
- Let's check the same on test data

### Precision and Recall & F1 score

#### Precision

In [None]:
precision_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

#### Recall

In [None]:
recall_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
f1_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

- Precision is 0.92, recall is 0.87 and F1 score is 0.89, which are good values for the model.
- The F1 score is the harmonic mean of precision and recall.

### Precision and recall tradeoff

In [None]:
from sklearn.metrics import precision_recall_curve
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)

In [None]:
# Precision (green) and recall (red) as a function of the probability cutoff.
# The last element of p and r has no matching threshold, hence the [:-1].
plt.plot(thresholds, p[:-1], "g-", label="Precision")
plt.plot(thresholds, r[:-1], "r-", label="Recall")
plt.xlabel("Probability cutoff")
plt.legend(loc="best")
plt.show()

### Making predictions on the test set

In [None]:
# Apply the scaler fitted on the training split to the same continuous features of the
# test split (transform only - never refit on test data).
test_num_cols = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit',
                 'Asymmetrique Activity Score','Asymmetrique Profile Score']
# Explicit copy: X_test is a slice produced by train_test_split, and assigning into it
# directly triggers SettingWithCopyWarning.
X_test = X_test.copy()
X_test[test_num_cols] = scaler.transform(X_test[test_num_cols])

In [None]:
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test[col])

In [None]:
y_test_pred = res.predict(X_test_sm) # Predicting the y

In [None]:
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
y_pred_1

In [None]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final.columns = ['Converted', 'Conv_prob']
y_pred_final.head()

In [None]:
y_pred_final['final_predicted'] = y_pred_final.Conv_prob.map(lambda x: 1 if x > 0.2 else 0)

In [None]:
y_pred_final['Lead_score'] = y_pred_final.Conv_prob*100

In [None]:
y_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.Converted, y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Converted, y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
f1_score(y_pred_final.Converted, y_pred_final.final_predicted)

- The values of sensitivity, specificity and F1 score on the test set are close to those on the train set, so we can say the model generalizes well.

In [None]:
df1.head()

In [None]:
# The model was fitted on standardised continuous features, but df1 still holds their raw
# values (only X_train / X_test were scaled). Scale a copy with the already-fitted scaler
# before scoring, otherwise the predicted probabilities are computed on the wrong scale.
score_num_cols = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit',
                  'Asymmetrique Activity Score','Asymmetrique Profile Score']
df1_scaled = df1.copy()
df1_scaled[score_num_cols] = scaler.transform(df1_scaled[score_num_cols])

df_sm = sm.add_constant(df1_scaled[col])
# Lead score = predicted conversion probability on a 0-100 scale, rounded to a whole number.
df1['Lead Score'] = (res.predict(df_sm)*100).round()

In [None]:
df1['Lead Score'].describe()

In [None]:
# Fraction of all leads whose score is above the cutoff of 20
# (the mean of a boolean mask equals matching rows / total rows).
(df1['Lead Score'] > 20).mean()

In [None]:
df['Converted'].value_counts(normalize=True)

In [None]:
# Flag a lead as predicted-to-convert (1) when its score exceeds 20, otherwise 0.
df1['final_predicted'] = (df1['Lead Score'] > 20).astype('int64')

In [None]:
df1[['Prospect ID','Lead Score','Converted','final_predicted']].head()