In [None]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

In [None]:
#Read the data

Lead_data = pd.read_csv("../input/leadscore/Leads.csv")
Lead_data.head()

In [None]:
Lead_data.shape

In [None]:
Lead_data['Specialization'].value_counts()

In [None]:
Lead_data.describe()

In [None]:
Lead_data.info()

## Data Cleaning

In [None]:
#Replacing the 'Select' values with null
Lead_data = Lead_data.replace('Select', np.nan)

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
# Columns removed before modelling.
# NOTE(review): presumably chosen from the null-percentage table above (high
# missingness or near-constant values) — confirm the selection criteria.
# 'Receive More Updates About Our Courses' was listed twice in the original call;
# it appears once here.
columns_to_drop = [
    'How did you hear about X Education',
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Courses',
    'Lead Quality',
    'Tags',
    'Lead Profile',
    'Asymmetrique Activity Index',
    'Asymmetrique Profile Index',
    'Asymmetrique Activity Score',
    'Asymmetrique Profile Score',
    'I agree to pay the amount through cheque',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
]
# Rebind instead of inplace=True so the cell yields a new frame from its input.
Lead_data = Lead_data.drop(columns=columns_to_drop)
Lead_data.head()

In [None]:
Lead_data.shape

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
# Remove every row that is missing a value in any of these four columns.
# One dropna(subset=...) call replaces four separate boolean-mask filters and
# returns a fresh frame.
required_columns = ['Lead Source', 'TotalVisits', 'Page Views Per Visit', 'Last Activity']
Lead_data = Lead_data.dropna(subset=required_columns)

## Random Sample Imputation for 'Specialization'

In [None]:
def impute_nan(df, variable, random_state=0):
    """Fill missing values of ``df[variable]`` in place with random observed values.

    Each missing entry is replaced by a value drawn from the non-null entries
    of the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    variable : str
        Name of the column to impute.
    random_state : int, default 0
        Seed passed to ``Series.sample`` so the imputation is reproducible.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the column has missing values but no observed value to draw from.
    """
    missing_mask = df[variable].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to impute.
        return

    observed = df[variable].dropna()
    if observed.empty:
        raise ValueError(
            "Cannot impute '{}': the column has no observed values".format(variable)
        )

    # Sample without replacement when enough observed values exist (the original
    # behaviour); fall back to sampling with replacement only when the missing
    # values outnumber the observed ones, which previously raised.
    random_sample = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Assign positionally so the result does not depend on index alignment
    # (and therefore also works when df has duplicate index labels).
    df.loc[missing_mask, variable] = random_sample.to_numpy()

In [None]:
impute_nan(Lead_data, 'Specialization')

In [None]:
Lead_data['Specialization'].value_counts()

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
Lead_data['Country'].mode()[0]

In [None]:
# Fill missing 'Country' values with the most frequent value.
# The result is assigned back: fillna(inplace=True) on a column selection acts on
# an intermediate object and does not update Lead_data under pandas Copy-on-Write.
Lead_data['Country'] = Lead_data['Country'].fillna(Lead_data['Country'].mode()[0])

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
Lead_data['What is your current occupation'].mode()[0]

In [None]:
# Fill missing occupation values with the most frequent value.
# The result is assigned back: fillna(inplace=True) on a column selection acts on
# an intermediate object and does not update Lead_data under pandas Copy-on-Write.
Lead_data['What is your current occupation'] = Lead_data['What is your current occupation'].fillna(
    Lead_data['What is your current occupation'].mode()[0]
)

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
Lead_data['What matters most to you in choosing a course'].mode()[0]

In [None]:
# Fill missing 'What matters most to you in choosing a course' values with the
# most frequent value. The result is assigned back: fillna(inplace=True) on a
# column selection acts on an intermediate object and does not update Lead_data
# under pandas Copy-on-Write.
Lead_data['What matters most to you in choosing a course'] = Lead_data[
    'What matters most to you in choosing a course'
].fillna(Lead_data['What matters most to you in choosing a course'].mode()[0])

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
Lead_data['City'].mode()[0]

In [None]:
# Fill missing 'City' values with the most frequent value.
# The result is assigned back: fillna(inplace=True) on a column selection acts on
# an intermediate object and does not update Lead_data under pandas Copy-on-Write.
Lead_data['City'] = Lead_data['City'].fillna(Lead_data['City'].mode()[0])

In [None]:
round(100*(Lead_data.isnull().sum()/len(Lead_data)),2)

In [None]:
Lead_data.info()

In [None]:
Lead_data['Do Not Email'].value_counts()

In [None]:
Lead_data['Do Not Call'].value_counts()

In [None]:
Lead_data['City'].value_counts()

In [None]:
Lead_data.drop(['Prospect ID','Do Not Email','Do Not Call','What matters most to you in choosing a course','Country'],axis=1, inplace = True)
Lead_data.head()

In [None]:
Lead_data['Lead Origin'].value_counts()

## Univariate Analysis

In [None]:
# Side-by-side vertical boxplots of the three numeric columns.
# Each Series is passed as `y=` (matching the single-column boxplots later in the
# notebook) so the orientation does not depend on how a positional argument is
# interpreted by the installed seaborn version.
numeric_columns = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

fig, subp = plt.subplots(1, 3, figsize=(18, 5))
for ax, column in zip(subp, numeric_columns):
    sns.boxplot(y=Lead_data[column], ax=ax)
    ax.set_title(column)

plt.show()

In [None]:
Lead_data['TotalVisits'].describe(percentiles=[0.05,.25, .5, .75, .90, .95, .99])

In [None]:
# Keep only the rows whose TotalVisits is at or below its 99th percentile,
# then re-plot the column to inspect the trimmed distribution.
Q99 = Lead_data['TotalVisits'].quantile(0.99)
Lead_data = Lead_data.loc[Lead_data['TotalVisits'].le(Q99)]
sns.boxplot(y=Lead_data['TotalVisits'])
plt.show()

In [None]:
Lead_data['Total Time Spent on Website'].describe(percentiles=[0.05,.25, .5, .75, .90, .95, .99])

In [None]:
Lead_data['Page Views Per Visit'].describe(percentiles=[0.05,.25, .5, .75, .90, .95, .99])

In [None]:
# Keep only the rows whose 'Page Views Per Visit' is at or below its 99th
# percentile, then re-plot the column to inspect the trimmed distribution.
page_views = Lead_data['Page Views Per Visit']
Q99 = page_views.quantile(0.99)
Lead_data = Lead_data.loc[page_views.le(Q99)]
sns.boxplot(y=Lead_data['Page Views Per Visit'])
plt.show()

In [None]:
# Same three boxplots again, now on the rows that survived the 99th-percentile
# filters applied to TotalVisits and 'Page Views Per Visit'.
# Each Series is passed as `y=` so the orientation does not depend on how a
# positional argument is interpreted by the installed seaborn version.
numeric_columns = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

fig, subp = plt.subplots(1, 3, figsize=(18, 5))
for ax, column in zip(subp, numeric_columns):
    sns.boxplot(y=Lead_data[column], ax=ax)
    ax.set_title(column)

plt.show()

## Bivariate Analysis

In [None]:
# One count plot per categorical column, split by the 'Converted' flag.
# x and hue are passed by keyword with data=Lead_data: seaborn >= 0.12 only
# accepts `data` positionally, so the positional-Series form no longer plots
# the column on the x axis.
categorical_columns = [
    'City',
    'Lead Origin',
    'Lead Source',
    'Last Activity',
    'What is your current occupation',
    'A free copy of Mastering The Interview',
    'Last Notable Activity',
    'Specialization',
]

for column in categorical_columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=Lead_data, x=column, hue='Converted', ax=ax)
    ax.set_title('Converted by {}'.format(column))
    # Rotate the category labels so long names do not overlap.
    ax.tick_params(axis='x', rotation=90)
    plt.show()

## Dummy Variable

In [None]:
Lead_data.head()

In [None]:
# List of variables to map

varlist =  ['A free copy of Mastering The Interview']

# Defining the map function
def binary_map(x):
    """Return ``x`` with 'Yes' mapped to 1 and 'No' mapped to 0.

    ``x`` is a pandas Series; values other than 'Yes'/'No' come back as NaN,
    as with any ``Series.map`` lookup that finds no match.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    encoded = x.map(yes_no_codes)
    return encoded

# Apply binary_map to every column named in varlist, replacing the Yes/No text with 1/0
Lead_data[varlist] = Lead_data[varlist].apply(binary_map)

In [None]:
Lead_data.head()

In [None]:
Lead_data['Lead Origin'].value_counts()

In [None]:
Lead_data['Lead Source'].value_counts()

In [None]:
Lead_data['Last Activity'].value_counts()

In [None]:
Lead_data['What is your current occupation'].value_counts()

In [None]:
Lead_data['City'].value_counts()

In [None]:
Lead_data['Last Notable Activity'].value_counts()

In [None]:
Lead_data['Specialization'].value_counts()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype is pinned to uint8 (the pre-2.0 pandas default): pandas >= 2.0 returns
# bool columns by default, and a frame mixing bool and float columns converts to
# an object array, which the statsmodels fits later in the notebook reject.
categorical_columns = [
    'Lead Origin',
    'Lead Source',
    'Last Activity',
    'What is your current occupation',
    'City',
    'Last Notable Activity',
    'Specialization',
]
Dummy_variables = pd.get_dummies(Lead_data[categorical_columns], drop_first=True, dtype=np.uint8)

# Append the dummy columns to the master dataframe
Lead_data = pd.concat([Lead_data, Dummy_variables], axis=1)
Lead_data.head()

In [None]:
# We have created dummies for the below variables, so we can drop them
Lead_data = Lead_data.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'What is your current occupation','City','Last Notable Activity','Specialization'], axis=1)
Lead_data.shape

In [None]:
Dummy_variables.head()

## Splitting the data into X and y

In [None]:
X = Lead_data.drop(['Lead Number','Converted'], axis = 1)
X.head()

In [None]:
y = Lead_data['Converted']
y.head()

## Splitting the data into Train and Test data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=10)

In [None]:
# Standardise the three numeric columns; the scaler is fitted on the training
# split here, and the fitted `scaler` stays available for transforming other data.
scaler = StandardScaler()
numeric_columns = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']

# Work on an explicit copy so the column assignment below writes to a frame this
# notebook owns rather than to an object derived from the split.
X_train = X_train.copy()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])

X_train.head()

In [None]:
X_train.describe()

In [None]:
## Conversion Rate
# Percentage of rows with Converted set (assumes Converted is a 0/1 flag — TODO confirm).
# Series.mean() is sum/len computed in vectorised form, replacing the Python-level
# sum() over the Series.
Conversion_rate = Lead_data['Converted'].mean() * 100
Conversion_rate

## Model Building

In [None]:
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
logreg = LogisticRegression()

In [None]:
# Recursive feature elimination down to 20 features using the logistic model.
# RFE is already imported in the notebook's first cell, so it is not re-imported here.
# n_features_to_select is passed by keyword: current scikit-learn releases accept
# only the estimator positionally and raise TypeError for RFE(logreg, 20).
rfe = RFE(estimator=logreg, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]

In [None]:
X_train.columns[~rfe.support_]

##### Assessing the model with StatsModels

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# `col` is a pandas Index, and Index.drop takes (labels, errors) — there is no axis.
# The stray `1` was being passed as `errors`; it is removed, which keeps the
# default behaviour of raising if the label is absent.
New_col = col.drop('What is your current occupation_Housewife')

In [None]:
X_train_sm = sm.add_constant(X_train[New_col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# New_col is a pandas Index, and Index.drop takes (labels, errors) — there is no axis.
# The stray `1` was being passed as `errors`; it is removed, which keeps the
# default behaviour of raising if the label is absent.
New_col2 = New_col.drop('What is your current occupation_Working Professional')

In [None]:
X_train_sm = sm.add_constant(X_train[New_col2])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# New_col2 is a pandas Index, and Index.drop takes (labels, errors) — there is no axis.
# The stray `1` was being passed as `errors`; it is removed, which keeps the
# default behaviour of raising if the label is absent.
New_col3 = New_col2.drop('Lead Source_Reference')

In [None]:
X_train_sm = sm.add_constant(X_train[New_col3])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# New_col3 is a pandas Index, and Index.drop takes (labels, errors) — there is no axis.
# The stray `1` was being passed as `errors`; it is removed, which keeps the
# default behaviour of raising if the label is absent.
New_col4 = New_col3.drop('Last Notable Activity_Email Link Clicked')

In [None]:
X_train_sm = sm.add_constant(X_train[New_col4])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

##### Creating a dataframe with the actual conversion flag and the predicted probabilities

In [None]:
# Pair each training row's actual outcome with its predicted probability.
y_train_pred_final = pd.DataFrame({'Converted': y_train.values, 'Conversion_Prob': y_train_pred})
# y_train.index holds the dataframe's row labels, not lead identifiers, so the real
# 'Lead Number' values are looked up in Lead_data by those labels (same row order
# as y_train).
y_train_pred_final['Lead Number'] = Lead_data.loc[y_train.index, 'Lead Number'].values
y_train_pred_final.head()

##### Creating new column 'predicted' with 1 if Conversion_Prob > 0.5 else 0

In [None]:
# Label a row 1 when its predicted probability is strictly above 0.5, else 0.
above_cutoff = y_train_pred_final['Conversion_Prob'].gt(0.5)
y_train_pred_final['predicted'] = above_cutoff.astype('int64')

y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

#### Checking VIFs

In [None]:
# Table of the selected features and their variance inflation factors,
# sorted from highest to lowest VIF.
# The feature matrix is extracted once instead of being rebuilt from
# X_train[New_col4] on every iteration of the comprehension.
vif_features = X_train[New_col4]
vif_matrix = vif_features.values

vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
New_col4 = New_col4.drop('Last Notable Activity_Email Opened')

In [None]:
# Recompute the VIF table for the current New_col4 feature set,
# sorted from highest to lowest VIF.
# The feature matrix is extracted once instead of being rebuilt from
# X_train[New_col4] on every iteration of the comprehension.
vif_features = X_train[New_col4]
vif_matrix = vif_features.values

vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
X_train_sm = sm.add_constant(X_train[New_col4])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Re-score the training rows with the model that was just refitted (`res` on
# `X_train_sm`). Without this, Conversion_Prob still holds the previous model's
# probabilities and the confusion matrix / accuracy below would simply repeat the
# earlier numbers.
y_train_pred_final['Conversion_Prob'] = res.predict(X_train_sm).values

# Label a row 1 when its predicted probability is strictly above 0.5, else 0.
y_train_pred_final['predicted'] = (y_train_pred_final['Conversion_Prob'] > 0.5).astype('int64')

y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))