In [None]:
# basic libraries to work on the dataframe
import pandas as pd
import numpy as np
# data Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# libraries
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Suppressing Warnings
# NOTE(review): a blanket 'ignore' also hides deprecation and chained-assignment
# warnings raised by pandas/seaborn; consider narrowing it while developing.
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random state so that code drawing from np.random
# (e.g. np.random.choice) is repeatable after Restart Kernel -> Run All
SEED = 42
np.random.seed(SEED)

#Increasing the columns views limit
pd.options.display.max_columns = None   # never truncate columns when displaying a frame
pd.options.display.max_rows = 150
pd.options.display.float_format = '{:.2f}'.format   # show floats with two decimals

In [None]:
# Read the lead-scoring CSV (relative path) into the `lead` DataFrame,
# then preview the first rows to sanity-check the load
lead=pd.read_csv('../input/lead-scoring-dataset/Lead Scoring.csv')
lead.head()

In [None]:
# Check the shape of the dataset: (number of rows, number of columns)
lead.shape

In [None]:
# Summary statistics of the numerical columns, one row per column
lead.describe().T

In [None]:
# Count fully duplicated rows (rows identical to an earlier row in every column)
lead.duplicated().sum()

In [None]:
# Let's look at all the columns and their datatypes; the non-null counts
# also give a first idea of how many null values each column has
lead.info()


## Observations

A large number of columns have null values. Those columns should ideally be dropped
Prospect ID and Lead Number both serve the same purpose. They are both unique identifiers. We will drop Prospect ID
Column names are just too long. We will modify the column names
A few categorical columns have "Select" among their entries. Those "Select" entries are essentially null values, because "Select" appears when someone does not choose anything from the dropdown

# Data Cleaning
Rename column names
Long column names make analysis tiring as one has to always refer to column names. Also has impact on charts created later on
Ideally, we should follow Python's preferred snake_case naming convention

In [None]:
# Convert every column name to snake_case: lower-case, spaces -> underscores
lead.columns = lead.columns.str.lower().str.replace(' ', '_')

# Show the resulting column names to confirm the change
lead.columns

In [None]:
# Map the long survey-style column names to short, analysis-friendly ones
short_names = {
    'totalvisits': 'total_visits',
    'total_time_spent_on_website': 'time_on_website',
    'how_did_you_hear_about_x_education': 'source',
    'what_is_your_current_occupation': 'occupation',
    'what_matters_most_to_you_in_choosing_a_course': 'course_selection_reason',
    'receive_more_updates_about_our_courses': 'courses_updates',
    'update_me_on_supply_chain_content': 'supply_chain_content_updates',
    'get_updates_on_dm_content': 'dm_content_updates',
    'i_agree_to_pay_the_amount_through_cheque': 'cheque_payment',
    'a_free_copy_of_mastering_the_interview': 'mastering_interview',
}
lead = lead.rename(columns = short_names)

# Preview one row to confirm the new names
lead.head(1)

## Drop prospect_id column

In [None]:
# prospect_id duplicates the role of lead_number as a unique identifier
lead = lead.drop(columns = ['prospect_id'])


## Replace "Select" category with null values

In [None]:
# Restrict the search to the text (object-dtype) columns
lead_obj = lead.select_dtypes(include='object')

# Flag every column in which at least one entry contains the text "Select"
# (missing entries count as "no match")
s = lambda x: x.str.contains('Select', na=False)
has_select = lead_obj.apply(s).any()
l = list(lead_obj.columns[has_select])
print (l)

There are 4 columns that contain "Select" entries, which are effectively null values. We are going to replace them with nulls

In [None]:
# Columns that have a "Select" entry (the placeholder left by an untouched dropdown)
sel_cols = ['specialization', 'source', 'lead_profile', 'city']

# Replace the placeholder with a real missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
lead[sel_cols] = lead[sel_cols].replace('Select', np.nan)

## Handle null values and sales generated columns
* Given there are a number of columns with very high number of null entries, let's calculate the percentage of null values in each column, and take a decision from there.
* Furthermore, we can also drop the sales-generated columns, because those entries are made only after the sales team has connected with the student. That data has no bearing on the purpose of our model, i.e. providing a lead score. The columns are:
    * tags
    * lead_quality
    * all asymmetrique columns
    * last_activity
    * last_notable_activity

In [None]:
# Percentage of missing values per column: null count / row count * 100
lead.isnull().sum().div(lead.shape[0]).mul(100)

## Observation: 
As can be seen, there are quite a few columns with high number of missing data. Since there are no ways to get data back from reliable sources, we can drop all those columns that have missing values > 40%

# Drop columns that have null values > 40% or Sales generated columns

In [None]:
# Columns with too many nulls, plus columns filled in by the sales team
sparse_or_sales_cols = [
    'source', 'lead_quality', 'lead_profile',
    'asymmetrique_activity_index', 'asymmetrique_profile_index',
    'asymmetrique_activity_score', 'asymmetrique_profile_score',
    'tags', 'last_activity', 'last_notable_activity',
]
lead = lead.drop(columns = sparse_or_sales_cols)

# Preview one row to confirm the remaining columns
lead.head(1)


In [None]:
# Re-check what is left after the drop:
# percentage of missing values per remaining column
lead.isnull().sum().div(lead.shape[0]).mul(100)

## Observations
There are five columns that still have high null values: country, specialization, occupation, course_selection_reason, and city. We will look at them individually to see what can be done

## country column

In [None]:
# Share (in %) of each country value, with missing values counted as a category
lead['country'].value_counts(normalize = True, dropna = False).mul(100)

## Observation
The distribution of the data is very heavily skewed, with India + null values = 97% of the total. It is safe to drop this column.

In [None]:
# country is dropped: one value plus nulls account for almost all rows
lead = lead.drop(columns = ['country'])

## course_selection_reason column

In [None]:
# Share (in %) of each course_selection_reason value, nulls included
lead['course_selection_reason'].value_counts(normalize = True, dropna = False).mul(100)

## Observation
The distribution of the data is very heavily skewed, with Better career prospects + null values = approx 100% of the total. It is safe to drop this column.

In [None]:
# course_selection_reason is dropped: one value plus nulls cover nearly all rows
lead = lead.drop(columns = ['course_selection_reason'])

## occupation column

In [None]:
# Share (in %) of each occupation value, nulls included
lead['occupation'].value_counts(normalize = True, dropna = False).mul(100)

## Observation
For occupation, we can first combine categories, and then impute proportionally to maintain the distribution and not introduce bias

In [None]:
# Merge the sparsely represented occupations into a single bucket.
# Missing occupations do not match and are left as they are.
minor_occupations = ['Student', 'Other', 'Housewife', 'Businessman']
lead.loc[lead['occupation'].isin(minor_occupations), 'occupation'] = 'Student and Others'

In [None]:
# Share (in %) of each occupation among the non-null entries only
lead['occupation'].value_counts(normalize = True).mul(100)

In [None]:
# impute proportionately
# Missing occupations are filled with random draws that follow the observed
# (non-null) category shares, so the existing distribution is preserved.
# - The shares are computed from the data rather than typed in by hand.
# - A locally seeded generator makes the draw repeatable across kernel restarts.
# - Values are written only to the missing rows via a boolean mask, so the
#   result does not depend on how the frame's index is labelled.
occupation_missing = lead['occupation'].isna()
if occupation_missing.any():
    occupation_shares = lead['occupation'].value_counts(normalize = True)
    occupation_rng = np.random.default_rng(42)
    lead.loc[occupation_missing, 'occupation'] = occupation_rng.choice(
        occupation_shares.index.to_numpy(),
        size = int(occupation_missing.sum()),
        p = occupation_shares.to_numpy(),
    )

## specialization column

In [None]:
# Share (in %) of each specialization value, nulls included
lead['specialization'].value_counts(normalize = True, dropna = False).mul(100)

## Observation
For specialization, we can first combine categories based on the course type, and then impute proportionally to maintain the distribution and not introduce bias

In [None]:
# Collapse the individual specializations into three broad course families.
# Rows with a missing specialization match no list and are left untouched.
specialization_groups = {
    # all management courses
    'Management Specializations': [
        'Finance Management', 'Human Resource Management', 'Marketing Management',
        'Operations Management', 'IT Projects Management', 'Supply Chain Management',
        'Healthcare Management', 'Hospitality Management', 'Retail Management',
    ],
    # all business courses
    'Business Specializations': [
        'Business Administration', 'International Business',
        'Rural and Agribusiness', 'E-Business',
    ],
    # all industry courses
    'Industry Specializations': [
        'Banking, Investment And Insurance', 'Media and Advertising',
        'Travel and Tourism', 'Services Excellence', 'E-COMMERCE',
    ],
}

for group_label, members in specialization_groups.items():
    lead.loc[lead['specialization'].isin(members), 'specialization'] = group_label

In [None]:
# Share (in %) of each specialization group among the non-null entries only
lead['specialization'].value_counts(normalize = True).mul(100)

In [None]:
# impute proportionately
# Missing specializations are filled with random draws that follow the observed
# (non-null) category shares, so the existing distribution is preserved.
# - The shares are computed from the data rather than typed in by hand.
# - A locally seeded generator makes the draw repeatable across kernel restarts.
# - Values are written only to the missing rows via a boolean mask, so the
#   result does not depend on how the frame's index is labelled.
specialization_missing = lead['specialization'].isna()
if specialization_missing.any():
    specialization_shares = lead['specialization'].value_counts(normalize = True)
    specialization_rng = np.random.default_rng(42)
    lead.loc[specialization_missing, 'specialization'] = specialization_rng.choice(
        specialization_shares.index.to_numpy(),
        size = int(specialization_missing.sum()),
        p = specialization_shares.to_numpy(),
    )

## city column

In [None]:
# Share (in %) of each city value, nulls included
lead['city'].value_counts(normalize = True, dropna = False).mul(100)

## Observations
We will categorize cities based on logical decisions and impute proportionately

In [None]:
# Regroup the city labels; rows with a missing city match no list and stay missing
city_groups = {
    # non-Mumbai cities that are still in Maharashtra
    'Non-Mumbai Maharashtra Cities': ['Thane & Outskirts', 'Other Cities of Maharashtra'],
    # every other city
    'Non-Maharashtra Cities': ['Other Cities', 'Other Metro Cities', 'Tier II Cities'],
}

for group_label, members in city_groups.items():
    lead.loc[lead['city'].isin(members), 'city'] = group_label

In [None]:
# Share (in %) of each city group among the non-null entries only
lead['city'].value_counts(normalize = True).mul(100)

In [None]:
# impute proportionately
# Missing cities are filled with random draws that follow the observed
# (non-null) category shares, so the existing distribution is preserved.
# - The shares are computed from the data rather than typed in by hand.
# - A locally seeded generator makes the draw repeatable across kernel restarts.
# - Values are written only to the missing rows via a boolean mask, so the
#   result does not depend on how the frame's index is labelled.
city_missing = lead['city'].isna()
if city_missing.any():
    city_shares = lead['city'].value_counts(normalize = True)
    city_rng = np.random.default_rng(42)
    lead.loc[city_missing, 'city'] = city_rng.choice(
        city_shares.index.to_numpy(),
        size = int(city_missing.sum()),
        p = city_shares.to_numpy(),
    )

## Handle categorical columns with low number of missing values and low representation of categories
In this step, we will go through the rest of the categorical columns one by one and

    Merge categories that have low representation
    Impute the missing values

In [None]:
# Percentage of missing values per column at this stage
lead.isnull().sum().div(lead.shape[0]).mul(100)

In [None]:
# Print the number of distinct values of every object-dtype column
object_cardinality = lead.select_dtypes(include='object').nunique()
for column_name, n_unique in object_cardinality.items():
    print('{} = {}'.format(column_name, n_unique))

## Observation
As can be seen from the above output, the categorical columns (i.e. number of unique values > 2) are:

* lead_origin
* lead_source

## lead_origin column

In [None]:
# Share (in %) of each lead_origin value, nulls included
lead['lead_origin'].value_counts(normalize = True, dropna = False).mul(100)

In [None]:
# Several origins are too rare to be decisive on their own; merge them into one bucket
minor_origins = ['Lead Import', 'Quick Add Form', 'Lead Add Form']
lead.loc[lead['lead_origin'].isin(minor_origins), 'lead_origin'] = 'Lead Add Form and Others'

## lead_source column

In [None]:
# Share (in %) of each lead_source value, nulls included
lead['lead_source'].value_counts(normalize = True, dropna = False).mul(100)

In [None]:
# Let's impute the missing values with the mode of the column.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on an attribute-accessed column is chained assignment, which pandas warns
# about and which leaves `lead` unchanged when Copy-on-Write is enabled.
lead['lead_source'] = lead['lead_source'].fillna(lead['lead_source'].mode()[0])

In [None]:
# Keep the five named sources as they are and pool every other value
# under a single 'Other Social Sites' label
major_sources = ['Google', 'Direct Traffic', 'Olark Chat', 'Organic Search', 'Reference']
is_major_source = lead['lead_source'].isin(major_sources)
lead['lead_source'] = lead['lead_source'].where(is_major_source, 'Other Social Sites')

## Handle Binary columns
* Drop those columns that have significant data imbalance
* Drop all those columns that have only 1 unique entry

In [None]:
# Print the number of distinct values of every object-dtype column
object_cardinality = lead.select_dtypes(include='object').nunique()
for column_name, n_unique in object_cardinality.items():
    print('{} = {}'.format(column_name, n_unique))

## Observation

The following columns can be dropped as they have just 1 unique values
* magazine
* courses_updates
* supply_chain_content_updates
* dm_content_updates
* cheque_payment

Let's now check the data imbalance for the rest of the columns

In [None]:
# Remaining binary columns, gathered in their own dataframe
binary_cols = ['do_not_email', 'do_not_call', 'search', 'newspaper_article', 'x_education_forums',
               'newspaper', 'digital_advertisement', 'through_recommendations', 'mastering_interview']
lead_bin = lead[binary_cols]

# Print the percentage split of each binary column, separated by a blank line
for column_name in lead_bin.columns:
    print(lead_bin[column_name].value_counts(normalize = True) * 100)
    print()

## Observations
Because of heavy data imbalance, we can drop the following columns as well

* do_not_call
* search
* newspaper_article
* x_education_forums
* newspaper
* digital_advertisement
* through_recommendations

In [None]:
# Heavily imbalanced binary columns, followed by the single-valued columns
drop_bin = [
    'do_not_call', 'search', 'newspaper_article', 'x_education_forums',
    'newspaper', 'digital_advertisement', 'through_recommendations',
    'magazine', 'courses_updates', 'supply_chain_content_updates',
    'dm_content_updates', 'cheque_payment',
]

lead = lead.drop(columns = drop_bin)

# Handle Numerical columns
## lead_number column: change datatype
The lead_number column is a unique identifier for each lead. Therefore, aggregations won't be of any relevance. We should change it to object

In [None]:
# Store the identifier as object so it is not treated as a numeric feature
lead['lead_number'] = lead['lead_number'].astype('object')

## total_visits column
For this column, we need to handle the missing values, and can convert the datatype to integer since visits can't be decimal

In [None]:
# Fill missing visit counts with the column median, then store them as integers.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on an attribute-accessed column is chained assignment, which leaves `lead`
# unchanged when pandas Copy-on-Write is enabled (the int cast would then fail
# on the remaining NaNs).
total_visits_median = lead['total_visits'].median()
lead['total_visits'] = lead['total_visits'].fillna(total_visits_median).astype('int')

## page_views_per_visit column
### Handle missing values

In [None]:
# Fill missing page-views-per-visit values with the column median.
# Assigned back explicitly instead of fillna(..., inplace=True) on an
# attribute-accessed column, which is chained assignment and a no-op under
# pandas Copy-on-Write.
lead['page_views_per_visit'] = lead['page_views_per_visit'].fillna(lead['page_views_per_visit'].median())

In [None]:
# Re-inspect the columns, dtypes and non-null counts after the cleaning steps above
lead.info()

# Exploratory Data Analysis
## Numerical columns

In [None]:
# Use the ggplot look for this and all following figures
plt.style.use('ggplot')

# (column, panel title) for each numerical column whose distribution we inspect
hist_panels = [
    ('total_visits', 'Total website visits'),
    ('time_on_website', 'Time spent on website'),
    ('page_views_per_visit', 'Average number of page views per visit'),
]

# One 20-bin histogram per column, laid out on a 2x2 grid (last slot stays empty)
fig = plt.figure(figsize = (14, 10))
for position, (column_name, panel_title) in enumerate(hist_panels, start = 1):
    plt.subplot(2, 2, position)
    plt.hist(lead[column_name], bins = 20)
    plt.title(panel_title)

plt.show()

## Observations

High peaks and skewed data. There might be a possibility of outliers. We will check them next

In [None]:
# Pairwise correlation between the three numerical features, drawn as an annotated heatmap
numeric_corr = lead[['total_visits', 'time_on_website', 'page_views_per_visit']].corr()

plt.figure(figsize = (14,12))
sns.heatmap(numeric_corr, cmap="YlGnBu", annot = True)
plt.show()

## Observations: 
No significant correlation such that columns can be dropped

In [None]:
# One horizontal box plot per numerical column, stacked vertically.
# The series is passed as x= explicitly: a positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot depending on the installed version.
fig, axes = plt.subplots(3, 1, figsize = (10, 14))

for ax, column_name in zip(axes, ['total_visits', 'time_on_website', 'page_views_per_visit']):
    sns.boxplot(x = lead[column_name], ax = ax)

plt.show()

## Observations

Looking at both the box plots and the statistics, there are upper bound outliers in both total_visits and page_views_per_visit columns. We can also see that the data can be capped at 99 percentile.

## Categorical columns


### Lead Origin

In [None]:
plt.figure(figsize = (14, 8))

# Number of leads per lead_origin category, largest first
lead_origin_counts = lead.groupby('lead_origin')['lead_number'].count().sort_values(ascending = False)

# One colour per bar: the palette is sized by the number of categories plotted,
# not by the number of rows in `lead`
lead_origin_counts.plot(kind = 'barh', width = 0.8, edgecolor = 'black',
                        color = plt.cm.Paired(np.arange(len(lead_origin_counts))))
plt.title('Number of leads by lead origin')
plt.xlabel('Number of leads')
plt.show()


## Lead Source

In [None]:
plt.figure(figsize = (14, 8))

# Number of leads per lead_source category, largest first
lead_source_counts = lead.groupby('lead_source')['lead_number'].count().sort_values(ascending = False)

# One colour per bar: the palette is sized by the number of categories plotted,
# not by the number of rows in `lead`
lead_source_counts.plot(kind = 'barh', width = 0.8, edgecolor = 'black',
                        color = plt.cm.Paired(np.arange(len(lead_source_counts))))
plt.title('Number of leads by lead source')
plt.xlabel('Number of leads')
plt.show()

## Specialization

In [None]:
plt.figure(figsize = (10, 8))

# Number of leads per specialization group, largest first
specialization_counts = lead.groupby('specialization')['lead_number'].count().sort_values(ascending = False)

# One colour per bar: the palette is sized by the number of categories plotted,
# not by the number of rows in `lead`
specialization_counts.plot(kind = 'barh', width = 0.8, edgecolor = 'black',
                           color = plt.cm.Paired(np.arange(len(specialization_counts))))
plt.title('Number of leads by specialization')
plt.xlabel('Number of leads')
plt.show()

Most of the specializations taken are management

## Occupation

In [None]:
plt.figure(figsize = (14, 8))

# Number of leads per occupation category, largest first
occupation_counts = lead.groupby('occupation')['lead_number'].count().sort_values(ascending = False)

# One colour per bar: the palette is sized by the number of categories plotted,
# not by the number of rows in `lead`
occupation_counts.plot(kind = 'barh', width = 0.8, edgecolor = 'black',
                       color = plt.cm.Paired(np.arange(len(occupation_counts))))
plt.title('Number of leads by occupation')
plt.xlabel('Number of leads')
plt.show()

Unemployed users are the most significant leads

## City

In [None]:
plt.figure(figsize = (14, 8))

# Number of leads per city group, largest first
city_counts = lead.groupby('city')['lead_number'].count().sort_values(ascending = False)

# One colour per bar: the palette is sized by the number of categories plotted,
# not by the number of rows in `lead`
city_counts.plot(kind = 'barh', width = 0.8, edgecolor = 'black',
                 color = plt.cm.Paired(np.arange(len(city_counts))))
plt.title('Number of leads by city')
plt.xlabel('Number of leads')
plt.show()

Mumbai in particular, and Maharashtra in general, dominate the leads. This is likely due to the fact that the courses are based in Mumbai

In [None]:
plt.figure(figsize = (14, 8))

# Number of leads per do_not_email value, largest first
do_not_email_counts = lead.groupby('do_not_email')['lead_number'].count().sort_values(ascending = False)

# One colour per bar: the palette is sized by the number of categories plotted,
# not by the number of rows in `lead`
do_not_email_counts.plot(kind = 'barh', width = 0.8, edgecolor = 'black',
                         color = plt.cm.Paired(np.arange(len(do_not_email_counts))))
plt.title('Number of leads by do_not_email')
plt.xlabel('Number of leads')
plt.show()

# DATA Preparation

## Converting Binary (Yes/No) to 0/1

In [None]:
# Print the number of distinct values of every object-dtype column
object_cardinality = lead.select_dtypes(include='object').nunique()
for column_name, n_unique in object_cardinality.items():
    print('{} = {}'.format(column_name, n_unique))

We have two binary columns: do_not_email, mastering_interview

In [None]:
# Columns holding Yes/No answers that are to be encoded numerically
binlist = ['do_not_email', 'mastering_interview']


def binary_map(x):
    """Translate a Series of 'Yes'/'No' strings into 1/0.

    Any value other than 'Yes' or 'No' has no entry in the lookup
    and therefore comes back as NaN.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

# Encode each Yes/No column named in binlist as 1/0, one column at a time
for flag_column in binlist:
    lead[flag_column] = binary_map(lead[flag_column])

# Preview the frame to check that the encoding succeeded
lead.head()

## Creating dummy variable for categorical columns
### Categorical columns are: lead_origin, lead_source, specialization, occupation, city

In [None]:
# Create dummy variables for the categorical columns, dropping the first level
# of each. dtype=int pins the dummies to 0/1 integers: the default dtype depends
# on the pandas version (booleans from pandas 2.0 on), and an explicit numeric
# dtype keeps the frame purely numeric for later numeric processing.
dummy1 = pd.get_dummies(lead[['lead_origin', 'lead_source', 'specialization', 'occupation', 'city']],
                        drop_first = True, dtype = int)

# Adding the results to the master dataframe
lead = pd.concat([lead, dummy1], axis=1)

In [None]:
# The original categorical columns are now represented by their dummies; remove them
encoded_cols = ['lead_origin', 'lead_source', 'specialization', 'occupation', 'city']
lead = lead.drop(columns = encoded_cols)

lead.head()

# Outliers Treatment

In [None]:
# The three numerical features that are checked for outliers
numeric_features = ['total_visits', 'time_on_website', 'page_views_per_visit']
num_cols = lead[numeric_features]

# Summary including the 25%, 50%, 75%, 90%, 95% and 99% percentiles
outlier_percentiles = [.25, .5, .75, .90, .95, .99]
num_cols.describe(percentiles=outlier_percentiles)

In [None]:
# capping at 99 percentile
# Every value at or above the column's 99th percentile is set to that percentile.
# clip() + assignment back to the column replaces `lead.col.loc[mask] = value`,
# which is chained assignment: it triggers SettingWithCopyWarning and does not
# modify `lead` when pandas Copy-on-Write is enabled.
for capped_column in ['total_visits', 'page_views_per_visit']:
    upper_cap = lead[capped_column].quantile(0.99)
    lead[capped_column] = lead[capped_column].clip(upper = upper_cap)

In [None]:
# Box plots of the two capped columns, to inspect the effect of the capping.
# The series is passed as x= explicitly: a positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot depending on the installed version.
fig, axes = plt.subplots(2, 1, figsize = (10, 14))

for ax, column_name in zip(axes, ['total_visits', 'page_views_per_visit']):
    sns.boxplot(x = lead[column_name], ax = ax)

plt.show()

As we can see, we were able to significantly reduce the number of outliers by capping