In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy             as np 
import pandas            as pd 
import matplotlib.pyplot as plt
import seaborn           as sns

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### 1. IMPORTING DATA AND ANALYZING

In [None]:
#Importing the data into the environment
leads = pd.read_csv(r'../input/leadscore/Leads.csv')
pd.set_option('display.max_columns',None)
leads.head()

In [None]:
#Analyzing the data 
leads.info()

In [None]:
#Analyzing the null values 
import missingno as msno
msno.matrix(leads)
msno.bar(leads)

### 2. DATA CLEANING

In [None]:
#Function to check the null values in terms of percentage
def null_values_check(leads):
    """Report the share of missing values per column, largest first.

    Parameters
    ----------
    leads : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``index`` (the original column name) and
        ``Null_values_percentage`` (missing values as a percentage of the
        row count, rounded to 2 decimals), sorted in descending order.
    """
    missing_share = leads.isnull().sum() / len(leads) * 100
    report = missing_share.round(2).to_frame()
    # An unnamed Series becomes a frame whose single column is labelled 0.
    report = report.rename(columns={0: 'Null_values_percentage'})
    report = report.reset_index()
    return report.sort_values(by='Null_values_percentage', ascending=False)

In [None]:
#Function to impute the null values with most frequent values 
def impute(df):
    """Fill missing values in every column with that column's most frequent value.

    The previous implementation rebuilt the frame from the NumPy array
    returned by ``SimpleImputer``, which turned every column into ``object``
    dtype and discarded the original index. Filling with the per-column mode
    directly in pandas keeps both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, index and dtypes, where each
        missing value is replaced by the column mode. When several values
        tie for most frequent, the smallest one is used. Columns that hold
        no observed value at all are left untouched.
    """
    # DataFrame.mode() returns the modes sorted, one row per tie; row 0 is the
    # smallest most-frequent value of each column.
    modes = df.mode(dropna=True)
    if modes.empty:
        # No rows/columns, or nothing observed anywhere: nothing to impute with.
        return df.copy()
    return df.fillna(modes.iloc[0])

In [None]:
#Checking the null values 
null_values_check(leads)[:17]

In [None]:
#Eliminating the columns having 45 or more than 45% of the null values 
print('The shape of leads df before deleting columns:{}'.format(leads.shape))
col_eliminated = ['Lead Quality','Asymmetrique Activity Index','Asymmetrique Profile Score',
                 'Asymmetrique Activity Score','Asymmetrique Profile Index']
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
leads = leads.drop(columns=col_eliminated, errors='ignore')
print('The shape of leads df after deleting columns:{}'.format(leads.shape))

In [None]:
#Analysing the tag,Tags, Lead Profile, What matters most to you in choosing a course,
#What is your current occupation,Country,How did you hear about X Education,Specialization,City
leads['Lead Profile'].value_counts()

In [None]:
leads['What matters most to you in choosing a course'].value_counts()

In [None]:
leads['What is your current occupation'].value_counts()

In [None]:
leads['Country'].value_counts()[:13]

In [None]:
leads['How did you hear about X Education'].value_counts()

In [None]:
leads['Specialization'].value_counts()

In [None]:
leads['City'].value_counts()

#### IMPUTING the null values with most frequent values

In [None]:
#Imputing the null values with the most frequently occurring value of each column
leads = impute(leads)

In [None]:
#Checking the null values again to verify  
null_values_check(leads)[:5]

### 3. EXPLORATORY DATA ANALYSIS

In [None]:
#Checking the target column to check if our data is balanced or imbalanced 
#Exploring the Target variable
sns.countplot(x='Converted',data=leads);

In [None]:
#Checking the percentage the target values
round(leads['Converted'].value_counts()/len(leads['Converted'])*100,2)

1. #### We can observe that the dataset is imbalanced: the converted leads are far fewer than the not-converted leads. We will handle this later.

In [None]:
#Function to analyse the categorical variables wrt target variable
def eda(col_name1,col_name2,df,l,b):
    """Draw a count plot of one column, split by a second column.

    Parameters
    ----------
    col_name1 : str
        Column shown on the x axis (one bar group per distinct value).
    col_name2 : str
        Column used as the hue, i.e. one bar per value inside each group.
    df : pandas.DataFrame
        Data source for the plot.
    l, b : number
        Figure width and height passed to ``figsize``.
    """
    plt.figure(figsize=(l, b))
    axis = sns.countplot(data=df, x=col_name1, hue=col_name2)
    # Category names can be long, so the tick labels are turned vertical.
    tick_labels = axis.get_xticklabels()
    axis.set_xticklabels(labels=tick_labels, rotation=90)
    plt.legend(loc='upper right')

In [None]:
eda('Lead Origin','Converted',leads,13,6)

In [None]:
eda('Lead Source','Converted',leads,13,6)

In [None]:
eda('Country','Converted',leads,14,6)

In [None]:
#Function to analyse the categorical variables wrt target variable
def analysing(col_name,df,target='Converted'):
    """Summarise how a binary target is distributed inside each category.

    Parameters
    ----------
    col_name : str
        Categorical column whose distinct values are analysed.
    df : pandas.DataFrame
        Frame containing ``col_name`` and ``target``.
    target : str, default 'Converted'
        Column holding the 0/1 outcome.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``col_name`` (in order of first
        appearance) with the columns ``col_name``, ``1_per``, ``0_per``
        (percentages rounded to 2 decimals), ``1_count``, ``0_count`` and
        ``Total``.
    """
    column        = df[col_name]
    # The target masks do not depend on the category, so build them once.
    converted     = df[target] == 1
    not_converted = df[target] == 0

    records = []
    for value in column.unique():
        # `== NaN` never matches, which used to give Total == 0 and a
        # ZeroDivisionError; missing values need an explicit isna() mask.
        mask  = column.isna() if pd.isna(value) else column == value
        total = int(mask.sum())
        ones  = int((mask & converted).sum())
        zeros = int((mask & not_converted).sum())
        records.append({col_name  : value,
                        '1_per'   : round((ones / total) * 100, 2),
                        '0_per'   : round((zeros / total) * 100, 2),
                        '1_count' : ones,
                        '0_count' : zeros,
                        'Total'   : total})
    # Passing `columns` keeps the layout stable even when there are no rows.
    return pd.DataFrame(records, columns=[col_name,'1_per','0_per','1_count','0_count','Total'])

In [None]:
Analysis_1 = analysing('Country',leads)
Analysis_1.sort_values(by='Total',ascending=False)[:15]

### The reason for analysing the percentage of lead conversion per country is that some information was lost in the graph.

### E.g. the bar for the UNITED STATES is not even visible in the graph, yet it is the second-highest country that leads come from, followed by the UAE.


In [None]:
eda('Specialization','Converted',leads,14,6)

In [None]:
Analysis_1 = analysing('Specialization',leads)
Analysis_1.sort_values(by='1_count',ascending=False)

In [None]:
eda('How did you hear about X Education','Converted',leads,14,6)

In [None]:
eda('What is your current occupation','Converted',leads,14,6)

In [None]:
eda('What matters most to you in choosing a course','Converted',leads,10,6)

In [None]:
Analysis_1 = analysing('What matters most to you in choosing a course',leads)
Analysis_1.sort_values(by='1_count',ascending=False)

### It is evident that more than 99% of students have the same reason to join the course, so this column will not help us in decision making; we will eliminate it.

In [None]:
eda('Tags','Converted',leads,14,6)

In [None]:
Analysis_1 = analysing('Tags',leads)
Analysis_1.sort_values(by='Total',ascending=False)

### Columns like this will be useful in decision making as they have various values & conversion rate differs for each one of them.

In [None]:
eda('Receive More Updates About Our Courses','Converted',leads,6,6)

In [None]:
eda('Update me on Supply Chain Content','Converted',leads,6,6)

In [None]:
eda('Get updates on DM Content','Converted',leads,6,6)

In [None]:
eda('I agree to pay the amount through cheque','Converted',leads,6,6)

### We can observe that the above columns have only one value, which will not help us in decision making, so we need to eliminate these columns.

In [None]:
eda('Lead Profile','Converted',leads,9,6)

In [None]:
eda('City','Converted',leads,9,6)

In [None]:
eda('A free copy of Mastering The Interview','Converted',leads,9,6)

In [None]:
eda('Last Notable Activity','Converted',leads,11,6)

In [None]:
Analysis_1 = analysing('Last Notable Activity',leads)
Analysis_1.sort_values(by='Total',ascending=False)

In [None]:
eda('Last Activity','Converted',leads,11,6)

In [None]:
Analysis_1 = analysing('Last Activity',leads)
Analysis_1.sort_values(by='Total',ascending=False)

### The columns 'LAST NOTABLE ACTIVITY' & 'LAST ACTIVITY' have similar values but different outcomes, so we can retain them both even though they are similar.

In [None]:
eda('Through Recommendations','Converted',leads,11,6)

In [None]:
Analysis_1 = analysing('Through Recommendations',leads)
Analysis_1.sort_values(by='Total',ascending=False)

### It is evident that more than 95% of students have the same value, so this column will not help us in decision making; we will eliminate it.

In [None]:
eda('Do Not Email','Converted',leads,6,6)

In [None]:
Analysis_1 = analysing('Do Not Email',leads)
Analysis_1.sort_values(by='Total',ascending=False)

### We can observe that 92% of the values belong to one type, so this column would make our model biased; it is better to eliminate it.

In [None]:
eda('Do Not Call','Converted',leads,6,6)

In [None]:
Analysis_1 = analysing('Do Not Call',leads)
Analysis_1.sort_values(by='Total',ascending=False)

### We can observe that 99% of the values belong to one type, so this column would make our model biased; it is better to eliminate it because it won't help in any decision making.

In [None]:
eda('TotalVisits','Converted',leads,16,6)

In [None]:
eda('Search','Converted',leads,6,6)

In [None]:
eda('Magazine','Converted',leads,6,6)

In [None]:
eda('Newspaper Article','Converted',leads,6,6)

In [None]:
eda('X Education Forums','Converted',leads,6,6)

In [None]:
eda('Newspaper','Converted',leads,6,6)

In [None]:
eda('Digital Advertisement','Converted',leads,6,6)

In [None]:
#We will analyse the columns under 
#Indicating whether the customer had seen the ad in any of the listed items.
#with a different method
leads['ad'] = leads['Digital Advertisement'] + leads['Newspaper'] + leads['X Education Forums']+ leads['Newspaper Article'] + leads['Magazine'] + leads['Search']
leads['ad'].value_counts()

### It is evident that all of these columns consist almost entirely of "No" values, so they won't help us in any decision making. As they are redundant, we will eliminate all of them at once.

### We can also see that the Newspaper and Newspaper Article columns carry the same information: it is obvious that if a customer has seen the ad in a newspaper article, it has come from a newspaper.

In [None]:
col_to_be_eliminated = ['What matters most to you in choosing a course','Receive More Updates About Our Courses',
                       'Update me on Supply Chain Content','Get updates on DM Content',
                       'I agree to pay the amount through cheque','Through Recommendations','Do Not Email',
                       'Do Not Call','Digital Advertisement','Newspaper','X Education Forums',
                       'Newspaper Article','Magazine','Search','ad']
print('The shape of df before deleting reduntant columns: {}'.format(leads.shape))
print('The no of columns to be removed:{}'.format(len(col_to_be_eliminated)))
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
leads = leads.drop(columns=col_to_be_eliminated, errors='ignore')
print('The shape of df after  deleting reduntant columns: {}'.format(leads.shape))

In [None]:
#We are finally left with only 18 columns lets have a look at the df
leads.head()

## 4. DATA PRE-PROCESSING 

### Converting all the numeric columns to Numeric datatype


In [None]:
def _to_numeric_if_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    # `pd.to_numeric(..., errors='ignore')` is deprecated in recent pandas;
    # catching the parsing error explicitly gives the same "leave it as it is"
    # behaviour for text columns.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

leads = leads.apply(_to_numeric_if_possible)

### Handling Categorical Values

In [None]:
#Logistic regression needs numeric inputs, so every categorical (object dtype)
#column is replaced by integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_list        = leads.select_dtypes('O').columns

#Printing the no of categorical columns before conversion
print('Before LabelEncoding')
print('The no of categorical columns in dataset are {}'.format(len(cat_list)))

#Converting the cat columns into numerical (the encoder is re-fitted for each column)
for column_name in cat_list:
    encoded_values     = le.fit_transform(leads[column_name])
    leads[column_name] = encoded_values

#Printing the no of categorical columns after conversion
remaining_cat_columns = leads.select_dtypes('O').columns
print('---'*30)
print('After LabelEncoding')
print('The no of categorical columns in dataset are {}'.format(len(remaining_cat_columns)))


In [None]:
#Analyzing the column once more 
leads.head()

### Splitting the data into Train and Test 

In [None]:
#Splitting the data 
# X holds every column listed in col_list, y holds the 'Converted' target.
# 40% of the rows are held out as the test set; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
# NOTE(review): 'Prospect ID' and 'Lead Number' look like row identifiers rather
# than predictive features - confirm that they are meant to be part of X.
col_list = ['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source','TotalVisits', 
          'Total Time Spent on Website', 'Page Views Per Visit','Last Activity', 'Country', 
          'Specialization','How did you hear about X Education', 'What is your current occupation',
          'Tags', 'Lead Profile', 'City','A free copy of Mastering The Interview', 
          'Last Notable Activity'] 

# .copy() so that later changes to X / y cannot write back into `leads`.
X = leads[col_list].copy()
y = leads['Converted'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

### Handling the Data Imbalance in the dataset using SMOTE technique

In [None]:
#Oversampling the training split to get better results
from imblearn.over_sampling import SMOTE
# The sampler is called `smote`, not `sm`: `sm` is bound to statsmodels.api
# further down, and sharing one name for both makes cells order-dependent.
smote = SMOTE(random_state = 33)
# .to_numpy() replaces the deprecated Series.ravel(); y_train_new is still returned as a NumPy array.
X_train_new, y_train_new = smote.fit_resample(X_train, y_train.to_numpy())
pd.Series(y_train_new).value_counts().plot.bar();

1. #### We can observe that the TARGET variable now has equal classes. This will help our model avoid being biased towards one class.

### Standardizing all the numerical features

In [None]:
#Standardising the values 
# The scaler is fitted on the resampled training features only and then applied
# unchanged to the test features.
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

# The scaled arrays are wrapped back into DataFrames that reuse the original
# column names and index.
X_train_new = pd.DataFrame(SS.fit_transform(X_train_new), columns=X_train_new.columns, index=X_train_new.index)
X_test      = pd.DataFrame(SS.transform(X_test), columns=X_test.columns, index=X_test.index)

## 5. MODEL BUILDING

In [None]:
#Function to create a table with pred values for LOGISTIC REGRESSION MODEL
def prediction(model_name,x_test,y_test,thre):
    """Build a table of predicted probabilities, true labels and thresholded labels.

    Parameters
    ----------
    model_name : object
        Fitted model exposing ``predict(x)`` that returns one score per row.
    x_test : array-like or DataFrame
        Features passed to ``model_name.predict``.
    y_test : array-like or Series
        True labels, in the same row order as ``x_test``.
    thre : float
        Cut-off: a row is labelled 1 when its score is strictly greater.

    Returns
    -------
    pandas.DataFrame
        Columns ``train_Prob`` (model output), ``real_op`` (true label) and
        ``pred_op`` (0/1 label after applying ``thre``).

    Raises
    ------
    ValueError
        If ``y_test`` and the predictions differ in length.
    """
    probabilities = model_name.predict(x_test)
    # Rows of y_test correspond to rows of x_test by position. Assigning a
    # Series would align on index labels instead, which silently yields NaN or
    # shuffled labels whenever predict() returns a plain array.
    actual = np.asarray(y_test)
    if len(actual) != len(probabilities):
        raise ValueError('y_test has {} rows but the model returned {} predictions'
                         .format(len(actual), len(probabilities)))
    y_pred_final            = pd.DataFrame({'train_Prob':probabilities})
    y_pred_final['real_op'] = actual
    y_pred_final['pred_op'] = (y_pred_final['train_Prob'] > thre).astype(int)
    return y_pred_final

In [None]:
#Function to Evaluate LOGISTIC REGRESSION MODEL based on various parameters
def validating_lr(y_real,y_pred):
    """Plot the confusion matrix and print accuracy, TPR and FPR for binary labels.

    Parameters
    ----------
    y_real : array-like of 0/1
        True labels.
    y_pred : array-like of 0/1
        Predicted labels (already thresholded).

    Notes
    -----
    Prints the accuracy, the true positive rate ``TP / (TP + FN)`` and the
    false positive rate ``FP / (FP + TN)``, all as percentages. A rate whose
    denominator is zero is reported as ``nan``.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score
    # scikit-learn expects (y_true, y_pred): rows are the actual classes and
    # columns the predicted ones. The arguments used to be swapped, so the
    # FP and FN cells were exchanged and the printed "false positive rate"
    # was really FN / (TN + FN).
    # labels=[0, 1] keeps the matrix 2x2 even when one class is absent.
    confusion = confusion_matrix(y_real,y_pred,labels=[0, 1])
    ax = sns.heatmap(confusion,annot=True,fmt='',cmap='Blues')
    ax.set(xlabel='Predicted', ylabel='Actual')
    print('Accuracy Score',(accuracy_score(y_real,y_pred)*100))
    TN = confusion[0,0] # true negatives  (actual 0, predicted 0)
    FP = confusion[0,1] # false positives (actual 0, predicted 1)
    FN = confusion[1,0] # false negatives (actual 1, predicted 0)
    TP = confusion[1,1] # true positives  (actual 1, predicted 1)
    TPR = round(((TP / float(TP+FN)*100)),2) if (TP+FN) else float('nan')
    FPR = round(((FP / float(FP+TN)*100)),2) if (FP+TN) else float('nan')
    print('True Positive rate                         :{}'.format(TPR))
    print('False postive rate(predicting 1 when its 0):{}'.format(FPR))
    print('\n')

In [None]:
#Function to Plot the ROC curve & find the optimal threshold value for LOGISTIC REGRESSION MODEL
def draw_roc( actual, probs ):
    """Plot the ROC curve for a binary classifier and show its AUC in the legend.

    Parameters
    ----------
    actual : array-like of 0/1
        True labels.
    probs : array-like of float
        Scores or probabilities for the positive class. Passing already
        thresholded 0/1 labels collapses the curve to a single operating
        point, so pass the raw model output.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    from sklearn.metrics import roc_curve,roc_auc_score
    # drop_intermediate=False keeps every threshold on the plotted curve.
    fpr, tpr, thresholds = roc_curve( actual, probs,drop_intermediate = False )
    auc_score = roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    # Two unreachable statements that followed this return (they referenced an
    # undefined `y_train_pred_final`) were removed.
    return None

In [None]:
#Function to check the VIF for a set of features
def vif_validation(X_train):
    """Compute the variance inflation factor (VIF) of every column.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; each column is scored against all the others.

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` and ``VIF`` (rounded to 2 decimals), sorted from
        the highest VIF to the lowest.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    design_matrix = X_train.values
    n_features    = X_train.shape[1]
    scores = [variance_inflation_factor(design_matrix, position) for position in range(n_features)]
    table  = pd.DataFrame({'Features': X_train.columns, 'VIF': scores})
    table['VIF'] = round(table['VIF'], 2)
    return table.sort_values(by = "VIF", ascending = False)

In [None]:
#Model building using statsmodel & checking the performance
#MODEL NO 1
# From here on `sm` refers to statsmodels.api.
import statsmodels.api as sm

#Features for model no 1
# Every column of the training frame except 'Prospect ID'.
fcol_list = ['Lead Number', 'Lead Origin', 'Lead Source','TotalVisits', 
          'Total Time Spent on Website', 'Page Views Per Visit','Last Activity', 'Country', 
          'Specialization','How did you hear about X Education', 'What is your current occupation',
          'Tags', 'Lead Profile', 'City','A free copy of Mastering The Interview', 
          'Last Notable Activity']

#Adding constants 
# add_constant prepends the intercept column; a GLM with the Binomial family
# is fitted on the resampled, standardised training data.
X_train_new_sm = sm.add_constant(X_train_new[fcol_list])
lr   = sm.GLM(y_train_new,X_train_new_sm, family = sm.families.Binomial())
lr_1 = lr.fit() 
lr_1.summary()

### EVALUATION FOR MODEL NO 1

In [None]:
#Evaluting MODEL NO 1 on TRAIN dataset:
pred_df = prediction(lr_1,X_train_new_sm,y_train_new,0.4)

#Evaluating MODEL NO 1
validating_lr(pred_df['real_op'],pred_df['pred_op'])

In [None]:
#Evaluting MODEL NO 1 on Test dataset:
X_test_new    = X_test[fcol_list].copy()
X_test_new_sm = sm.add_constant(X_test_new )

#Predicting the values for MODEL NO 1
pred_df = prediction(lr_1,X_test_new_sm,y_test,0.4)

#Checking the Evaluation parameters
validating_lr(pred_df['real_op'],pred_df['pred_op'])

### BUILDING MODEL NO 2 BY ELIMINATING THE INSIGNIFICANT FEATURES

In [None]:
#Building MODEL NO 2 by eliminating insignificant Features
import statsmodels.api as sm

#We will eliminate columns like 'Country','How did you hear about X Education'
#as the pvalue is more than 0.05

fcol_list = ['Lead Number', 'Lead Origin', 'Lead Source','TotalVisits', 
          'Total Time Spent on Website', 'Page Views Per Visit','Last Activity', 
          'Specialization','What is your current occupation',
          'Tags', 'Lead Profile', 'City','A free copy of Mastering The Interview', 
          'Last Notable Activity']

#Adding constants & model building
# Subset the full training frame (X_train_new), not the previous model's
# design matrix: the latter shrinks with every model, so this cell would raise
# a KeyError when re-run after a later model has dropped more columns.
X_train_new_sm = sm.add_constant(X_train_new[fcol_list])
lr   = sm.GLM(y_train_new,X_train_new_sm, family = sm.families.Binomial())
lr_2 = lr.fit() 
lr_2.summary()

### EVALUATION FOR MODEL NO 2

In [None]:
#Evaluting MODEL 2 on training dataset 
pred_df = prediction(lr_2,X_train_new_sm,y_train_new,0.5)

#Checking the evalution parameters 
validating_lr(pred_df['real_op'],pred_df['pred_op'])

In [None]:
#Finding the optimal threshold value of MODEL NO 2 using ROC curve
# The ROC curve must be built from the predicted probabilities ('train_Prob');
# the thresholded labels in 'pred_op' would give a single operating point.
draw_roc(pred_df['real_op'],pred_df['train_Prob'])

In [None]:
#Evaluting MODEL NO 2 on Test dataset:
X_test_new    = X_test[fcol_list].copy()
X_test_new_sm = sm.add_constant(X_test_new )

pred_df = prediction(lr_2,X_test_new_sm,y_test,0.4)

#Checking the evalution parameters 
validating_lr(pred_df['real_op'],pred_df['pred_op'])

#### 1. We will choose TPR & FPR as our metrics to evaluate the model. We have to classify the leads correctly & reduce misclassification to avoid unnecessary resource allocation.

In [None]:
#Evaluting MODEL NO 2 on Test dataset: using optimum threshold value of 0.2
X_test_new    = X_test[fcol_list].copy()
X_test_new_sm = sm.add_constant(X_test_new )

pred_df = prediction(lr_2,X_test_new_sm,y_test,0.2)

#Checking the evalution parameters 
validating_lr(pred_df['real_op'],pred_df['pred_op'])

### We are getting a 94% TPR, and amongst them only 7% are being misclassified, so overall it is a good model. Let's further optimise the model.

#### CHECKING MULTICOLINEARITY FOR FEATURES OF MODEL NO2

In [None]:
#Evaluating multicolinearity for features of MODEL NO 2
vif_validation(X_train_new_sm)

#### 1. As we saw earlier in the EDA process, the two columns '**LAST ACTIVITY**' & '**LAST NOTABLE ACTIVITY**' are almost the same.
#### 2. We did not eliminate either feature back then because we were not sure which column should be eliminated.
#### 3. Now we have checked the VIF values, and the VIF value for '**LAST ACTIVITY**' is high, so we will eliminate it & later check whether the MULTICOLLINEARITY is reduced.


#### BUILDING MODEL NO 3 BY ELIMINATING MULTICOLINEARITY IN FEATURES

In [None]:
#Building MODEL NO 3
import statsmodels.api as sm

#Lets eliminate 'LAST ACTIVITY' feature and check if the multicolinearity reduces
fcol_list = ['Lead Number', 'Lead Origin', 'Lead Source','TotalVisits', 
          'Total Time Spent on Website', 'Page Views Per Visit', 
          'Specialization','What is your current occupation',
          'Tags', 'Lead Profile', 'City','A free copy of Mastering The Interview', 
          'Last Notable Activity']

#Adding constants & Model training
# Subset the full training frame (X_train_new) rather than the previous
# model's design matrix, so the cell does not depend on which model cell ran
# last and can be re-run safely.
X_train_new_sm = sm.add_constant(X_train_new[fcol_list])
lr   = sm.GLM(y_train_new,X_train_new_sm, family = sm.families.Binomial())
lr_3 = lr.fit() 
lr_3.summary()

In [None]:
#Checking multicolinearity for features of MODEL NO 3
vif_validation(X_train_new_sm)

#### 1. We can observe that the MULTICOLLINEARITY (VIF) has been reduced below 2. We can say that our model is now the optimum model, as MULTICOLLINEARITY is eliminated & all features are significant.
#### 2. The general heuristic that we use for checking MULTICOLLINEARITY in this model-building process is a VIF threshold of 2.

In [None]:
#Evaluting MODEL NO 3 on TRAIN dataset
pred_df = prediction(lr_3,X_train_new_sm,y_train_new,0.4)

#Checking the evalution parameters 
validating_lr(pred_df['real_op'],pred_df['pred_op'])

In [None]:
#Finding the optimal threshold value of MODEL NO 3 using ROC curve
# The ROC curve must be built from the predicted probabilities ('train_Prob');
# the thresholded labels in 'pred_op' would give a single operating point.
draw_roc(pred_df['real_op'],pred_df['train_Prob'])

In [None]:
#Evaluting MODEL NO 3 on Test dataset: using optimum threshold value of 0.25
X_test_new    = X_test[fcol_list].copy()
X_test_new_sm = sm.add_constant(X_test_new )

pred_df = prediction(lr_3,X_test_new_sm,y_test,0.25)

#Checking the evalution parameters 
validating_lr(pred_df['real_op'],pred_df['pred_op'])

1. This will be our final model to be deployed in production, as we are getting a TPR of 91%.
2. In our model the FPR is only 9.51%, which means only 127 values are being misclassified.
3. Although the accuracy of the model is only 68.5%, we have chosen TPR & FPR as our evaluation metrics.
4. Our final model is the best model, as it has neither multicollinearity nor non-significant features.
5. With this model we won't spend unnecessary resources on misclassified leads.
6. Leads which are correctly classified will help in saving resources & increasing the lead conversion rate.