# Lead Scoring Assignment

### Data Sourcing

* We will import all libraries used in the entire assignment.
* We will import the file in dataframe.
* We will try to check basic information.

#### Importing libraries

In [None]:
# Importing basic libraries
import numpy as np
import pandas as pd

# Importing libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Importing libraries for model preparing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Importing library for model building
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import recall_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_recall_curve

In [None]:
# Importing libraries for removing warnings
import warnings as warnings
warnings.filterwarnings("ignore")

#### Sourcing File

In [None]:
# Loading the leads dataset from the Kaggle input directory and previewing its first rows
leads_df = pd.read_csv("../input/leads-dataset/Leads.csv")
leads_df.head()

#### Checking information

In [None]:
# Checking the shape (rows, columns) of the raw data
leads_df.shape

In [None]:
# Checking summary statistics of the raw data
leads_df.describe()

In [None]:
# Checking column names, dtypes and non-null counts
leads_df.info()

### Data Cleaning
- We will try to change 'Select' values of the columns to <i>NAN</i>.
- We will try to drop columns having 40% or more missing values.
- We will try to drop rows having 70% or more missing values.
- We will try to merge unique categories if they are multiples.
- We will impute the missing values
    * Categorical with mode value
    * Continuous with median value
- We will compare the original data with the cleaned data.

#### Dropping Duplicates & Basic Cleaning

In [None]:
# De-duplicating the raw data (the first occurrence is kept) and checking the resulting shape
leads_modified_df = leads_df.drop_duplicates()
leads_modified_df.shape

<b>Note :-</b> No duplicate values found.

In [None]:
# Dropping the identifier columns, which are of no significance to the model
leads_modified_df = leads_modified_df.drop(columns=['Prospect ID', 'Lead Number'])
leads_modified_df.shape

In [None]:
# Encoding the Yes/No flag as 1/0 (.map() leaves NaN for any value that is neither 'Yes' nor 'No')
leads_modified_df['A free copy of Mastering The Interview'] = leads_modified_df['A free copy of Mastering The Interview'].map({'Yes':1, 'No':0})
leads_modified_df['A free copy of Mastering The Interview'].value_counts()

#### Handle the “Select” level that is present in many of the categorical variables.

In [None]:
# Replacing the 'Select' placeholder with NaN so that it is treated as a missing value.
# np.nan is used because the np.NAN alias no longer exists in NumPy 2.0+.
leads_modified_df = leads_modified_df.replace('Select', np.nan)
leads_modified_df.head()

#### Drop columns that are having high percentage of missing values.

In [None]:
# Calculating the percentage of missing values per column, rounded to two decimals
round(leads_modified_df.isnull().sum() * 100 / len(leads_modified_df), 2)

In [None]:
# Function to list the columns to keep after removing those at or above a missing-value threshold
def rmissingvaluecol(dff, threshold):
    """Return the names of the columns of `dff` to retain.

    A column is flagged for removal when its percentage of missing values is
    greater than or equal to `threshold`. `dff` itself is not modified; the
    caller is expected to select the returned columns.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : int or float
        Missing-value percentage (0-100) at or above which a column is removed.

    Returns
    -------
    list
        Column names to keep, in their original order.
    """
    missing_percentage = 100 * (dff.isnull().sum() / len(dff.index))
    # Boolean mask instead of DataFrame.drop(cols, 1): the positional `axis`
    # argument is rejected by pandas 2.0+.
    drop_mask = (missing_percentage >= threshold).to_numpy()
    col = list(dff.columns[~drop_mask])
    # Listed in column order so the printed output is deterministic
    dropped = list(dff.columns[drop_mask])
    print("Columns having more than %s percent missing values: "%threshold, len(dropped))
    print("Columns to be dropped                             : ", dropped)
    return col

# Removing columns having 40% or more missing values
# (rmissingvaluecol returns the names of the columns to retain)
col = rmissingvaluecol(leads_modified_df, 40)
leads_modified_df = leads_modified_df[col]
leads_modified_df.head()

<b>Note :-</b> 7 columns removed.

#### Drop rows that are having high percentage of missing values.

In [None]:
# Deleting rows containing either 70% or more than 70% NaN Values
perc = 70.0 # Rows with this percentage of NaN values (or more) are dropped
# dropna(thresh=k) keeps rows with at least k non-null values, so min_count is set to
# one more than (100 - perc)% of the number of columns (truncated to an integer)
min_count =  int(((100-perc)/100)*leads_modified_df.shape[1] + 1)
leads_modified_df = leads_modified_df.dropna(axis=0, thresh=min_count)
leads_modified_df.shape

<b>Note :-</b> No rows deleted.

####  Check the number of unique categories in each categorical column.

In [None]:
# Printing the share (in %) of every level of each categorical column, NaN included
columns_not_to_be_considered = ['Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit'] # target and continuous columns
column_names = leads_modified_df.columns.drop(columns_not_to_be_considered)

for column_name in column_names:
    level_shares = leads_modified_df[column_name].value_counts(normalize=True, dropna=False)*100
    print("Column Name        :", column_name)
    print("------------------------------------------")
    print(level_shares)
    print('\n')

In [None]:
# Dropping columns whose values are highly skewed towards a single level
skewed_columns_to_be_dropped = [
    'Do Not Email',
    'Do Not Call',
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
]

leads_modified_df = leads_modified_df.drop(columns=skewed_columns_to_be_dropped)
leads_modified_df.shape

<b>Note :-</b> A few columns are removed as they may skew the model.

#### For the columns with less percentage of missing, use some imputation technique.

In [None]:
# Listing down categorical columns with missing values and imputing them with the column mode
categorical_column_names = ['Lead Source', 'Last Activity', 'Country',  'City',
                'What is your current occupation', 'What matters most to you in choosing a course']

for column_name in categorical_column_names:
    print("Column Name        :", column_name)
    print("------------------------------")
    print("Unique Values      : ", leads_modified_df[column_name].unique())

    values_to_be_imputed = leads_modified_df[column_name].isnull().sum()
    print("Any Null (Before)  :", values_to_be_imputed)

    # Assign the filled column back to the frame: calling fillna(inplace=True) on
    # df[column] operates on a temporary object and leaves the frame untouched when
    # pandas Copy-on-Write is active.
    leads_modified_df[column_name] = leads_modified_df[column_name].fillna(leads_modified_df[column_name].mode()[0])
    print(values_to_be_imputed, " values imputed with mode values of the column.")

    print("Null Values (After):", leads_modified_df[column_name].isnull().sum())
    print('\n')

In [None]:
# Imputing the NaN values of 'Specialization' with the explicit level 'Unspecified'

print("Values to be imputed : ", leads_modified_df.Specialization.isnull().sum())

# Assign back instead of fillna(inplace=True) on an attribute access, which acts on a
# temporary object when pandas Copy-on-Write is active.
leads_modified_df['Specialization'] = leads_modified_df['Specialization'].fillna("Unspecified")

In [None]:
# Re-checking the share (in %) of every level of each categorical column after imputation
columns_not_to_be_considered = ['Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']
column_names = leads_modified_df.columns.drop(columns_not_to_be_considered)

for column_name in column_names:
    level_shares = leads_modified_df[column_name].value_counts(normalize=True, dropna=False)*100
    print("Column Name        :", column_name)
    print("-----------------------------------------------------")
    print(level_shares)
    print('\n')

In [None]:
# Dropping columns that are highly skewed towards a single level after imputation
skewed_columns_to_be_dropped = [
    'Country',
    'What is your current occupation',
    'What matters most to you in choosing a course',
]

leads_modified_df = leads_modified_df.drop(columns=skewed_columns_to_be_dropped)
leads_modified_df.shape

In [None]:
# Changing categories with lesser percentage to 'others'

def change_to_others(x, value_counts_df, threshold=10):
    """Return 'others' for a rare category, otherwise return `x` unchanged.

    Parameters
    ----------
    x : hashable
        A single category value.
    value_counts_df : pandas.Series
        Share of each category (in %), indexed by category value.
    threshold : int or float, default 10
        Categories whose share is strictly below this value are relabelled.

    Returns
    -------
    'others' when `x` is listed in `value_counts_df` with a share below
    `threshold`; `x` itself otherwise (including when `x` is not listed).
    """
    # Direct lookup instead of scanning every (category, share) pair
    share = value_counts_df.to_dict().get(x)
    if share is not None and share < threshold:
        return 'others'
    return x

columns_to_be_changed = [
    'Lead Origin',
    'Lead Source',
    'Last Activity',
    'Specialization',
    'City',
    'A free copy of Mastering The Interview',
    'Last Notable Activity',
]

for column_name in columns_to_be_changed:
    print("Column Name : ", column_name)
    print("-----------------------------------------")

    # Shares (in %) before merging; these decide which levels are relabelled
    value_counts_df = leads_modified_df[column_name].value_counts(normalize=True) * 100
    print("Before :")
    print(value_counts_df)
    print('\n')

    leads_modified_df[column_name] = leads_modified_df[column_name].apply(change_to_others, args=(value_counts_df,))

    # Shares (in %) after merging the rare levels
    value_counts_df = leads_modified_df[column_name].value_counts(normalize=True) * 100
    print("After :")
    print(value_counts_df)
    print('\n')

In [None]:
# Listing down continuous columns with missing values and imputing them with the column median
continuous_column_names = ['TotalVisits', 'Page Views Per Visit']

for column_name in continuous_column_names:
    print("Column Name        :", column_name)
    print("------------------------------")
    values_to_be_imputed = leads_modified_df[column_name].isnull().sum()
    print("Any Null (Before)  :", values_to_be_imputed)

    # Assign the filled column back to the frame: calling fillna(inplace=True) on
    # df[column] operates on a temporary object and leaves the frame untouched when
    # pandas Copy-on-Write is active.
    leads_modified_df[column_name] = leads_modified_df[column_name].fillna(leads_modified_df[column_name].median())
    print(values_to_be_imputed, " values imputed with median values of the column.")

    print("Null Values (After):", leads_modified_df[column_name].isnull().sum())
    print('\n')

In [None]:
# Changing the datatype of 'TotalVisits' to int
# (astype(int) requires the column to be free of NaN values at this point)
leads_modified_df['TotalVisits'] = leads_modified_df['TotalVisits'].astype(int)
leads_modified_df.head()

In [None]:
# Dropping the activity and tag columns as well
leads_modified_df = leads_modified_df.drop(columns=['Last Activity', 'Last Notable Activity', 'Tags'])
leads_modified_df.shape

In [None]:
# Comparing shape and per-column missing percentage before and after cleaning
for position, (heading, frame) in enumerate([("Before Cleaning Data", leads_df),
                                             ("After Cleaning Data", leads_modified_df)]):
    if position > 0:
        # Blank separator between the two reports
        print('\n')
    print(heading)
    print("*********************************")
    print("Shape: ", frame.shape)
    print("Missing:")
    print("-----------------------")
    print(round(frame.isnull().sum()*100/len(frame), 2))

### Data Analysis
- Analyzing a few continuous columns
- Analyzing a few categorical columns

In [None]:
# Visualizing continuous data: two stacked panels in one figure
plt.figure(figsize=(20,15))

# Top panel: number of leads for each TotalVisits value
plt.subplot(2,1,1)
plt.title("Total Visits", fontsize=25)
graph1 = sns.countplot(x='TotalVisits', data=leads_modified_df)
graph1.set(xlabel=None)

# Bottom panel: distribution of the time spent on the website
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the
# installed version still provides it, or switch to sns.histplot
plt.subplot(2,1,2)
plt.title("Total Time Spent on Website", fontsize=25)
graph2 = sns.distplot(leads_modified_df['Total Time Spent on Website'], bins=100)
graph2.set(xlabel=None)

plt.show()

In [None]:
# Visualizing categorical data: Lead Source against two continuous columns
plt.figure(figsize=(15,6))

plt.subplot(1,2,1)
plt.title("Lead Source vs Total Visits", fontsize=15)
# The axes must be bound to the same name that .set() is called on; a misspelt
# name here would make .set() act on the axes of an earlier figure instead.
graph1 = sns.boxplot(x="Lead Source", y="TotalVisits", data=leads_modified_df)
graph1.set(xlabel=None)

plt.subplot(1,2,2)
plt.title("Lead Source vs Total Time Spent on Website", fontsize=15)
graph2 = sns.boxplot(x="Lead Source", y="Total Time Spent on Website", data=leads_modified_df)
graph2.set(xlabel=None)

plt.show()

### Data Preparation
- We will create dummies for categorical columns.
- We will split data into train-test set.
- We will perform scaling.

#### Create dummies for all categorical columns.

In [None]:
# Creating dummy (indicator) columns for the categorical columns
column_names = ['Lead Source', 'Lead Origin', 'Specialization', 'City']

for column_name in column_names:
    # dtype=int keeps the indicators numeric; newer pandas versions default to bool,
    # which yields an object array once mixed with the float columns of the frame.
    dummies = pd.get_dummies(leads_modified_df[column_name], dtype=int)
    # 'others' is dropped so that it acts as the reference level of each column
    dummies = dummies.drop(columns='others')
    leads_modified_df = pd.concat([leads_modified_df, dummies], axis=1)
    leads_modified_df = leads_modified_df.drop(columns=column_name)
    print("Dummies created for: ", column_name)

leads_modified_df.head()

#### Splitting the data into train-test set

In [None]:
# Feature matrix X: every column except the response
X = leads_modified_df.drop(columns='Converted')

X.head()

In [None]:
# Response vector y: the 'Converted' column
y = leads_modified_df['Converted']

y.head()

In [None]:
# Splitting the data into train and test on a ratio of 70-30
# (random_state is fixed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

#### Scaling the continuous data

In [None]:
# Initializing the scaler and scaling the continuous columns of the train set
scaler = StandardScaler()

columns_to_be_scaled = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
# fit_transform learns the scaling parameters from the train set only
X_train[columns_to_be_scaled] = scaler.fit_transform(X_train[columns_to_be_scaled])

X_train.head()

In [None]:
# Conversion rate (in %): the mean of the 0/1 'Converted' column
converted = round(leads_modified_df['Converted'].mean() * 100, 2)
converted

In [None]:
# Let's see the correlation matrix of all columns (response included) as an annotated heatmap
plt.figure(figsize = (20,10))        # Size of the figure
sns.heatmap(leads_modified_df.corr(), annot = True, cmap="Blues")
plt.show()

### Model Building
- We will try to build our first model.
- We will use RFE to decide which features should be considered.
- We will iteratively remove columns either having high <b>p</b> or <b>VIF</b>.

#### Building our first model 

In [None]:
# Logistic regression model (binomial GLM) on all features, with a constant term added
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

#### Running RFE to check parameter significance

In [None]:
# Initializing LogisticRegression
logreg = LogisticRegression()

# Running RFE with 12 variables as output.
# n_features_to_select is passed by keyword because recent scikit-learn releases
# no longer accept it positionally.
rfe = RFE(logreg, n_features_to_select=12)
rfe = rfe.fit(X_train, y_train)

# Listing each column with its selection flag and RFE ranking
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Listing down the columns selected by RFE; `col` is the feature set used by the following models
col = X_train.columns[rfe.support_]

# Listing down the columns rejected by RFE (shown as the cell output)
X_train.columns[~rfe.support_]

<b>Note:</b> Since 13 columns remain and RFE was asked to keep 12, exactly one column is discarded, i.e. <i>Mumbai</i>.

#### Building our second model

In [None]:
# Building our second model: binomial GLM on the RFE-selected columns, with a constant term added
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set, flattened to a 1-D array
y_train_pred = res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

In [None]:
# Forming prediction table: actual label, predicted value, and the train-set index as LeadId
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Converted_Prob':y_train_pred})
y_train_pred_final['LeadId'] = y_train.index
y_train_pred_final.head()

In [None]:
# Creating new column 'Predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs predicted labels on the train set (0.5 cutoff)
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted)
print(confusion)

In [None]:
# Let's check the overall classification report on the train set.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Variance inflation factor of every feature currently in `col`, highest first
selected_features = X_train[col]
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [variance_inflation_factor(selected_features.values, i)
            for i in range(selected_features.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Dropping a column which is least impactful.
# Index.drop has no axis argument: a second positional value is bound to `errors`,
# so only the label is passed.
col = col.drop('Direct Traffic')
col

#### Building our third model

In [None]:
# Let's re-run the model (binomial GLM with a constant term) using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set, flattened to a 1-D array
y_train_pred = res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

In [None]:
# Overwriting the stored values with the current model's predictions
y_train_pred_final['Converted_Prob'] = y_train_pred

# Creating new column 'Predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs predicted labels on the train set (0.5 cutoff)
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted)
print(confusion)

In [None]:
# Let's check the overall classification report on the train set.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Variance inflation factor of every feature currently in `col`, highest first
selected_features = X_train[col]
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [variance_inflation_factor(selected_features.values, i)
            for i in range(selected_features.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Dropping a column which is least significant.
# Index.drop has no axis argument: a second positional value is bound to `errors`,
# so only the label is passed.
col = col.drop('Page Views Per Visit')
col

#### Building our fourth model

In [None]:
# Let's re-run the model (binomial GLM with a constant term) using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set, flattened to a 1-D array
y_train_pred = res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

In [None]:
# Overwriting the stored values with the current model's predictions
y_train_pred_final['Converted_Prob'] = y_train_pred

# Creating new column 'Predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs predicted labels on the train set (0.5 cutoff)
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted)
print(confusion)

In [None]:
# Let's check the overall classification report on the train set.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Variance inflation factor of every feature currently in `col`, highest first
selected_features = X_train[col]
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [variance_inflation_factor(selected_features.values, i)
            for i in range(selected_features.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Dropping one more column which is least significant.
# Index.drop has no axis argument: a second positional value is bound to `errors`,
# so only the label is passed.
col = col.drop('Finance Management')
col

#### Building our fifth model

In [None]:
# Let's re-run the model (binomial GLM with a constant term) using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm5.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set, flattened to a 1-D array
y_train_pred = res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

In [None]:
# Overwriting the stored values with the current model's predictions
y_train_pred_final['Converted_Prob'] = y_train_pred

# Creating new column 'Predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs predicted labels on the train set (0.5 cutoff)
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted)
print(confusion)

In [None]:
# Let's check the overall classification report on the train set.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Variance inflation factor of every feature currently in `col`, highest first
selected_features = X_train[col]
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [variance_inflation_factor(selected_features.values, i)
            for i in range(selected_features.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Dropping one more column which is least significant.
# Index.drop has no axis argument: a second positional value is bound to `errors`,
# so only the label is passed.
col = col.drop('Organic Search')
col

#### Building our sixth model

In [None]:
# Let's re-run the model (binomial GLM with a constant term) using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm6 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm6.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set, flattened to a 1-D array
y_train_pred = res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

In [None]:
# Overwriting the stored values with the current model's predictions
y_train_pred_final['Converted_Prob'] = y_train_pred

# Creating new column 'Predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs predicted labels on the train set (0.5 cutoff)
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted)
print(confusion)

In [None]:
# Let's check the overall classification report on the train set.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Variance inflation factor of every feature currently in `col`, highest first
selected_features = X_train[col]
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [variance_inflation_factor(selected_features.values, i)
            for i in range(selected_features.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Dropping one more column which is least significant.
# Index.drop has no axis argument: a second positional value is bound to `errors`,
# so only the label is passed.
col = col.drop('Google')
col

#### Building our final model

In [None]:
# Let's re-run the model (binomial GLM with a constant term) using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm7 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm7.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set, flattened to a 1-D array
y_train_pred = res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

In [None]:
# Overwriting the stored values with the current model's predictions
y_train_pred_final['Converted_Prob'] = y_train_pred

# Creating new column 'Predicted' with 1 if Converted_Prob > 0.5 else 0
y_train_pred_final['Predicted'] = y_train_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Confusion matrix of actual vs predicted labels on the train set (0.5 cutoff)
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted)
print(confusion)

In [None]:
# Let's check the overall classification report on the train set.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Variance inflation factor of every feature currently in `col`, highest first
selected_features = X_train[col]
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [variance_inflation_factor(selected_features.values, i)
            for i in range(selected_features.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### Model Assessment
- We will draw ROC curve.
- We will create data with different probabilities.
- We will plot a graph for 'accuracy','sensitivity' and 'specificity'.

#### Drawing ROC Curve.

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of `probs` against the true labels `actual`.

    The area under the curve is shown in the legend and a dashed diagonal is
    drawn for reference. The figure is displayed; nothing is returned.
    """
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area_under_curve = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_positive_rate, true_positive_rate,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    # Dashed diagonal from (0, 0) to (1, 1) as a reference line
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# False-positive rates, true-positive rates and thresholds of the train-set ROC curve
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Converted_Prob, drop_intermediate = False )

In [None]:
# Plotting the ROC curve of the stored train-set predictions
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Converted_Prob)

#### Creating columns with different probabilities

In [None]:
# One 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9
# (each column is named after its cutoff)
numbers = [tenth / 10 for tenth in range(10)]
for i in numbers:
    y_train_pred_final[i] = y_train_pred_final.Converted_Prob.map(lambda prob: 1 if prob > i else 0)
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity for each probability cutoff
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])

# Layout of each 2x2 confusion matrix:
#   [0,0] true negatives     [0,1] false positives
#   [1,0] false negatives    [1,1] true positives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
    (tn, fp), (fn, tp) = cm1
    total1 = cm1.sum()
    accuracy = (tn + tp) / total1

    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

#### Plotting graph for 'accuracy','sensitivity' and 'specificity'

In [None]:
# Let's plot accuracy, sensitivity and specificity against the probability cutoff.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

In [None]:
# Final train-set prediction with a 0.3 cutoff
# (presumably read off the accuracy/sensitivity/specificity plot - confirm)
y_train_pred_final['final_predicted'] = y_train_pred_final.Converted_Prob.map( lambda x: 1 if x > 0.3 else 0)

y_train_pred_final.head()

In [None]:
# Confusion matrix for the chosen 0.3 cutoff.
# 'final_predicted' holds the 0.3-cutoff labels; 'Predicted' still holds the 0.5-cutoff ones.
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted)
print(confusion)

In [None]:
# Let's check the overall classification report for the chosen 0.3 cutoff
# ('final_predicted'), not the 0.5-cutoff 'Predicted' column.
print(metrics.classification_report(y_train_pred_final.Converted, y_train_pred_final.final_predicted))

In [None]:
# Recall (sensitivity) on the train set for the chosen 0.3 cutoff ('final_predicted')
metrics.recall_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
# Precision and recall values with their thresholds on the train set
# (this rebinds `thresholds`, which previously held the ROC thresholds)
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Converted_Prob)

In [None]:
# Precision (green) and recall (red) against the decision threshold.
# p and r have one more element than thresholds, hence the [:-1] slices.
plt.plot(thresholds, p[:-1], "g-", label="Precision")
plt.plot(thresholds, r[:-1], "r-", label="Recall")
plt.xlabel("Probability threshold")
plt.legend()
plt.show()

### Model Evaluation
- We will scale test dataset.
- We will predict 'Converted' on it.
- We will read the report and check the <b>sensitivity</b>.

#### Scaling the test dataset

In [None]:
# Scaling the test set with the already-fitted scaler (transform only, no refit)
columns_to_be_scaled = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
X_test[columns_to_be_scaled] = scaler.transform(X_test[columns_to_be_scaled])
# Keeping only the columns in `col`.
# NOTE(review): this overwrites X_test, so re-running the cell fails once a scaled
# column is no longer in `col` - re-run the split first.
X_test = X_test[col]
X_test.head()

In [None]:
# Predicting the model on test dataset

In [None]:
# Adding the constant term and predicting on the test set with the last fitted model (`res`)
X_test_sm = sm.add_constant(X_test)
y_test_pred = res.predict(X_test_sm)
y_test_pred[:10]

In [None]:
# Converting the test predictions to a single-column dataframe
y_pred_1 = pd.DataFrame(y_test_pred)

# Let's see the head
y_pred_1.head()

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

# Keeping the original index as the LeadId column
y_test_df['LeadId'] = y_test_df.index

# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

# Appending y_test_df and y_pred_1 column-wise
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

y_pred_final.head()

In [None]:
# Renaming the prediction column (labelled 0) to 'Converted_Prob'
y_pred_final= y_pred_final.rename(columns={ 0 : 'Converted_Prob'})

# Rearranging the columns
y_pred_final = y_pred_final.reindex(['LeadId','Converted','Converted_Prob'], axis=1)

# Let's see the head of y_pred_final
y_pred_final.head()

In [None]:
# Final test-set prediction with a 0.3 cutoff
y_pred_final['final_predicted'] = y_pred_final.Converted_Prob.map(lambda x: 1 if x > 0.3 else 0)

y_pred_final.head()

In [None]:
# Confusion matrix of actual vs final predicted labels on the test set
metrics.confusion_matrix(y_pred_final.Converted, y_pred_final.final_predicted)

#### Checking the overall report

In [None]:
# Let's check the overall classification report on the test set.
print(metrics.classification_report(y_pred_final.Converted, y_pred_final.final_predicted))

#### <b>It seems that the model can predict with sensitivity 77%.</b>