In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.tail()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


In [5]:
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


<b> DATA PREPARATION <br>

Check whether missing values are present in the features.<br>
If there are missing values:<br>


In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

The numeric columns are number_of_courses_viewed, annual_income, interaction_count, lead_score, and the target converted.

<b> For categorical features, replace missing values with 'NA'.<br>
For numerical features, replace missing values with 0.0.</b><br>

In [8]:
# Normalise column names to lower snake_case.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Split the columns by kind. Numeric dtypes are selected explicitly (instead of
# comparing against 'object') so that text columns stay categorical even when
# pandas stores them with a dedicated string dtype rather than object.
numerical_columns = list(df.select_dtypes(include='number').columns)
categorical_columns = [c for c in df.columns if c not in numerical_columns]

# Categorical features: normalise the values, then mark missing entries as 'NA'.
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
    df[c] = df[c].fillna('NA')

# Numerical features (the list still contains the target `converted`):
# missing entries become 0.
for n in numerical_columns:
    df[n] = df[n].fillna(0)

In [13]:
numerical_columns

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [14]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [15]:
df.nunique()

lead_source                    6
industry                       8
number_of_courses_viewed      10
annual_income               1268
employment_status              5
location                       8
interaction_count             12
lead_score                   101
converted                      2
dtype: int64

<b> Question 1 <br>
What is the most frequent observation (mode) for the column industry?<br>
Ans: retail </b>

In [16]:
# Row count per industry value; the largest count identifies the mode.
industry_groups = df.groupby('industry')
df_grouped = industry_groups['industry'].count()

df_grouped

industry
NA               134
education        187
finance          200
healthcare       187
manufacturing    174
other            198
retail           203
technology       179
Name: industry, dtype: int64

<b> Question 2 <br> 
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features. <br>
What are the two features that have the biggest correlation? <br>
interaction_count and lead_score <br>
number_of_courses_viewed and lead_score <br>
number_of_courses_viewed and interaction_count <br>
annual_income and interaction_count <br>
Only consider the pairs above when answering this question. <br>

<b> Ans: annual_income and interaction_count = 0.027036 </b>

In [17]:
# For every numeric column, print its absolute correlation with each of the
# other numeric columns, strongest first.
numeric_frame = df[numerical_columns]
for target in numerical_columns:
    print('Correlation with', target)
    others = numeric_frame.drop(columns=target)  # every numeric column except the target
    corr_values = others.corrwith(df[target]).abs().sort_values(ascending=False)
    print(corr_values)


Correlation with number_of_courses_viewed
converted            0.435914
interaction_count    0.023565
annual_income        0.009770
lead_score           0.004879
dtype: float64
Correlation with annual_income
converted                   0.053131
interaction_count           0.027036
lead_score                  0.015610
number_of_courses_viewed    0.009770
dtype: float64
Correlation with interaction_count
converted                   0.374573
annual_income               0.027036
number_of_courses_viewed    0.023565
lead_score                  0.009888
dtype: float64
Correlation with lead_score
converted                   0.193673
annual_income               0.015610
interaction_count           0.009888
number_of_courses_viewed    0.004879
dtype: float64
Correlation with converted
number_of_courses_viewed    0.435914
interaction_count           0.374573
lead_score                  0.193673
annual_income               0.053131
dtype: float64


<b> annual_income and interaction_count = 0.027036 <br>
number_of_courses_viewed and lead_score = 0.004879 <br>
number_of_courses_viewed and interaction_count = 0.023565 <br>
interaction_count and lead_score = 0.009888 </b><br>

<b> Split the data <br> 
Split your data in train/val/test sets with 60%/20%/20% distribution.<br> 
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.<br> 
Make sure that the target value converted is not in your dataframe.<br> 
Question 3 <br> 
Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only. <br> 
Round the scores to 2 decimals using round(score, 2).<br> 
Which of these variables has the biggest mutual information score?<br> 
industry<br> 
location<br> 
lead_source <br> 
employment_status<br> 
Ans: lead_source 

In [18]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Sanity check: the three parts together account for every row.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Target vectors are extracted before the column is removed from the frames.
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

# Training frame that still contains `converted`; kept separately because the
# target is deleted from df_train/df_val/df_test right below.
df_train_copy = df_train.copy()
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [19]:
from sklearn.metrics import mutual_info_score


In [20]:
def mutual_info_converted_score(series):
    """Return the mutual information between `series` and the training target.

    The target is read from the module-level `df_train_copy.converted`, so
    `series` is expected to be a column of that same frame.
    """
    target = df_train_copy.converted
    return mutual_info_score(series, target)

In [21]:
mi = df_train_copy[categorical_columns].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.035396
employment_status    0.012938
industry             0.011575
location             0.004464
dtype: float64

<b> Question 4 <br>
Now let's train a logistic regression.<br>
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.<br>
Fit the model on the training dataset. <br>
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:<br>
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42) <br>
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.<br>
What accuracy did you get? <br>

0.64 <br>
0.74 <br>
0.84 <br>
0.94 <br>

<b> Ans: 0.7 which is closest to 0.74 <br>

In [22]:
from sklearn.feature_extraction import DictVectorizer

numeric_cols = [c for c in numerical_columns if c != 'converted']

In [23]:
# One-hot encode the categorical columns (numeric ones pass through unchanged).
# The vectorizer is fitted on the training rows only and then reused for validation.
feature_columns = categorical_columns + numeric_cols
dv = DictVectorizer(sparse=False)

train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [24]:
from sklearn.linear_model import LogisticRegression



In [25]:
# The solver and the other hyper-parameters are pinned explicitly (liblinear,
# C=1.0, fixed seed) so the result does not depend on scikit-learn's defaults.
lr_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**lr_params)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [26]:
model.intercept_[0]


np.float64(-0.0691472802783609)

In [27]:
#model.coef_[0].round(3)


In [28]:
y_pred = model.predict_proba(X_val)[:, 1]


In [29]:

converted_decision = (y_pred >= 0.5)

(y_val == converted_decision).mean()

np.float64(0.6996587030716723)

In [30]:
round((y_val == converted_decision).mean(),2)


np.float64(0.7)

<b> Question 5 <br>
Let's find the least useful feature using the feature elimination technique.  <br>
Train a model using the same features and parameters as in Q4 (without rounding).  <br>
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.  <br>
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.  <br>
Which of following feature has the smallest difference?  <br>

'industry'  <br>
'employment_status'  <br>
'lead_score'  <br>
Note: The difference doesn't have to be positive. <br>
<b> Ans: industry

<b> Find Original Accuracy

In [31]:
from sklearn.linear_model import LogisticRegression

# Refit the baseline model (same features and parameters as in Q4) so this
# section starts from a known state. The solver is pinned to liblinear explicitly.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Only the last expression of a cell is displayed, so show the coefficients directly.
model.coef_[0]



array([-1.77843867e-05, -1.47154423e-02,  3.39095225e-02,  2.66248432e-03,
        1.15238518e-02, -1.02527697e-01, -2.48510995e-02,  4.93604222e-02,
       -2.01258344e-02, -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
       -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,  5.12012528e-02,
        2.01511698e-02, -1.20346284e-02, -1.16021521e-02, -1.15251880e-01,
        7.95303436e-02, -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
       -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,  5.58598769e-03,
       -3.33967159e-02, -2.52837052e-02,  4.53752887e-01])

In [32]:
y_pred = model.predict_proba(X_val)[:, 1]


In [33]:
converted_decision = (y_pred >= 0.5)


In [34]:
original_accuracy = (y_val == converted_decision).mean()
print(original_accuracy)

0.6996587030716723


Original accuracy ≈ 0.6997 (0.70 when rounded to 2 decimals)

In [35]:
df_train.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
1,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
2,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
3,,technology,1,74956.0,employed,europe,3,0.34
4,organic_search,retail,3,59335.0,student,australia,1,0.98


In [36]:
all_columns = categorical_columns + numeric_cols

def feature_importance(df_train, df_val, y_train, y_val, model, all_columns, original_accuracy):
    """Find the feature whose removal changes validation accuracy the least.

    For every column in `all_columns`, a copy of `model` is trained on the
    remaining columns (one-hot encoded with a fresh DictVectorizer) and scored
    on the validation set with a 0.5 probability threshold. The accuracy and
    its difference from `original_accuracy` are printed for each column.

    Parameters
    ----------
    df_train, df_val : DataFrames holding at least the columns in `all_columns`.
    y_train, y_val : target arrays aligned with `df_train` / `df_val`.
    model : estimator providing `fit` and `predict_proba`; it is copied for
        every run, so the caller's (possibly fitted) instance is left untouched.
    all_columns : feature names to eliminate one at a time.
    original_accuracy : validation accuracy of the model using all features.

    Returns
    -------
    The name of the column with the smallest absolute accuracy difference
    (the first one in case of ties), or None when `all_columns` is empty.
    """
    import copy  # stdlib; local import keeps this cell self-contained

    least_imp_feature = None
    smallest_impact = float('inf')
    for n in all_columns:
        cols = [c for c in all_columns if c != n]  # every feature except n

        dv = DictVectorizer(sparse=False)
        X_train = dv.fit_transform(df_train[cols].to_dict(orient='records'))
        X_val = dv.transform(df_val[cols].to_dict(orient='records'))

        # Train a copy so the estimator passed in by the caller is not refit
        # on a reduced feature set as a side effect.
        reduced_model = copy.deepcopy(model)
        reduced_model.fit(X_train, y_train)

        y_pred = reduced_model.predict_proba(X_val)[:, 1]
        converted_decision = (y_pred >= 0.5)
        accuracy = (y_val == converted_decision).mean()
        print('Accuracy after eliminating', n , 'is ', accuracy)
        print('Difference in accuracy is', accuracy - original_accuracy)

        # The sign of the difference is irrelevant: only its size is compared.
        impact = abs(accuracy - original_accuracy)
        if impact < smallest_impact:
            smallest_impact = impact
            least_imp_feature = n

    return least_imp_feature

In [37]:
print('Least important feature is ', feature_importance(df_train, df_val, y_train, y_val, model, all_columns, original_accuracy))

Accuracy after eliminating lead_source is  0.7030716723549488
Difference in accuracy is 0.0034129692832765013
Accuracy after eliminating industry is  0.6996587030716723
Difference in accuracy is 0.0
Accuracy after eliminating employment_status is  0.6962457337883959
Difference in accuracy is -0.0034129692832763903
Accuracy after eliminating location is  0.7098976109215017
Difference in accuracy is 0.010238907849829393
Accuracy after eliminating number_of_courses_viewed is  0.5563139931740614
Difference in accuracy is -0.14334470989761094
Accuracy after eliminating annual_income is  0.8532423208191127
Difference in accuracy is 0.15358361774744034
Accuracy after eliminating interaction_count is  0.5563139931740614
Difference in accuracy is -0.14334470989761094
Accuracy after eliminating lead_score is  0.7064846416382252
Difference in accuracy is 0.0068259385665528916
Least important feature is  industry


<b> Least important feature is  industry

<b> Question 6 <br>
Now let's train a regularized logistic regression. <br>
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100]. <br>
Train models using all the features as in Q4. <br>
Calculate the accuracy on the validation dataset and round it to 3 decimal digits. <br>
Which of these C leads to the best accuracy on the validation set? <br>
Ans: 0.01 — every value of C gives the same validation accuracy (0.7), so the smallest C is chosen.

In [53]:
def regularised_logistic_reg(df_train, df_val, y_train, y_val, all_columns, original_accuracy,
                             c_values=(0.01, 0.1, 1, 10, 100)):
    """Train one logistic regression per regularisation strength and print its accuracy.

    The features in `all_columns` are one-hot encoded once (vectorizer fitted on
    the training frame only). For each value of C a liblinear model is fitted
    and its validation accuracy, using a 0.5 probability threshold and rounded
    to 3 decimals, is printed.

    Parameters
    ----------
    df_train, df_val : DataFrames holding at least the columns in `all_columns`.
    y_train, y_val : target arrays aligned with `df_train` / `df_val`.
    all_columns : feature names used for training.
    original_accuracy : not used; kept so existing calls keep working.
    c_values : iterable of C values to try, in the order they are reported.

    Returns
    -------
    None. The results are only printed.
    """
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train[all_columns].to_dict(orient='records'))
    X_val = dv.transform(df_val[all_columns].to_dict(orient='records'))

    for reg in c_values:
        # Same solver/seed for every run so that only C varies between models.
        model = LogisticRegression(solver='liblinear', C=reg, max_iter=1000, random_state=42)
        model.fit(X_train, y_train)  # a single fit per C is sufficient

        y_pred = model.predict_proba(X_val)[:, 1]
        converted_decision = (y_pred >= 0.5)
        accuracy = (y_val == converted_decision).mean()
        print('Accuracy for C =', reg , 'is ', round(accuracy,3))
        

In [54]:
regularised_logistic_reg(df_train, df_val, y_train, y_val, all_columns, original_accuracy)

Accuracy for C = 0.01 is  0.7
Accuracy for C = 0.1 is  0.7
Accuracy for C = 1 is  0.7
Accuracy for C = 10 is  0.7
Accuracy for C = 100 is  0.7
