In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Raw CSV of the lead-scoring dataset; read directly from GitHub, so loading it needs network access.
link = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

In [3]:
# Load the whole dataset into memory with pandas' default parsing options.
df = pd.read_csv(link)

In [4]:
# (rows, columns) of the freshly loaded table.
df.shape

(1462, 9)

In [5]:
# Dtypes and non-null counts per column; counts below the row total flag columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [6]:
# Missing values per column, before any imputation.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [8]:
# Columns stored with the object dtype are treated as the categorical features.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']
categorical_features

['lead_source', 'industry', 'employment_status', 'location']

In [9]:
# For every categorical column print its name, its distinct values (NaN included)
# and the number of distinct non-null values, followed by a blank line.
for col in categorical_features:
    column = df[col]
    print(col, column.unique(), column.nunique(), '', sep='\n')

lead_source
['paid_ads' 'social_media' 'events' 'referral' 'organic_search' nan]
5

industry
[nan 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']
7

employment_status
['unemployed' 'employed' nan 'self_employed' 'student']
4

location
['south_america' 'australia' 'europe' 'africa' 'middle_east' nan
 'north_america' 'asia']
7



In [10]:
# Missing categories become the explicit string label 'NA' so they form their own category.
df[categorical_features] = df[categorical_features].fillna(value='NA')

In [11]:
# annual_income is the only numeric column with gaps; missing incomes are imputed with 0.0.
df['annual_income'] = df['annual_income'].fillna(0.0)

In [12]:
# Sanity check after imputation: every column should now report zero missing values.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

##

## 1. Most frequent observation / mode

In [13]:
# Most frequent industry value (the 'NA' placeholder competes like any other label).
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [14]:
# Frequency of each industry label, largest first, to cross-check the mode.
df['industry'].value_counts().sort_values(ascending=False)

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

#### Sol 1: The most frequent observation (mode) for the column industry is RETAIL with 203 observations

##

## 2. Correlation Matrix

In [15]:
# Every column whose dtype is not object, i.e. the numeric columns including the target.
[name for name, dtype in df.dtypes.items() if dtype != object]

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [16]:
# Numeric model inputs; 'converted' is left out because it is used as the target later on.
numeric_features = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [17]:
# Pairwise correlation between the numeric features, rounded to four decimals.
Corr_Matrix = df[numeric_features].corr().round(4)
Corr_Matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.0098,-0.0236,-0.0049
annual_income,0.0098,1.0,0.027,0.0156
interaction_count,-0.0236,0.027,1.0,0.0099
lead_score,-0.0049,0.0156,0.0099,1.0


#### Sol 2: annual_income and interaction_count have the biggest correlation, with a coefficient of 0.0270

##

## 3. Mutual Information Score

In [18]:
from sklearn.model_selection import train_test_split


In [19]:
# Two-stage split with a fixed seed: 20% of the rows are held out for test, then 25% of
# the remaining 80% (i.e. 20% of the total) becomes the validation set.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [20]:
# Row counts of the train / validation / test partitions.
tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [21]:
# The split keeps the shuffled original row labels; replace them with a clean 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [22]:
# Separate the target from the features: pop() returns the 'converted' column and removes
# it from the frame in the same step, so the label cannot be used as a model input.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
# Show both feature lists, separated by a blank line.
print(categorical_features, numeric_features, sep='\n\n')

['lead_source', 'industry', 'employment_status', 'location']

['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [25]:
# Mutual information between the target and each categorical column, rounded to two
# decimals. Scores are computed on df_full_train (the train + validation rows), which
# still carries the 'converted' column.
for col in categorical_features:
    score = mutual_info_score(df_full_train['converted'], df_full_train[col])
    print(col, round(score, 2), '', sep='\n')

lead_source
0.03

industry
0.01

employment_status
0.01

location
0.0



#### Sol 3: The categorical variable with the biggest mutual information score is lead_source with value 0.03

##

## 4. Training Logistic Regression

In [26]:
from sklearn.feature_extraction import DictVectorizer


In [27]:
# One-hot encode the categorical columns and pass the numeric ones through unchanged.
dv = DictVectorizer(sparse=False)

# Row-wise dicts are the input format DictVectorizer expects.
train_dict = df_train[categorical_features + numeric_features].to_dict(orient='records')
val_dict = df_val[categorical_features + numeric_features].to_dict(orient='records')

# The vocabulary is learned from the training rows only and then reused for validation.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
# Baseline model on all features: liblinear solver, C=1.0, fixed seed.
model = LogisticRegression(C=1.0, solver='liblinear', random_state=42, max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [30]:
# Bias term of the fitted model (intercept_ is an array; take its single entry).
model.intercept_[0]

np.float64(-0.06914728027824993)

In [31]:
# One weight per vectorized feature column, rounded to three decimals for readability.
np.round(model.coef_[0], 3)

array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.049,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.311,  0.051,
        0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
       -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])

In [32]:
# Keep only the second predict_proba column — presumably P(converted = 1), given the 0/1 target.
y_pred = model.predict_proba(X_val)[:, 1]

In [33]:
# Hard decision: predict "converted" when the predicted probability reaches 0.5.
converted_decision = y_pred >= 0.5

In [34]:
# Validation accuracy: share of rows where the thresholded prediction matches the label.
np.round(np.mean(converted_decision == y_val), 2)


np.float64(0.7)

In [35]:
# Side-by-side table of predicted probability, hard prediction and true label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': converted_decision.astype(int),
    'actual': y_val,
})

In [36]:
# Flag the rows where the hard prediction agrees with the true label.
df_pred['correct'] = df_pred['prediction'].eq(df_pred['actual'])

In [37]:
# Inspect the per-row predictions (pandas truncates the display of long frames).
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.611922,1,0,False
1,0.799826,1,1,True
2,0.530213,1,0,False
3,0.471315,0,0,True
4,0.570661,1,0,False
...,...,...,...,...
288,0.419342,0,0,True
289,0.710539,1,1,True
290,0.418185,0,0,True
291,0.744835,1,1,True


In [38]:
# Unrounded accuracy of the full model; kept as the baseline for the feature-elimination comparison.
val_accuracy = df_pred['correct'].mean()
val_accuracy

np.float64(0.6996587030716723)

In [39]:
# Same accuracy, rounded to two decimals for reporting.
df_pred['correct'].mean().round(2)

np.float64(0.7)

#### Sol 4: Trained a logistic regression model using one-hot encoding of the categorical variables
#### The accuracy on the validation set is 0.6997, which rounds to 0.70; the closest value is 0.74

##

## 5. Feature Elimination

In [40]:
# Full feature list: categorical columns first, then the numeric ones.
all_features = [*categorical_features, *numeric_features]
all_features

['lead_source',
 'industry',
 'employment_status',
 'location',
 'number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [41]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from itertools import combinations


def _subset_val_accuracy(feature_subset):
    """Return the validation accuracy of a logistic regression trained on `feature_subset`.

    Reads the notebook globals `df_train`, `df_val`, `y_train` and `y_val`.
    Everything fitted here (vectorizer, matrices, model) stays local to the
    function, so the full-model `dv`, `X_train`, `X_val`, `model` and `df_pred`
    built in the earlier cells are not overwritten by the leave-one-out runs.

    Parameters
    ----------
    feature_subset : iterable of str
        Column names, present in both `df_train` and `df_val`, to train on.

    Returns
    -------
    Fraction of validation rows whose thresholded (>= 0.5) prediction equals `y_val`.
    """
    subset_cols = list(feature_subset)

    # A fresh vectorizer per subset: the one-hot vocabulary depends on the columns kept.
    subset_dv = DictVectorizer(sparse=False)
    X_subset_train = subset_dv.fit_transform(df_train[subset_cols].to_dict(orient='records'))
    X_subset_val = subset_dv.transform(df_val[subset_cols].to_dict(orient='records'))

    # Same hyper-parameters as the full model so that only the feature set differs.
    subset_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    subset_model.fit(X_subset_train, y_train)

    subset_decision = subset_model.predict_proba(X_subset_val)[:, 1] >= 0.5
    return (subset_decision == y_val).mean()


features = all_features

results = []

# Leave-one-out: every combination of len(features) - 1 columns drops exactly one feature.
for combo in combinations(features, len(features) - 1):
    excluded = [f for f in features if f not in combo]
    new_accuracy = _subset_val_accuracy(combo)

    print('Combo: ', combo)
    print('Excluded: ',excluded)
    print('Accuracy of Model without: ', excluded, 'is: ', new_accuracy)
    print('Accuracy of Full Model: ',val_accuracy)
    print('Absolute Diff: ',abs(new_accuracy-val_accuracy))
    print()

    # Positive Diff: dropping the feature hurt accuracy; negative Diff: it helped.
    results.append({
        'Col_Excluded': excluded,
        'Without_Col_Accuracy': new_accuracy,
        'Full_Model_Accuracy': val_accuracy,
        'Diff': val_accuracy-new_accuracy,
        'Abs_Diff': abs(new_accuracy-val_accuracy),
    })


Combo:  ('lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count')
Excluded:  ['lead_score']
Accuracy of Model without:  ['lead_score'] is:  0.7064846416382252
Accuracy of Full Model:  0.6996587030716723
Absolute Diff:  0.0068259385665528916

Combo:  ('lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'lead_score')
Excluded:  ['interaction_count']
Accuracy of Model without:  ['interaction_count'] is:  0.5563139931740614
Accuracy of Full Model:  0.6996587030716723
Absolute Diff:  0.14334470989761094

Combo:  ('lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'interaction_count', 'lead_score')
Excluded:  ['annual_income']
Accuracy of Model without:  ['annual_income'] is:  0.8532423208191127
Accuracy of Full Model:  0.6996587030716723
Absolute Diff:  0.15358361774744034

Combo:  ('lead_source', 'industry', 'employment_status', 'loc

In [42]:
# Turn the list of per-feature records into a table; note that this rebinds `results`
# from a list to a DataFrame.
results = pd.DataFrame(results)
results


Unnamed: 0,Col_Excluded,Without_Col_Accuracy,Full_Model_Accuracy,Diff,Abs_Diff
0,[lead_score],0.706485,0.699659,-0.006826,0.006826
1,[interaction_count],0.556314,0.699659,0.143345,0.143345
2,[annual_income],0.853242,0.699659,-0.153584,0.153584
3,[number_of_courses_viewed],0.556314,0.699659,0.143345,0.143345
4,[location],0.709898,0.699659,-0.010239,0.010239
5,[employment_status],0.696246,0.699659,0.003413,0.003413
6,[industry],0.699659,0.699659,0.0,0.0
7,[lead_source],0.703072,0.699659,-0.003413,0.003413


In [43]:
# Smallest absolute accuracy change first: the features the model misses least.
results.sort_values('Abs_Diff')

Unnamed: 0,Col_Excluded,Without_Col_Accuracy,Full_Model_Accuracy,Diff,Abs_Diff
6,[industry],0.699659,0.699659,0.0,0.0
5,[employment_status],0.696246,0.699659,0.003413,0.003413
7,[lead_source],0.703072,0.699659,-0.003413,0.003413
0,[lead_score],0.706485,0.699659,-0.006826,0.006826
4,[location],0.709898,0.699659,-0.010239,0.010239
1,[interaction_count],0.556314,0.699659,0.143345,0.143345
3,[number_of_courses_viewed],0.556314,0.699659,0.143345,0.143345
2,[annual_income],0.853242,0.699659,-0.153584,0.153584


#### Sol 5: Among the features ['industry','employment_status','lead_score'],
####  INDUSTRY has the smallest difference: the model trained without it has exactly the same validation accuracy as the full model (difference of 0.0)


##



## 6. Regularized Logistic Regression

In [44]:
# Features used for the regularization experiment (categorical + numeric).
all_features

['lead_source',
 'industry',
 'employment_status',
 'location',
 'number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [45]:
from sklearn.feature_extraction import DictVectorizer


In [46]:
# Rebuild the design matrices on the full feature list before sweeping C.
dv = DictVectorizer(sparse=False)

# Convert both partitions to row-wise dicts first ...
train_dict = df_train[all_features].to_dict(orient='records')
val_dict = df_val[all_features].to_dict(orient='records')

# ... then learn the encoding on the training rows and apply it to the validation rows.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
# Candidate values for LogisticRegression's C, passed as C=p in the sweep below.
parameter_test = [0.01, 0.1, 1, 10, 100]

In [49]:
# Refit the model for every candidate C and report the validation accuracy (3 decimals).
for p in parameter_test:
    print('Value of parameter is: ',p)

    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = LogisticRegression(solver='liblinear', C=p, max_iter=1000, random_state=42).fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = y_pred >= 0.5

    # Per-row comparison of the hard prediction against the true label.
    df_pred = pd.DataFrame({
        'probability': y_pred,
        'prediction': converted_decision.astype(int),
        'actual': y_val,
    })
    df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

    new_accuracy = df_pred['correct'].mean().round(3)

    print('Value of parameter is: ', p ,'Accuracy: ', new_accuracy )
    print()


Value of parameter is:  0.01
Value of parameter is:  0.01 Accuracy:  0.7

Value of parameter is:  0.1
Value of parameter is:  0.1 Accuracy:  0.7

Value of parameter is:  1
Value of parameter is:  1 Accuracy:  0.7

Value of parameter is:  10
Value of parameter is:  10 Accuracy:  0.7

Value of parameter is:  100
Value of parameter is:  100 Accuracy:  0.7



#### Sol 6: All of the tested parameter values yield the same validation accuracy (0.7), so the answer is the smallest of them, 0.01


####