In [32]:
import pandas as pd
import numpy as np

In [33]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

In [34]:
df = pd.read_csv('course_lead_scoring.csv')

In [35]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [36]:
df_data = df.copy()

In [37]:
df_data.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [38]:
df_data.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [39]:
categorical_columns = list(df_data.dtypes[df_data.dtypes == 'object'].index)
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [40]:
numerical_columns = ['number_of_courses_viewed', 'annual_income', 'interaction_count']
numerical_columns

['number_of_courses_viewed', 'annual_income', 'interaction_count']

In [41]:
df_data[categorical_columns] = df_data[categorical_columns].fillna('NA')
df_data[numerical_columns] = df_data[numerical_columns].fillna(0)
df_data.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [68]:
# Question 1
df_data.industry.mode()

0    retail
Name: industry, dtype: object

In [43]:
#Question 2
print(f"The correlation coefficient between interaction_count and lead_score is: {float(df_data['interaction_count'].corr(df_data['lead_score']).round(3))}")
print(f"The correlation coefficient between number_of_courses_viewed and lead_score is: {float(df_data['number_of_courses_viewed'].corr(df_data['lead_score']).round(3))}")
print(f"The correlation coefficient between number_of_courses_viewed and interaction_count is: {float(df_data['number_of_courses_viewed'].corr(df_data['interaction_count']).round(3))}")
print(f"The correlation coefficient between annual_income and interaction_count is: {float(df_data['annual_income'].corr(df_data['interaction_count']).round(3))}")

The correlation coefficient between interaction_count and lead_score is: 0.01
The correlation coefficient between number_of_courses_viewed and lead_score is: -0.005
The correlation coefficient between number_of_courses_viewed and interaction_count is: -0.024
The correlation coefficient between annual_income and interaction_count is: 0.027


In [44]:
# Split the dataset
from sklearn.model_selection import train_test_split

In [45]:
df_full_train, df_test = train_test_split(df_data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)


# df_train, df_temp = train_test_split(df_data, test_size=0.4, random_state=42)
# df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
# len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [46]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [47]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [48]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [49]:
# Question 3
from sklearn.metrics import mutual_info_score

In [50]:
# print(round(mutual_info_score(df_train['lead_source'], y_train),2))
# print(round(mutual_info_score(df_train['industry'], y_train),2))
# print(round(mutual_info_score(df_train['employment_status'], y_train),2))
# print(round(mutual_info_score(df_train['location'], y_train),2))

In [51]:
def mutual_info_churn_score(series):
    return round(mutual_info_score(series, y_train),2)

In [52]:
mi = df_train[categorical_columns].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [53]:
# Question 4
from sklearn.feature_extraction import DictVectorizer

In [54]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [55]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [58]:
model.intercept_[0]

np.float64(-0.06891397780582399)

In [59]:
model.coef_[0].round(3)

array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.05 ,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.313,  0.02 ,
       -0.012, -0.012, -0.116,  0.08 , -0.03 ,  0.004, -0.011, -0.011,
       -0.006,  0.008,  0.006, -0.033, -0.025,  0.456])

In [60]:
y_pred = model.predict_proba(X_val)[:,1]
converted_decision = (y_pred >= 0.5)
(y_val == converted_decision).mean().round(2)

np.float64(0.71)

In [61]:
#Question 5
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
converted_decision = (y_pred >= 0.5)
accuracy_with_full_features = (y_val == converted_decision).mean()
print(f"original accuracy with all features: {accuracy_with_full_features}")

df_full_train = df_train[categorical_columns + numerical_columns]
df_full_val = df_val[categorical_columns + numerical_columns]
results = []
for feature in df_full_train.columns:
    df_train_reduced = df_full_train.drop(columns=[feature])
    df_val_reduced = df_full_val.drop(columns=[feature])

    train_dict = df_train_reduced.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val_reduced.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    converted_decision = (y_pred >= 0.5)
    accuracy_with_out_full_features = (y_val == converted_decision).mean()
    accuracy_difference = accuracy_with_full_features - accuracy_with_out_full_features
    print(f"Actual Accuracy with out {feature} {accuracy_with_out_full_features} and difference is: {accuracy_difference} ")
#     results.append({
#     "Feature_removed": feature,
#     "Accuracy_without_feature": accuracy_with_out_full_features,
#     "Accuracy_drop": accuracy_difference
# })

# # Convert results to a DataFrame for clear viewing
# df_results = pd.DataFrame(results).sort_values("Accuracy_drop", ascending=False)
# print(df_results)

original accuracy with all features: 0.7064846416382252
Actual Accuracy with out lead_source 0.6996587030716723 and difference is: 0.0068259385665528916 
Actual Accuracy with out industry 0.7030716723549488 and difference is: 0.0034129692832763903 
Actual Accuracy with out employment_status 0.6928327645051194 and difference is: 0.013651877133105783 
Actual Accuracy with out location 0.7064846416382252 and difference is: 0.0 
Actual Accuracy with out number_of_courses_viewed 0.5597269624573379 and difference is: 0.14675767918088733 
Actual Accuracy with out annual_income 0.8225255972696246 and difference is: -0.11604095563139938 
Actual Accuracy with out interaction_count 0.5597269624573379 and difference is: 0.14675767918088733 


In [63]:
#Question 6
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    converted_decision = (y_pred >= 0.5)
    accuracy = (y_val == converted_decision).mean()
    print(f"Accuracy {C}: {accuracy}")

Accuracy 0.01: 0.6962457337883959
Accuracy 0.1: 0.6996587030716723
Accuracy 1: 0.7064846416382252
Accuracy 10: 0.7064846416382252
Accuracy 100: 0.7064846416382252
