In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 14:25:46--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-13 14:25:47 (1.04 MB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('course_lead_scoring.csv')

In [4]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [6]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [7]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [8]:
corr1 = df['interaction_count'].corr(df['lead_score'])
corr1

np.float64(0.009888182496913084)

In [9]:
corr2= df['number_of_courses_viewed'].corr(df['lead_score'])
corr2

np.float64(-0.004878998354681257)

In [10]:
corr3= df['number_of_courses_viewed'].corr(df['interaction_count'])
corr3

np.float64(-0.023565222882888117)

In [11]:
corr4= df['annual_income'].corr(df['interaction_count'])
corr4

np.float64(0.048618416552580965)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X=df.drop(columns=['converted'])
y=df['converted']

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [15]:
df_full_train.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1066,social_media,manufacturing,2,44403.0,self_employed,australia,1,0.71,0
638,events,retail,3,38048.0,student,north_america,6,0.97,1
799,social_media,education,2,71399.0,,europe,1,0.51,1
380,referral,education,2,47912.0,employed,australia,1,0.04,0
303,paid_ads,healthcare,1,34806.0,employed,europe,4,0.32,1


In [16]:
categorical = ['industry', 'location', 'lead_source', 'employment_status']
for col in categorical:
    df_full_train[col]=df_full_train[col].fillna('missing')

In [17]:
df_full_train.isnull().sum()

lead_source                   0
industry                      0
number_of_courses_viewed      0
annual_income               147
employment_status             0
location                      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [18]:
from sklearn.metrics import mutual_info_score
m1=mutual_info_score(df_full_train.converted, df_full_train.industry)
m1


0.011684562750165564

In [19]:
from sklearn.metrics import mutual_info_score
m2=mutual_info_score(df_full_train.converted, df_full_train.location)
m3=mutual_info_score(df_full_train.converted, df_full_train.lead_source)
m4=mutual_info_score(df_full_train.converted, df_full_train.employment_status)
m1,m2,m3,m4

(0.011684562750165564,
 0.0022530354195563346,
 0.025665373935054955,
 0.013258496589914293)

In [20]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
dv = DictVectorizer(sparse=False)

numerical=[col for col in df_train.columns if col not in categorical]
numerical


['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [21]:
for col in categorical:
    df_train[col]=df_train[col].fillna('missing')
    df_test[col]=df_test[col].fillna('missing')
    df_val[col]=df_val[col].fillna('missing')
df_train.isnull().sum()


lead_source                   0
industry                      0
number_of_courses_viewed      0
annual_income               110
employment_status             0
location                      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [22]:
for col in numerical:
    df_train[col]=df_train[col].fillna(0)
    df_test[col]=df_test[col].fillna(0)
    df_val[col]=df_val[col].fillna(0)
df_train.isnull().sum()


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [23]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [24]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']


In [44]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [45]:
from sklearn.metrics import accuracy_score
y_train_pred=model.predict(X_train)
y_train_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,

In [46]:
train_acc = accuracy_score(y_train, y_train_pred)
train_acc

0.8036529680365296

In [47]:
val_acc = accuracy_score(y_val, y_val_pred)
val_acc

0.757679180887372

In [48]:
y_val_pred=model.predict(X_val)
y_val_pred

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0])

In [30]:
model = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [31]:
val_acc = accuracy_score(y_val, y_val_pred)
val_acc

0.757679180887372

In [32]:
model = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_val_pred=model.predict(X_val)
y_val_pred
val_acc = accuracy_score(y_val, y_val_pred)
val_acc

0.757679180887372

In [33]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_val_pred=model.predict(X_val)
y_val_pred
val_acc = accuracy_score(y_val, y_val_pred)
val_acc

0.757679180887372

In [34]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [35]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [51]:
c=[0.01,0.1,1,10,100]
for i in reversed(c):
    model = LogisticRegression(solver='liblinear', C=i, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_val_pred=model.predict(X_val)
    y_val_pred
    val_acc = accuracy_score(y_val, y_val_pred)
    val_acc
    print(f'{i}:{val_acc}')
    

100:0.757679180887372
10:0.757679180887372
1:0.757679180887372
0.1:0.757679180887372
0.01:0.7679180887372014


In [52]:
c=[0.01,0.1,1,10,100]
for i in c:
    model = LogisticRegression(solver='liblinear', C=i, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_val_pred=model.predict(X_val)
    y_val_pred
    val_acc = accuracy_score(y_val, y_val_pred)
    val_acc
    print(f'{i}:{val_acc}')

0.01:0.7679180887372014
0.1:0.757679180887372
1:0.757679180887372
10:0.757679180887372
100:0.757679180887372


In [70]:
features = ['industry', 'employment_status', 'lead_score']

train_dict = df_train[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)
acc_full = accuracy_score(y_val, y_val_pred)
print("Full model accuracy:", acc_full)

results = {}
for f in features:
    mask = [not name.startswith(f + '=') for name in dv.get_feature_names_out()]
    X_train_new = X_train[:, mask]
    X_val_new = X_val[:, mask]
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_new, y_train)
    y_pred = model.predict(X_val_new)
    
    acc = accuracy_score(y_val, y_pred)
    diff = acc_full - acc
    results[f] = diff
    print(f"Without {f}: accuracy={acc:.3f}, diff={diff:.4f}")


Full model accuracy: 0.6348122866894198
Without industry: accuracy=0.614, diff=0.0205
Without employment_status: accuracy=0.652, diff=-0.0171
Without lead_score: accuracy=0.635, diff=0.0000


In [67]:
features = ['industry', 'employment_status', 'lead_score']
for col in features:
    print(df_train[col].isnull().sum())
    

0
0
0


In [61]:
for col in features:
    df_train[col]=df_train[col].fillna('missing')
for col in features:
    print(df[col].isnull().sum())


0
0
0
