In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler

In [3]:
# URL of the course lead-scoring CSV; the IPython shell escape below fetches it with wget
# (the recorded output shows it being saved as 'course_lead_scoring.csv').
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
!wget $data

--2025-10-14 15:36:32--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-14 15:36:32 (42.2 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [2]:
# Read the downloaded CSV, then normalise every header to lower case with
# underscores in place of spaces.
df = pd.read_csv('course_lead_scoring.csv')
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [18]:
# Categorical columns are those with object/category dtype.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
# Missing categorical values are replaced by the literal label 'NA'.
cat_filled = df[cat_cols].fillna('NA')
df[cat_cols] = cat_filled
cat_filled.head()

Unnamed: 0,lead_source,industry,employment_status,location
0,paid_ads,,unemployed,south_america
1,social_media,retail,employed,south_america
2,events,healthcare,unemployed,australia
3,paid_ads,retail,,australia
4,referral,education,self_employed,europe


In [17]:
# Numerical columns are every column with a numeric dtype.
num_cols = df.select_dtypes(include=['number']).columns
# Missing numerical values are replaced by 0.0.
num_filled = df[num_cols].fillna(0.0)
df[num_cols] = num_filled
num_filled.head()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.8,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1


In [5]:
# Most frequent value(s) of the 'industry' column
df.industry.mode()

0    retail
Name: industry, dtype: object

In [11]:
# Pairwise correlations are checked one pair per cell to find the strongest one.
# Pair: interaction_count and lead_score (absolute value, 2 decimals)
pair_corr = df.interaction_count.corr(df.lead_score)
round(abs(pair_corr), 2)

np.float64(0.01)

In [12]:
# Pair: number_of_courses_viewed and lead_score (absolute value, 2 decimals)
pair_corr = df.number_of_courses_viewed.corr(df.lead_score)
round(abs(pair_corr), 2)

np.float64(0.0)

In [13]:
# Pair: number_of_courses_viewed and interaction_count (absolute value, 2 decimals)
pair_corr = df.number_of_courses_viewed.corr(df.interaction_count)
round(abs(pair_corr), 2)

np.float64(0.02)

In [14]:
# Pair: annual_income and interaction_count (absolute value, 2 decimals)
pair_corr = df.annual_income.corr(df.interaction_count)
round(abs(pair_corr), 2)

np.float64(0.03)

In [None]:
# According to the absolute correlation coefficients, annual_income and interaction_count have the strongest correlation (0.03) of the pairs checked.

In [19]:
# Split the data into train / validation / test with scikit-learn's train_test_split.
# Missing values are filled first. fillna returns a new object, so the result has to be
# assigned back for the clean-up to take effect.
df[num_cols] = df[num_cols].fillna(0.0)
df[cat_cols] = df[cat_cols].fillna('NA')

# 20% of the rows are held out for test; 25% of the remaining rows become validation.
df_train_data_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_data_full, test_size=0.25, random_state=42)

# reset_index also returns a new frame, so each name is rebound to the re-indexed copy.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() removes the target column from the feature frame and returns it in one step.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

# Last expression of the cell, so the three partition sizes are displayed.
len(df_train), len(df_val), len(df_test)

In [20]:
# Fill missing values on the full training split (numeric -> 0.0, categorical -> 'NA'),
# then show the per-column null count to confirm nothing is left.
# (A mid-cell head() call was dropped: its result was discarded and never displayed.)
df_train_data_full[num_cols] = df_train_data_full[num_cols].fillna(0.0)
df_train_data_full[cat_cols] = df_train_data_full[cat_cols].fillna('NA')
df_train_data_full.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [None]:
#Calculate the mutual information score between converted and other categorical variables in the dataset.

In [17]:
# Mutual information between 'industry' and the 'converted' target, 2 decimals
round(mutual_info_score(df_train_data_full['industry'], df_train_data_full['converted']), 2)

0.01

In [42]:
# Mutual information between 'location' and the 'converted' target, 2 decimals
round(mutual_info_score(df_train_data_full['location'], df_train_data_full['converted']), 2)

0.0

In [18]:
# Mutual information between 'lead_source' and the 'converted' target, 2 decimals
round(mutual_info_score(df_train_data_full['lead_source'], df_train_data_full['converted']), 2)

0.03

In [19]:
# Mutual information between 'employment_status' and the 'converted' target, 2 decimals
round(mutual_info_score(df_train_data_full['employment_status'], df_train_data_full['converted']), 2)

0.01

In [None]:
# According to mutual_info_score, lead_source has the highest score (0.03).

In [None]:
# Now let's train a logistic regression model using one-hot encoding.
# Fit the model on the training dataset.

In [8]:
# One dict per training row, restricted to the categorical columns
train_cat_frame = df_train[cat_cols]
train_dicts = train_cat_frame.to_dict(orient='records')

In [9]:
# One-hot encode the categorical training records; the vectorizer is fitted on training rows only.
dv = DictVectorizer(sparse=False)
encoded_cat = dv.fit_transform(train_dicts)
# get_feature_names_out() ignores its input_features argument, so none is passed;
# the names come from the vocabulary learned in fit_transform above.
encoded_df = pd.DataFrame(encoded_cat, columns=dv.get_feature_names_out())

In [10]:
#concat the numerical and categorical columns
X_train = pd.concat([df_train[['number_of_courses_viewed','annual_income' ,'interaction_count','lead_score']].reset_index(drop=True),encoded_df.reset_index(drop=True)], axis=1)

In [49]:
# Fit a liblinear logistic regression on the assembled training matrix.
# NOTE: the features are passed in unscaled (no StandardScaler step is applied in this cell).
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42).fit(X_train, y_train)
# fit() returns the estimator itself, so showing `model` displays the fitted estimator.
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [50]:
# Build the validation feature matrix with the SAME encoder that was fitted on the training
# records. Fitting a second DictVectorizer on the validation rows would derive its columns from
# whichever categories happen to appear there, which need not line up with the columns the
# model was trained on, and it would overwrite the train-fitted `dv`.
val_numeric_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
df_val[cat_cols] = df_val[cat_cols].fillna('NA')
df_val[val_numeric_cols] = df_val[val_numeric_cols].fillna(0.0)
val_dicts = df_val[cat_cols].to_dict(orient='records')
# transform (not fit_transform): reuse the vocabulary learned from the training records.
# Separate names keep the training encoding (`encoded_cat` / `encoded_df`) intact.
encoded_val = dv.transform(val_dicts)
encoded_val_df = pd.DataFrame(encoded_val, columns=dv.get_feature_names_out())
X_val = pd.concat([df_val[val_numeric_cols].reset_index(drop=True), encoded_val_df], axis=1)
# Soft prediction: probability of the positive class (column 1 of predict_proba).
Y_pred = model.predict_proba(X_val)[:, 1]
# Ensure predictions and labels have matching shapes.
Y_pred.shape, y_val.shape

((293,), (293,))

In [51]:
#find the accuracy of the model
churn_decision = (Y_pred >= 0.5)
accuracy = (y_val == churn_decision).mean()
print("Validation Accuracy:", round(accuracy,2))

Validation Accuracy: 0.7


In [None]:
# Rounded to 2 decimal places the accuracy is 0.70; the answer choice selected is 0.74.

In [66]:
# Find the least useful feature with a feature-elimination technique:
# retrain with one categorical feature left out and compare accuracy with the baseline.
num_cols = ['number_of_courses_viewed','annual_income' ,'interaction_count','lead_score']
def prepare_train_logistical_model(list1:list):
    """Train a logistic regression on a subset of categorical features and score it.

    The numeric columns in the module-level ``num_cols`` are always included; ``list1``
    selects which categorical columns are one-hot encoded alongside them. The model is
    fitted on ``df_train`` / ``y_train`` and evaluated on ``df_val`` / ``y_val``.

    Parameters
    ----------
    list1 : list
        Names of the categorical columns to include.

    Returns
    -------
    Validation accuracy at a 0.5 probability threshold.
    """
    cat_features = list(list1)
    # Work on filled copies so the shared df_train / df_val frames are not modified.
    train_cat = df_train[cat_features].fillna('NA')
    train_num = df_train[num_cols].fillna(0.0).reset_index(drop=True)
    val_cat = df_val[cat_features].fillna('NA')
    val_num = df_val[num_cols].fillna(0.0).reset_index(drop=True)

    # Fit the encoder on training rows only and reuse it for validation, so both
    # matrices have exactly the same columns in the same order.
    dv = DictVectorizer(sparse=False)
    train_encoded = dv.fit_transform(train_cat.to_dict(orient='records'))
    val_encoded = dv.transform(val_cat.to_dict(orient='records'))
    feature_names = dv.get_feature_names_out()

    X_train = pd.concat([train_num, pd.DataFrame(train_encoded, columns=feature_names)], axis=1)
    X_val = pd.concat([val_num, pd.DataFrame(val_encoded, columns=feature_names)], axis=1)

    # Train the logistic regression model.
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Soft prediction -> hard decision at 0.5 -> share of matching labels.
    Y_pred = model.predict_proba(X_val)[:, 1]
    churn_decision = (Y_pred >= 0.5)
    return (y_val == churn_decision).mean()


    
# Each call leaves one categorical feature out; the difference from the baseline
# accuracy is reported under the name of the feature that was left out.
accuracy_diff1 = accuracy - prepare_train_logistical_model(['lead_source', 'employment_status', 'location'])
accuracy_diff2 = accuracy - prepare_train_logistical_model(['lead_source', 'industry', 'location'])
accuracy_diff3 = accuracy - prepare_train_logistical_model(['industry', 'employment_status', 'location'])
for label, diff in (
    ('industry:', accuracy_diff1),
    ('employment_status', accuracy_diff2),
    ('lead_source', accuracy_diff3),
):
    print(label, diff)



industry: 0.0
employment_status 0.0034129692832763903
lead_source -0.0034129692832765013


In [None]:
# Accuracy did not change when the industry feature was eliminated, so the industry
# feature has the least impact on accuracy.

In [73]:
#calculate regularised regression model for C: [0.01, 0.1, 1, 10, 100]
# Regularisation strengths to compare.
C = [0.01, 0.1, 1, 10, 100]
def logistic_regression_regularised(c: float):
    """Fit a logistic regression with regularisation parameter ``c`` and score it.

    Uses the notebook-level ``X_train`` / ``y_train`` for fitting and
    ``X_val`` / ``y_val`` for evaluation.

    Parameters
    ----------
    c : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    Validation accuracy at a 0.5 probability threshold. The value is also printed,
    as before, so existing output is unchanged.
    """
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    Y_pred = model.predict_proba(X_val)[:, 1]
    churn_decision = (Y_pred >= 0.5)
    accuracy = (y_val == churn_decision).mean()
    print("Validation Accuracy:", accuracy)
    return accuracy

# Keep every score so the C values can be compared after the loop.
accuracy_by_c = {}
for c in C:
    print(c)
    accuracy_by_c[c] = logistic_regression_regularised(c)

0.01
Validation Accuracy: 0.6996587030716723
0.1
Validation Accuracy: 0.6996587030716723
1
Validation Accuracy: 0.6996587030716723
10
Validation Accuracy: 0.6996587030716723
100
Validation Accuracy: 0.6996587030716723


In [None]:
# The validation accuracy is the same for every C value, so the smallest C (0.01) is picked for the regression model.