In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler

In [3]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
!wget $data

--2025-10-14 15:36:32--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-14 15:36:32 (42.2 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [2]:
df = pd.read_csv('course_lead_scoring.csv')
df.columns = df.columns.str.lower().str.replace(' ','_')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
#check if there are any categorical values that Nan and convert them to 'NA'
cat_cols = df.select_dtypes(include=['object', 'category']).columns
#replace categorical features of nan to NA
df[cat_cols] = df[cat_cols].fillna('NA')
df[cat_cols].head()

Unnamed: 0,lead_source,industry,employment_status,location
0,paid_ads,,unemployed,south_america
1,social_media,retail,employed,south_america
2,events,healthcare,unemployed,australia
3,paid_ads,retail,,australia
4,referral,education,self_employed,europe


In [4]:
#check if there are any numberical features that are Nan and convert them to 0
num_cols = df.select_dtypes(include=['number']).columns
#replace numerical features of nan to 0
df[num_cols] = df[num_cols].fillna(0.0)
df[num_cols].head()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.8,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1


In [5]:
#what is mode for coloumn 'industry'
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [11]:
# find the correlation matrix of the following pairs and find the strongest correlation
#interaction_count and lead_score
round(abs(df['interaction_count'].corr(df['lead_score'])),2)

np.float64(0.01)

In [12]:
#number_of_courses_viewed and lead_score
round(abs(df['number_of_courses_viewed'].corr(df['lead_score'])),2)

np.float64(0.0)

In [13]:
#number_of_courses_viewed and interaction_count
round(abs(df['number_of_courses_viewed'].corr(df['interaction_count'])),2)

np.float64(0.02)

In [14]:
#annual_income and interaction_count
round(abs(df['annual_income'].corr(df['interaction_count'])),2)

np.float64(0.03)

In [None]:
# according to the absolute correlation coefficient annumal income has the highest correlation to the interaction count

In [6]:
#split the data using train_test_split function of sickit_learn
#do a data cleanup for Nans
df[num_cols].fillna(0.0)
df[cat_cols].fillna('NA')
df_train_data_full, df_test = train_test_split(df, test_size=0.2,random_state=42)
df_train, df_val = train_test_split(df_train_data_full, test_size=0.25,random_state=42)
len(df_train),len(df_val),len(df_test)
df_train.reset_index(drop=True)
df_val.reset_index(drop=True)
df_test.reset_index(drop=True)
y_train= df_train.converted.values
y_val =df_val.converted.values
y_test =df_val.converted.values
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [7]:
df_train_data_full.head()
df_train_data_full[num_cols] = df_train_data_full[num_cols].fillna(0.0)
df_train_data_full[cat_cols] = df_train_data_full[cat_cols].fillna('NA')
df_train_data_full.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [None]:
#Calculate the mutual information score between converted and other categorical variables in the dataset.

In [17]:
round(mutual_info_score(df_train_data_full.industry, df_train_data_full.converted),2)

0.01

In [42]:
round(mutual_info_score(df_train_data_full.location, df_train_data_full.converted),2)

0.0

In [18]:
round(mutual_info_score(df_train_data_full.lead_source, df_train_data_full.converted),2)

0.03

In [19]:
round(mutual_info_score(df_train_data_full.employment_status, df_train_data_full.converted),2)

0.01

In [None]:
# according to the mutual_info_score lead_source as has the biggest_value

In [None]:
#Now let's train a logistic regression.using one-hot encoding.
#Fit the model on the training dataset.

In [10]:
train_dicts = df_train[cat_cols].to_dict(orient='records')

In [13]:
dv = DictVectorizer(sparse=False)
encoded_cat =dv.fit_transform(train_dicts)
encoded_df = pd.DataFrame(encoded_cat, columns=dv.get_feature_names_out(cat_cols))

In [16]:
#concat the numerical and categorical columns
X_train = pd.concat([df_train[['number_of_courses_viewed','annual_income' ,'interaction_count','lead_score']].reset_index(drop=True),encoded_df.reset_index(drop=True)], axis=1)

In [38]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
model =model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
#check for mode coefficient
#model.coef_[0].round(3)
#get the soft predicition for converted
y_pred = model.predict_proba(X_train)[:, 1]
print(y_pred)

[0.57914343 0.87283491 0.58816107 0.51231211 0.63666475 0.75521068
 0.67660732 0.80463891 0.31931235 0.52336705 0.42762898 0.75635962
 0.36365335 0.479759   0.66891574 0.79118097 0.84805492 0.75295599
 0.74242974 0.44250074 0.59299541 0.71532757 0.89505926 0.40505602
 0.41274906 0.49397089 0.76167979 0.54183228 0.91299586 0.84329958
 0.43502339 0.8413228  0.45134426 0.73331191 0.69094239 0.63501138
 0.78633979 0.70340975 0.31374644 0.66994888 0.81195513 0.54070748
 0.52703678 0.66105373 0.89692378 0.93952744 0.69103809 0.57569837
 0.43601078 0.58839132 0.40836197 0.62054107 0.59360953 0.80782455
 0.55881103 0.89447257 0.85676682 0.3290991  0.88454471 0.61158866
 0.55737895 0.80158813 0.29433961 0.76846368 0.26685382 0.61036624
 0.70840796 0.76184472 0.88772862 0.60724473 0.74046877 0.39672339
 0.94279447 0.89334272 0.77167755 0.46010978 0.67939687 0.78038502
 0.88118541 0.3279005  0.69468482 0.86234034 0.50374313 0.39103763
 0.32306121 0.85545275 0.48720894 0.67632878 0.78732671 0.3807