In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns


print("python version: "+sys.version)
print("pandas version: "+pd.__version__)
print("numpy version: "+np.__version__)
print("matplotlib version: "+plt.__version__)
print("seaborn version: "+sns.__version__)

python version: 3.13.7 (main, Aug 15 2025, 12:34:02) [GCC 15.2.1 20250813]
pandas version: 2.3.1
numpy version: 2.3.3
matplotlib version: 3.10.6
seaborn version: 0.13.2


In [None]:
data = 'wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [None]:
!wget $data -O data-homework-3.csv 

In [2]:
df = pd.read_csv('data-homework-3.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [4]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [6]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [7]:
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [8]:
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']
numerical_without_converted = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [9]:
df[categorical].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [10]:
df[numerical].isnull().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [11]:
df[numerical_without_converted].isnull().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
dtype: int64

In [12]:
df[categorical].nunique()

lead_source          5
industry             7
employment_status    4
location             7
dtype: int64

In [13]:
df[numerical].nunique()

number_of_courses_viewed      10
annual_income               1267
interaction_count             12
lead_score                   101
converted                      2
dtype: int64

In [14]:
df.lead_source = df.lead_source.fillna('NA')
df.industry = df.industry.fillna('NA')
df.employment_status = df.employment_status.fillna('NA')
df.location = df.location.fillna('NA')
df.annual_income  = df.annual_income.fillna(0)

In [15]:
df[numerical].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [16]:
df[numerical_without_converted].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
dtype: int64

In [17]:
df[categorical].isnull().sum()

lead_source          0
industry             0
employment_status    0
location             0
dtype: int64

In [18]:
# Q1 
# What is the most frequent observation (mode) for the column industry?

df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [19]:
# Q2
# Biggest correlation (1 point) 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score


df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)



In [20]:
df_full_train = df_full_train.reset_index(drop=True)

In [21]:
df_full_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,social_media,manufacturing,2,44403.0,self_employed,australia,1,0.71,0
1,events,retail,3,38048.0,student,north_america,6,0.97,1
2,social_media,education,2,71399.0,,europe,1,0.51,1
3,referral,education,2,47912.0,employed,australia,1,0.04,0
4,paid_ads,healthcare,1,34806.0,employed,europe,4,0.32,1
...,...,...,...,...,...,...,...,...,...
1164,events,manufacturing,1,57039.0,employed,south_america,2,0.30,0
1165,events,healthcare,2,56185.0,student,south_america,2,0.44,0
1166,paid_ads,manufacturing,1,56402.0,student,north_america,1,0.02,0
1167,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1


In [22]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [23]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [24]:
y_train_interaction_count = df_train.interaction_count.values
y_val_interaction_count = df_val.interaction_count.values
y_test_interaction_count = df_test.interaction_count.values

del df_train['interaction_count']
del df_val['interaction_count']
del df_test['interaction_count']

y_train_lead_score = df_train.lead_score.values
y_val_lead_score = df_val.lead_score.values
y_test_lead_score = df_test.lead_score.values

del df_train['lead_score']
del df_val['lead_score']
del df_test['lead_score']

y_train_annual_income = df_train.annual_income.values
y_val_annual_income = df_val.annual_income.values
y_test_annual_income = df_test.annual_income.values

del df_train['annual_income']
del df_val['annual_income']
del df_test['annual_income']

y_train_number_of_courses_viewed = df_train.number_of_courses_viewed.values
y_val_number_of_courses_viewed = df_val.number_of_courses_viewed.values
y_test_number_of_courses_viewed = df_test.number_of_courses_viewed.values

del df_train['number_of_courses_viewed']
del df_val['number_of_courses_viewed']
del df_test['number_of_courses_viewed']



In [25]:
print("Max value for number_of_courses_viewed: ",df_full_train.number_of_courses_viewed.max())
print("Max value for annual_income: ",df_full_train.annual_income.max())
print("Max value for interaction_count: ",df_full_train.interaction_count.max())
print("Max value for lead_score: ",df_full_train.lead_score.max())

Max value for number_of_courses_viewed:  8
Max value for annual_income:  109899.0
Max value for interaction_count:  11
Max value for lead_score:  1.0


In [26]:
df_full_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,social_media,manufacturing,2,44403.0,self_employed,australia,1,0.71,0
1,events,retail,3,38048.0,student,north_america,6,0.97,1
2,social_media,education,2,71399.0,,europe,1,0.51,1
3,referral,education,2,47912.0,employed,australia,1,0.04,0
4,paid_ads,healthcare,1,34806.0,employed,europe,4,0.32,1
...,...,...,...,...,...,...,...,...,...
1164,events,manufacturing,1,57039.0,employed,south_america,2,0.30,0
1165,events,healthcare,2,56185.0,student,south_america,2,0.44,0
1166,paid_ads,manufacturing,1,56402.0,student,north_america,1,0.02,0
1167,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1


In [34]:
df_full_train[numerical_without_converted].corr().abs()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.012227,0.044381,0.009427
annual_income,0.012227,1.0,0.011959,0.021273
interaction_count,0.044381,0.011959,1.0,0.025393
lead_score,0.009427,0.021273,0.025393,1.0


In [None]:
# Q3 Mutual information 
# Which of these variables has the biggest mutual information score?   

In [48]:
from sklearn.metrics import mutual_info_score

In [55]:
m1 = mutual_info_score(df_full_train.converted, df_full_train.lead_source)
m2 = mutual_info_score(df_full_train.converted, df_full_train.industry)
m3 = mutual_info_score(df_full_train.converted, df_full_train.employment_status)
m4 = mutual_info_score(df_full_train.converted, df_full_train.location)

In [57]:
print("biggest mutual information score")
print("converted - lead_source:\t", m1)
print("converted - industry:\t\t", m2)
print("converted - employment_status:\t", m3)
print("converted - location:\t\t", m3)

biggest mutual information score
converted - lead_source:	 0.025665373935054955
converted - industry:		 0.011684562750165564
converted - employment_status:	 0.013258496589914293
converted - location:		 0.013258496589914293


In [38]:
mutual_info_score(df_full_train.converted, df_full_train.employment_status)

0.013258496589914293

0.0022530354195563346