In [35]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

## Dataset

In [None]:
data =  'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [5]:
!wget -O data.csv $data

--2025-10-12 13:26:54--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘data.csv’


2025-10-12 13:26:55 (971 KB/s) - ‘data.csv’ saved [80876/80876]



In [None]:
df = pd.read_csv('data.csv')

In [18]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


## Data preparation

In [19]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [20]:
categorical_vars = ['lead_source', 'industry', 'employment_status', 'location']
numerical_vars = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [21]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [None]:
for var in categorical_vars:
    df[var] = df[var].fillna('NA')

for var in numerical_vars:
    df[var] = df[var].fillna(0.0)

### Question 1

What is the most frequent observation (mode) for the column `industry`?

- `NA`
- `technology`
- `healthcare`
- `retail`

In [32]:
df.industry.value_counts().sort_values(ascending=False)

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [150]:
df.industry.mode()

0    retail
Name: industry, dtype: object

### Question 2

Create the correlation matrix for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `interaction_count` and `lead_score`
- `number_of_courses_viewed` and `lead_score`
- `number_of_courses_viewed` and `interaction_count`
- `annual_income` and `interaction_count`

Only consider the pairs above when answering this question.


In [33]:
df_numerical = df[numerical_vars]

In [39]:
df_numerical.corr(method='pearson')
df_numerical.corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


## Split the data

In [40]:
from sklearn.model_selection import train_test_split

In [None]:
random_state = 42

In [42]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=random_state)
len(df_full_train), len(df_test)

(1169, 293)

In [43]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=random_state)
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [44]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [45]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [46]:
df_train = df_train.drop(columns=['converted'])
df_val = df_val.drop(columns=['converted'])
df_test = df_test.drop(columns=['converted'])

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `industry`
- `location`
- `lead_source`
- `employment_status`


In [47]:
from sklearn.metrics import mutual_info_score

In [None]:
for var in categorical_vars:
    mi_score = mutual_info_score(df_full_train[var], df_full_train.converted)
    print(f'{var} : {round(mi_score, 2)}')

lead_source : 0.03
industry : 0.01
employment_status : 0.01
location : 0.0


## Logistic regression

### One-hot encoding

In [59]:
from sklearn.feature_extraction import DictVectorizer

In [63]:
train_dicts = df_train[categorical_vars + numerical_vars].to_dict(orient='records')
val_dicts = df_val[categorical_vars + numerical_vars].to_dict(orient='records')

In [61]:
dv = DictVectorizer(sparse=False)

In [62]:
X_train = dv.fit_transform(train_dicts)

In [64]:
X_val = dv.transform(val_dicts)

### Fitting the model

In [69]:
from sklearn.linear_model import LogisticRegression

In [71]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [72]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


### Question 4

* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.64
- 0.74
- 0.84
- 0.94

In [88]:
y_val_pred_proba = model.predict_proba(X_val)[:, 1]

In [93]:
accuracy_val = ((y_val_pred_proba >= 0.5) == y_val).mean()
print(round(accuracy_val, 4))

0.6997


In [94]:
y_train_pred_proba = model.predict_proba(X_train)[:, 1]
accuracy_train = ((y_train_pred_proba >= 0.5) == y_train).mean()
print(round(accuracy_train, 4))

0.7386


### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model using the same features and parameters as in Q4 (without rounding).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `'industry'`
- `'employment_status'`
- `'lead_score'`

> **Note**: The difference doesn't have to be positive.


In [107]:
train_features = categorical_vars + numerical_vars
train_features

['lead_source',
 'industry',
 'employment_status',
 'location',
 'number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [None]:
df_accuracies = pd.DataFrame({
    'feature_eliminated': pd.Series(dtype='str'), 
    'modified_accuracy_val': pd.Series(dtype='float'), 
    'modified_accuracy_val_abs_diff': pd.Series(dtype='float')
    })

for feature in train_features:
    modified_train_features = train_features.copy()
    modified_train_features.remove(feature)
    
    train_dicts = df_train[modified_train_features].to_dict(orient='records')
    val_dicts = df_val[modified_train_features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    modified_y_val_pred_proba = model.predict_proba(X_val)[:, 1]
    modified_accuracy_val = ((modified_y_val_pred_proba >= 0.5) == y_val).mean()
    modified_accuracy_val_abs_diff = abs(modified_accuracy_val - accuracy_val)

    df_accuracies = pd.concat([df_accuracies, 
                               pd.DataFrame({'feature_eliminated': [feature], 
                                             'modified_accuracy_val': [modified_accuracy_val], 
                                             'modified_accuracy_val_abs_diff': [modified_accuracy_val_abs_diff]})],
                                             ignore_index=True)


In [None]:
df_accuracies.sort_values('modified_accuracy_val_abs_diff')

Unnamed: 0,feature_eliminated,modified_accuracy_val,modified_accuracy_val_abs_diff
1,industry,0.699659,0.0
2,employment_status,0.696246,0.003413
0,lead_source,0.703072,0.003413
7,lead_score,0.706485,0.006826
3,location,0.709898,0.010239
4,number_of_courses_viewed,0.556314,0.143345
6,interaction_count,0.556314,0.143345
5,annual_income,0.853242,0.153584


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.


In [146]:
df_accuracies = pd.DataFrame({
    'parameter_C': pd.Series(dtype='float'), 
    'accuracy_val': pd.Series(dtype='float')
    })

for parameter_C in [0.01, 0.1, 1, 10, 100]: 
    train_dicts = df_train[train_features].to_dict(orient='records')
    val_dicts = df_val[train_features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(penalty='l1', solver='liblinear', C=parameter_C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_val_pred_proba = model.predict_proba(X_val)[:, 1]
    accuracy_val = ((y_val_pred_proba >= 0.5) == y_val).mean()
    
    df_accuracies = pd.concat(
        [
            df_accuracies, 
            pd.DataFrame(
                    {
                        'parameter_C': [parameter_C], 
                        'accuracy_val': [round(accuracy_val, 3)]
                    })
        ],
        ignore_index=True
    )

In [147]:
df_accuracies

Unnamed: 0,parameter_C,accuracy_val
0,0.01,0.648
1,0.1,0.84
2,1.0,0.863
3,10.0,0.857
4,100.0,0.853
