In [117]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Downloading the dataset

In [119]:
df = pd.read_csv("course_lead_scoring.csv")

In [121]:
# Check for missing values in df.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [123]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


Data preparation
Check if the missing values are presented in the features.
If there are missing values:
For caterogiral features, replace them with 'NA'
For numerical features, replace with with 0.0

In [125]:
df = df.fillna({col: 'NA' if df[col].dtype == 'object' else 0.0 for col in df.columns})

What is the most frequent observation (mode) for the column industry?

In [127]:
print(f"Mode of industry column:  {df.industry.mode().iloc[0]}")

Mode of industry column:  retail


Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [129]:
# Select numerical data in df.
df_num = df.copy().select_dtypes(include='number')

In [131]:
# Show the statistics of df_num.
df_num.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1462.0,1462.0,1462.0,1462.0
mean,2.031464,52472.172367,2.976744,0.506108,0.619015
std,1.449717,24254.34703,1.681564,0.288465,0.485795
min,0.0,0.0,0.0,0.0,0.0
25%,1.0,44097.25,2.0,0.2625,0.0
50%,2.0,57449.5,3.0,0.51,1.0
75%,3.0,68241.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [133]:
# Show the correletions between all the columns.
df_num.corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [79]:
df_num.corrwith(df.interaction_count)['lead_score'].round(3)

0.01

In [81]:
df_num.corrwith(df.number_of_courses_viewed)['lead_score'].round(3)

-0.005

In [83]:
df_num.corrwith(df.number_of_courses_viewed)['interaction_count'].round(3)

-0.024

In [85]:
df_num.corrwith(df.annual_income)['interaction_count'].round(3)

0.027

Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value y is not in your dataframe.

In [219]:
X = df[['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']]

# Target (dependent variable)
y = df['converted']

# Verify shapes
print(X.shape)
print(y.shape)

(1462, 4)
(1462,)


In [193]:
# import library for train test split
from sklearn.model_selection import train_test_split

In [221]:
# 1️⃣ Separate features and target
X = df.drop('converted', axis=1)   # all columns except target
y = df['converted']                # target column

# 2️⃣ First split: Train (60%) and Temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# 3️⃣ Second split: Validation (20%) and Test (20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# 4️⃣ Verify sizes
print("Train set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (877, 8) (877,)
Validation set: (292, 8) (292,)
Test set: (293, 8) (293,)


In [241]:
total = len(df)
print(f"Train: {len(X_train)/total:.2%}")
print(f"Validation: {len(X_val)/total:.2%}")
print(f"Test: {len(X_test)/total:.2%}")

Train: 59.99%
Validation: 19.97%
Test: 20.04%


Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

industry
location
lead_source
employment_status

Calculating Mutual Information

In [243]:
from sklearn.feature_selection import mutual_info_classif


In [247]:
categorical_cols = ['industry', 'location', 'lead_source', 'employment_status']


In [249]:
from sklearn.preprocessing import LabelEncoder

# Create a copy of training data for safety
X_train_cat = X_train[categorical_cols].copy()

# Encode each categorical column
for col in categorical_cols:
    le = LabelEncoder()
    X_train_cat[col] = le.fit_transform(X_train_cat[col].astype(str))


In [251]:
mi_scores = mutual_info_classif(X_train_cat, y_train, random_state=42)


In [253]:
mi_results = pd.Series(mi_scores, index=categorical_cols).sort_values(ascending=False)
mi_results = mi_results.round(2)
print(mi_results)


employment_status    0.04
industry             0.02
lead_source          0.01
location             0.00
dtype: float64


Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [255]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [257]:
# Categorical and numerical columns (example names)
cat_cols = ['industry', 'location', 'lead_source', 'employment_status']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [259]:
# One-hot encoder for categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'  # keeps numerical features as-is
)

# Define logistic regression model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000,
        random_state=42
    ))
])


In [261]:
model.fit(X_train, y_train)


In [263]:
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", round(val_accuracy, 2))


Validation Accuracy: 0.74


Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model using the same features and parameters as in Q4 (without rounding).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [265]:
# Define columns again
cat_cols = ['industry', 'location', 'lead_source', 'employment_status']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
all_features = cat_cols + num_cols

# Build base preprocessing + model pipeline
def build_model(features):
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), [f for f in features if f in cat_cols])
        ],
        remainder='passthrough'  # keep numeric as-is
    )
    
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(
            solver='liblinear',
            C=1.0,
            max_iter=1000,
            random_state=42
        ))
    ])
    return model

In [267]:
# Train using all features
base_model = build_model(all_features)
base_model.fit(X_train, y_train)

# Base accuracy
base_pred = base_model.predict(X_val)
base_acc = accuracy_score(y_val, base_pred)
print(f"Base Accuracy: {base_acc:.4f}")


Base Accuracy: 0.7432


In [271]:
#  Feature elimination & accuracy difference
accuracy_diff = {}

for feature in all_features:
    reduced_features = [f for f in all_features if f != feature]
    
    model = build_model(reduced_features)
    model.fit(X_train[reduced_features], y_train)
    preds = model.predict(X_val[reduced_features])
    acc = accuracy_score(y_val, preds)
    
    # Calculate difference
    accuracy_diff[feature] = round(base_acc - acc, 4)

#  Display differences
diff_df = pd.DataFrame({
    'Feature': accuracy_diff.keys(),
    'Accuracy_Difference': accuracy_diff.values()
}).sort_values(by='Accuracy_Difference', ascending=True)

print(diff_df)

                    Feature  Accuracy_Difference
5             annual_income              -0.1130
3         employment_status              -0.0034
0                  industry               0.0000
1                  location               0.0000
7                lead_score               0.0000
2               lead_source               0.0137
4  number_of_courses_viewed               0.0651
6         interaction_count               0.0685


In [273]:
accuracy_diff = {}

for feature in all_features:
    reduced_features = [f for f in all_features if f != feature]
    
    model = build_model(reduced_features)
    model.fit(X_train[reduced_features], y_train)
    preds = model.predict(X_val[reduced_features])
    acc = accuracy_score(y_val, preds)
    
    # Calculate difference
    accuracy_diff[feature] = round(base_acc - acc, 4)

#  Display differences
diff_df = pd.DataFrame({
    'Feature': accuracy_diff.keys(),
    'Accuracy_Difference': accuracy_diff.values()
}).sort_values(by='Accuracy_Difference', ascending=True)

print(diff_df)

                    Feature  Accuracy_Difference
5             annual_income              -0.1130
3         employment_status              -0.0034
0                  industry               0.0000
1                  location               0.0000
7                lead_score               0.0000
2               lead_source               0.0137
4  number_of_courses_viewed               0.0651
6         interaction_count               0.0685


In [275]:
cat_cols = ['industry', 'location', 'lead_source', 'employment_status']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough'
)

C_values = [0.01, 0.1, 1, 10, 100]
results = []

for C in C_values:
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(
            solver='liblinear',
            C=C,
            max_iter=1000,
            random_state=42
        ))
    ])
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    results.append({"C": C, "val_accuracy": round(acc, 3)})

df_results = pd.DataFrame(results).sort_values("C")
print(df_results)

# (Optional) best C
best_row = max(results, key=lambda r: r["val_accuracy"])
print(f"Best C: {best_row['C']} with validation accuracy {best_row['val_accuracy']}")

        C  val_accuracy
0    0.01         0.743
1    0.10         0.743
2    1.00         0.743
3   10.00         0.743
4  100.00         0.743
Best C: 0.01 with validation accuracy 0.743
