In [None]:
=====================================
# 🧠 Machine Learning Assignment - Lead Scoring Dataset
# Dataset: Bank Marketing (Lead Scoring)
# Source: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
# ====================================================

# Task: Classification - Predict whether a client signed up (converted)
# ====================================================

In [2]:
# ✅ Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
# ====================================================
# Task 1 - Data Preparation
# ====================================================

In [4]:
# Load the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# Display basic info
print("Dataset shape:", df.shape)
print("First 5 rows:")
display(df.head())

Dataset shape: (1462, 9)
First 5 rows:


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# ----------------------------------------------------
# Handle Missing Values
# ----------------------------------------------------

In [6]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('NA')
    else:
        df[col] = df[col].fillna(0.0)

In [7]:
# Confirm missing values handled
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [8]:
# ----------------------------------------------------
# Question 1 - Most frequent observation (mode) for 'industry'
# ----------------------------------------------------

In [9]:
numerical_features = df.select_dtypes(include=[np.number])
corr_matrix = numerical_features.corr()

# Display the correlation matrix
print("\nCorrelation matrix of numerical features:")
display(corr_matrix)


Correlation matrix of numerical features:


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [10]:
# Find the two most correlated features
corr_pairs = (
    corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    .stack()
    .reset_index()
)
corr_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']
top_corr = corr_pairs.sort_values(by='Correlation', ascending=False).head(1)
print("\nQuestion 2 Answer:\nMost correlated features:")
display(top_corr)


Question 2 Answer:
Most correlated features:


Unnamed: 0,Feature_1,Feature_2,Correlation
3,number_of_courses_viewed,converted,0.435914


In [11]:
# ====================================================
# Task 2 - Split the Data
# ====================================================

In [12]:
# Separate target and features
target = 'converted'
y = df[target].astype(int)
X = df.drop(columns=[target])

In [13]:
# First split: Train (60%) and Temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

In [14]:
# Second split: Validation (20%) and Test (20%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("\nDataset split complete:")
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")


Dataset split complete:
Train: 877, Validation: 292, Test: 293


In [15]:
# ====================================================
# Question 3 - Mutual Information (Categorical Variables)
# ====================================================

In [16]:
cat_features = X_train.select_dtypes(include=['object'])
mi_scores = mutual_info_classif(pd.get_dummies(cat_features, drop_first=True), y_train)

mi_scores = pd.Series(mi_scores, index=pd.get_dummies(cat_features, drop_first=True).columns)
mi_sorted = mi_scores.sort_values(ascending=False).round(2)

print("\nQuestion 3 Answer:\nMutual Information Scores (Top Features):")
display(mi_sorted.head(10))
print(f"Feature with highest mutual information: {mi_sorted.idxmax()}")


Question 3 Answer:
Mutual Information Scores (Top Features):


industry_education                 0.04
location_north_america             0.04
lead_source_referral               0.03
employment_status_unemployed       0.03
industry_manufacturing             0.02
industry_healthcare                0.02
lead_source_events                 0.01
employment_status_self_employed    0.01
location_middle_east               0.00
location_south_america             0.00
dtype: float64

Feature with highest mutual information: industry_education


In [17]:
# ====================================================
# Question 4 - Logistic Regression (One-Hot Encoded)
# ====================================================

In [18]:
# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_enc = encoder.fit_transform(X_train)
X_val_enc = encoder.transform(X_val)

In [19]:
# Train the logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [20]:
# Validate
y_pred_val = model.predict(X_val_enc)
val_accuracy = round(accuracy_score(y_val, y_pred_val), 2)
print(f"\nQuestion 4 Answer:\nValidation Accuracy: {val_accuracy}")


Question 4 Answer:
Validation Accuracy: 0.84


In [21]:
# ====================================================
# Question 5 - Feature Elimination (Least Useful Feature)
# ====================================================

In [22]:
base_accuracy = accuracy_score(y_val, y_pred_val)
print(f"\nBase accuracy with all features: {base_accuracy:.4f}")

feature_names = encoder.get_feature_names_out(X_train.columns)
feature_accuracies = {}

for i, feature in enumerate(feature_names):
    # Drop one feature column
    X_train_drop = np.delete(X_train_enc, i, axis=1)
    X_val_drop = np.delete(X_val_enc, i, axis=1)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_drop, y_train)
    y_pred_drop = model.predict(X_val_drop)
    acc = accuracy_score(y_val, y_pred_drop)
    
    diff = base_accuracy - acc
    feature_accuracies[feature] = diff



Base accuracy with all features: 0.8390


In [23]:
# Find the feature with smallest difference
least_important_feature = min(feature_accuracies, key=feature_accuracies.get)
print(f"\nQuestion 5 Answer:\nLeast useful feature: {least_important_feature}")


Question 5 Answer:
Least useful feature: lead_source_paid_ads


In [24]:
# ====================================================
# Question 6 - Regularized Logistic Regression
# ====================================================

In [25]:
C_values = [0.01, 0.1, 1, 10, 100]
val_accuracies = {}

for c in C_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train_enc, y_train)
    y_pred_val = model.predict(X_val_enc)
    acc = round(accuracy_score(y_val, y_pred_val), 3)
    val_accuracies[c] = acc

best_c = max(val_accuracies, key=val_accuracies.get)

print("\nQuestion 6 Answer:")
print("Validation accuracies for each C:")
for c, acc in val_accuracies.items():
    print(f"C={c}: Accuracy={acc}")
print(f"\nBest C value: {best_c} (Accuracy={val_accuracies[best_c]})")


Question 6 Answer:
Validation accuracies for each C:
C=0.01: Accuracy=0.736
C=0.1: Accuracy=0.815
C=1: Accuracy=0.839
C=10: Accuracy=0.825
C=100: Accuracy=0.822

Best C value: 1 (Accuracy=0.839)
