### Data Preparation

In [12]:
import pandas as pd
import numpy as np

# Load data
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(data)

# Define target
target = 'converted'

# Separate features and target
X = df.drop(columns=[target])
y = df[target]

# Identify numerical and categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Handle missing values
X[categorical_cols] = X[categorical_cols].fillna('NA')
X[numerical_cols] = X[numerical_cols].fillna(0.0)

# Reattach target for Q1 & Q2
df_clean = X.copy()
df_clean[target] = y

##### Question 1: Mode of industry

In [13]:
mode_industry = df_clean['industry'].mode()[0]
print("Q1 Answer:", mode_industry)  # Expected: 'retail'

Q1 Answer: retail


##### Question 2: Highest Correlation Pair

In [14]:
# Get numerical columns (excluding target)
num_features = df_clean.select_dtypes(include=[np.number]).columns.tolist()
if target in num_features:
    num_features.remove(target)

corr_matrix = df_clean[num_features].corr()
# Get upper triangle without diagonal
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find max correlation
max_corr = upper.unstack().dropna().abs().idxmax()
max_corr_value = upper.loc[max_corr]

print("Q2 Answer: Features =", max_corr, "| Correlation =", round(max_corr_value, 4))
# Note: In this dataset, correlations are generally very low.
# The pair with the highest absolute correlation is likely ('annual_income', 'lead_score').

Q2 Answer: Features = ('interaction_count', 'annual_income') | Correlation = nan


### Data Splitting

In [15]:
from sklearn.model_selection import train_test_split

# Final features and target
X = df_clean.drop(columns=[target])
y = df_clean[target]

# First split: 80% (train+val) / 20% (test)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Second split: 60% train / 20% val
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

##### Question 3: Mutual Information

In [16]:
from sklearn.metrics import mutual_info_score

# Identify categorical columns in training set
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

mi_scores = {}
for col in cat_cols:
    mi = mutual_info_score(X_train[col], y_train)
    mi_scores[col] = round(mi, 2)

# Find the variable with the highest MI
best_feature = max(mi_scores, key=mi_scores.get)
print("Q3 Answer: Feature =", best_feature, "| MI Score =", mi_scores[best_feature])
# Expected: 'lead_source' or 'industry' often has the highest MI.

Q3 Answer: Feature = lead_source | MI Score = 0.03


##### Question 4: Logistic Regression Accuracy

In [17]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Convert DataFrames to list of dicts for DictVectorizer
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# One-hot encode
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)

# Train model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict and evaluate
y_pred = model.predict(X_val_encoded)
accuracy = round(accuracy_score(y_val, y_pred), 2)
print("Q4 Answer: Validation Accuracy =", accuracy)

Q4 Answer: Validation Accuracy = 0.73


##### Question 5: Least Useful Feature (Feature Elimination)

In [18]:
# One-hot encode full feature set
def encode_features(X_train, X_val, features_to_drop=None):
    X_train_copy = X_train.copy()
    X_val_copy = X_val.copy()

    if features_to_drop:
        X_train_copy = X_train_copy.drop(columns=features_to_drop)
        X_val_copy = X_val_copy.drop(columns=features_to_drop)

    train_dicts = X_train_copy.to_dict(orient='records')
    val_dicts = X_val_copy.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train_enc = dv.fit_transform(train_dicts)
    X_val_enc = dv.transform(val_dicts)

    return X_train_enc, X_val_enc, dv

# Baseline model (all features)
X_train_full, X_val_full, dv_full = encode_features(X_train, X_val)
model_full = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_full.fit(X_train_full, y_train)
y_pred_full = model_full.predict(X_val_full)
acc_full = accuracy_score(y_val, y_pred_full)

print(f"Baseline accuracy: {acc_full:.4f}")

# Evaluate drop for each candidate feature
candidate_features = ['industry', 'employment_status', 'lead_score']
impact = {}

for feat in candidate_features:
    # Ensure feature exists in data
    if feat not in X_train.columns:
        print(f"Warning: {feat} not in dataset")
        continue

    X_train_sub, X_val_sub, _ = encode_features(X_train, X_val, features_to_drop=[feat])
    model_sub = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_sub.fit(X_train_sub, y_train)
    y_pred_sub = model_sub.predict(X_val_sub)
    acc_sub = accuracy_score(y_val, y_pred_sub)

    drop = acc_full - acc_sub
    impact[feat] = drop
    print(f"Feature: {feat} | Accuracy without it: {acc_sub:.4f} | Drop: {drop:.4f}")

# Find feature with smallest drop (least useful)
least_useful = min(impact, key=impact.get)
print(f"\nAnswer: The least useful feature is '{least_useful}'")

Baseline accuracy: 0.7304
Feature: industry | Accuracy without it: 0.7304 | Drop: 0.0000
Feature: employment_status | Accuracy without it: 0.7338 | Drop: -0.0034
Feature: lead_score | Accuracy without it: 0.7304 | Drop: 0.0000

Answer: The least useful feature is 'employment_status'


##### Question 6: Regularized Logistic Regression (Best C)

In [19]:
C_values = [0.01, 0.1, 1, 10, 100]
best_C = None
best_accuracy = 0

for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train_encoded, y_train)
    y_pred_reg = model_reg.predict(X_val_encoded)
    acc_reg = round(accuracy_score(y_val, y_pred_reg), 3)

    if acc_reg > best_accuracy:
        best_accuracy = acc_reg
        best_C = C

    print(f"C={C}, Accuracy={acc_reg}")

print("Q6 Answer: Best C =", best_C, "with Accuracy =", best_accuracy)
# Expected: C=1 or C=10 often works best, as C=1 is the default and the dataset isn't huge.

C=0.01, Accuracy=0.734
C=0.1, Accuracy=0.73
C=1, Accuracy=0.73
C=10, Accuracy=0.73
C=100, Accuracy=0.73
Q6 Answer: Best C = 0.01 with Accuracy = 0.734
