### Data Preparation

In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('course_lead_scoring.csv')

# Define target
target = 'converted'

# Separate features and target
X = df.drop(columns=[target])
y = df[target]

# Identify numerical and categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Handle missing values
X[categorical_cols] = X[categorical_cols].fillna('NA')
X[numerical_cols] = X[numerical_cols].fillna(0.0)

# Reattach target for Q1 & Q2
df_clean = X.copy()
df_clean[target] = y

##### Question 1: Mode of industry

In [2]:
mode_industry = df_clean['industry'].mode()[0]
print("Q1 Answer:", mode_industry)  # Expected: 'retail'

Q1 Answer: retail


##### Question 2: Highest Correlation Pair

In [3]:
# Get numerical columns (excluding target)
num_features = df_clean.select_dtypes(include=[np.number]).columns.tolist()
if target in num_features:
    num_features.remove(target)

corr_matrix = df_clean[num_features].corr()
# Get upper triangle without diagonal
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find max correlation
max_corr = upper.unstack().dropna().abs().idxmax()
max_corr_value = upper.loc[max_corr]

print("Q2 Answer: Features =", max_corr, "| Correlation =", round(max_corr_value, 4))
# Note: In this dataset, correlations are generally very low.
# The pair with the highest absolute correlation is likely ('annual_income', 'lead_score').

Q2 Answer: Features = ('interaction_count', 'annual_income') | Correlation = nan


### Data Splitting

In [4]:
from sklearn.model_selection import train_test_split

# Final features and target
X = df_clean.drop(columns=[target])
y = df_clean[target]

# First split: 80% (train+val) / 20% (test)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Second split: 60% train / 20% val
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

##### Question 3: Mutual Information

In [5]:
from sklearn.metrics import mutual_info_score

# Identify categorical columns in training set
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

mi_scores = {}
for col in cat_cols:
    mi = mutual_info_score(X_train[col], y_train)
    mi_scores[col] = round(mi, 2)

# Find the variable with the highest MI
best_feature = max(mi_scores, key=mi_scores.get)
print("Q3 Answer: Feature =", best_feature, "| MI Score =", mi_scores[best_feature])
# Expected: 'lead_source' or 'industry' often has the highest MI.

Q3 Answer: Feature = lead_source | MI Score = 0.03


##### Question 4: Logistic Regression Accuracy

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Convert DataFrames to list of dicts for DictVectorizer
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# One-hot encode
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)

# Train model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict and evaluate
y_pred = model.predict(X_val_encoded)
accuracy = round(accuracy_score(y_val, y_pred), 2)
print("Q4 Answer: Validation Accuracy =", accuracy)

Q4 Answer: Validation Accuracy = 0.73


##### Question 5: Least Useful Feature (Feature Elimination)

In [9]:
original_accuracy = accuracy  # from Q4

feature_impact = {}
feature_names = dv.get_feature_names_out()  # This is a list of all feature names after encoding

# Loop over each feature by index and name
for i, fname in enumerate(feature_names):
    # Remove feature i from training and validation sets
    X_train_temp = np.delete(X_train_encoded, i, axis=1)
    X_val_temp = np.delete(X_val_encoded, i, axis=1)
    
    # Train temporary model
    model_temp = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_temp.fit(X_train_temp, y_train)
    
    # Predict and evaluate
    y_pred_temp = model_temp.predict(X_val_temp)
    acc_temp = accuracy_score(y_val, y_pred_temp)
    
    # Record the drop in accuracy
    impact = original_accuracy - acc_temp
    feature_impact[fname] = impact

# Now find the least useful feature (smallest drop = least important)
least_useful_feature = min(feature_impact, key=feature_impact.get)
print("Q5 Answer: Least Useful Feature =", least_useful_feature)

Q5 Answer: Least Useful Feature = annual_income


##### Question 6: Regularized Logistic Regression (Best C)

In [8]:
C_values = [0.01, 0.1, 1, 10, 100]
best_C = None
best_accuracy = 0

for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train_encoded, y_train)
    y_pred_reg = model_reg.predict(X_val_encoded)
    acc_reg = round(accuracy_score(y_val, y_pred_reg), 3)
    
    if acc_reg > best_accuracy:
        best_accuracy = acc_reg
        best_C = C
    
    print(f"C={C}, Accuracy={acc_reg}")

print("Q6 Answer: Best C =", best_C, "with Accuracy =", best_accuracy)
# Expected: C=1 or C=10 often works best, as C=1 is the default and the dataset isn't huge.

C=0.01, Accuracy=0.734
C=0.1, Accuracy=0.73
C=1, Accuracy=0.73
C=10, Accuracy=0.73
C=100, Accuracy=0.73
Q6 Answer: Best C = 0.01 with Accuracy = 0.734
