In [24]:
import pandas as pd
import numpy as np

In [25]:
# Location of the course lead-scoring CSV; it is read directly over HTTP.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [26]:
# Impute missing values column by column: numeric columns get 0.0, every other
# column gets the literal 'NA'. The numeric check uses is_numeric_dtype instead
# of comparing the dtype against 'object', so text columns stored with pandas'
# dedicated string dtype are still filled with 'NA' rather than being sent to
# the numeric branch.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(0.0)
    else:
        df[col] = df[col].fillna('NA')

### Question 1

In [27]:
# mode() returns every most-frequent value as a Series; take the first one.
industry_modes = df['industry'].mode()
q1 = industry_modes[0]
print(f"ANSWER Q1: {q1}")

ANSWER Q1: retail


### Question 2

In [28]:
# Names of all numeric columns (this still contains the 'converted' target),
# followed by the pairwise correlation matrix over exactly those columns.
numerical_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df.loc[:, numerical_cols].corr()

In [29]:
# Candidate feature pairs whose correlations are compared for Question 2.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

In [30]:
# Scan the candidate pairs and remember the one with the largest absolute
# correlation; only a strictly larger value replaces the current best.
max_corr = 0
q2 = ""
for first, second in pairs:
    strength = abs(corr_matrix.loc[first, second])
    print(f"Correlation between {first} and {second}: {strength:.4f}")
    if strength > max_corr:
        max_corr, q2 = strength, f"{first} and {second}"

Correlation between interaction_count and lead_score: 0.0099
Correlation between number_of_courses_viewed and lead_score: 0.0049
Correlation between number_of_courses_viewed and interaction_count: 0.0236
Correlation between annual_income and interaction_count: 0.0270


In [31]:
# Report the pair that reached the largest absolute correlation in the loop above.
print(f"ANSWER Q2: {q2}")

ANSWER Q2: annual_income and interaction_count


### Question 3

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# 60/20/20 split: hold out 40% of the rows first, then cut that hold-out in
# half to obtain the validation and test sets. Both calls use the same seed.
df_train, df_temp = train_test_split(
    df,
    test_size=0.4,
    random_state=42,
)
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
)

In [34]:
# Show how many rows ended up in each split.
for label, part in (("Training", df_train), ("Validation", df_val), ("Test", df_test)):
    print(f"{label} set: {len(part)} samples")


Training set: 877 samples
Validation set: 292 samples
Test set: 293 samples


In [35]:
# Pull the 'converted' target out of each split (in train, val, test order).
y_train, y_val, y_test = (
    part['converted'] for part in (df_train, df_val, df_test)
)

In [36]:
# Remove the target column from every split so it cannot be used as a feature.
df_train, df_val, df_test = (
    part.drop(columns=['converted'])
    for part in (df_train, df_val, df_test)
)

In [37]:
from sklearn.metrics import accuracy_score, mutual_info_score

In [38]:
# Put the training target back next to the training features (column-wise)
# so feature/target pairs can be read from a single frame.
df_train_with_target = pd.concat((df_train, y_train), axis='columns')

In [39]:
# Categorical columns to score against the target, and the container that
# will map each of them to its mutual-information score.
categorical_features = [
    'industry',
    'location',
    'lead_source',
    'employment_status',
]
mi_scores = dict()

In [40]:
# Mutual information between each categorical feature and the target on the
# training split, stored rounded to two decimals.
target = df_train_with_target['converted']
for feature in categorical_features:
    mi = mutual_info_score(df_train_with_target[feature], target)
    mi_scores[feature] = round(mi, 2)
    print(f"Mutual info for {feature}: {mi_scores[feature]}")

Mutual info for industry: 0.02
Mutual info for location: 0.0
Mutual info for lead_source: 0.03
Mutual info for employment_status: 0.02


In [41]:
# Feature with the highest (rounded) score; on ties the first one inserted wins.
q3 = max(mi_scores.items(), key=lambda item: item[1])[0]
print(f"ANSWER Q3: {q3}")

ANSWER Q3: lead_source


### Question 4

In [42]:
from sklearn.feature_extraction import DictVectorizer

In [43]:
# Model inputs: the categorical features followed by the numeric columns,
# minus the 'converted' target that numerical_cols still contains.
features = [*categorical_features, *numerical_cols]
features.remove('converted')

In [44]:
# Turn every row of the selected features into a {column: value} record,
# the input format passed to DictVectorizer for one-hot encoding.
train_dict, val_dict = (
    part[features].to_dict(orient='records') for part in (df_train, df_val)
)

In [46]:
# Learn the feature mapping from the training records only, then encode both
# splits with that same mapping.
dv = DictVectorizer()
dv.fit(train_dict)
X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

# y_train / y_val were already extracted earlier, before 'converted' was
# dropped from the feature frames, so they are not recomputed here.

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
# Baseline classifier for Question 4. The fit call is left as the last
# expression so the cell still displays the fitted estimator.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [49]:
# Validation accuracy of the baseline model, rounded to two decimals.
y_pred = model.predict(X_val)
accuracy = round(accuracy_score(y_val, y_pred), 2)
for label in ("Validation accuracy", "ANSWER Q4"):
    print(f"{label}: {accuracy}")

Validation accuracy: 0.74
ANSWER Q4: 0.74


### Question 5

In [50]:
# Unrounded validation accuracy of the full-feature model; this is the
# reference value for the feature-elimination comparison.
val_predictions = model.predict(X_val)
original_accuracy = accuracy_score(y_val, val_predictions)
print(f"Original model accuracy: {original_accuracy:.4f}")

Original model accuracy: 0.7432


In [51]:
# Maps each removed feature to the absolute change in validation accuracy.
differences = dict()

In [52]:
def _accuracy_without(dropped):
    """Retrain the baseline model without one feature and score it.

    The vectorizer and the classifier are both rebuilt from scratch on the
    training split using every entry of ``features`` except ``dropped``.
    Returns the accuracy of that model on the validation split.
    """
    kept = [name for name in features if name != dropped]

    vectorizer = DictVectorizer()
    X_kept_train = vectorizer.fit_transform(df_train[kept].to_dict(orient='records'))
    X_kept_val = vectorizer.transform(df_val[kept].to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    classifier.fit(X_kept_train, y_train)
    return accuracy_score(y_val, classifier.predict(X_kept_val))


# Leave one feature out at a time and record how far the validation accuracy
# moves (in absolute terms) from the full-feature model.
for feature in ['industry', 'employment_status', 'lead_score']:
    red_accuracy = _accuracy_without(feature)
    difference = abs(original_accuracy - red_accuracy)
    differences[feature] = difference
    print(f"Without '{feature}': accuracy = {red_accuracy:.4f}, difference = {difference:.4f}")

Without 'industry': accuracy = 0.7432, difference = 0.0000
Without 'employment_status': accuracy = 0.7466, difference = 0.0034
Without 'lead_score': accuracy = 0.7432, difference = 0.0000


In [53]:
# Feature whose removal changed the accuracy the least; on ties the first
# feature evaluated wins.
q5 = min(differences.items(), key=lambda item: item[1])[0]
print(f"ANSWER Q5: {q5}")

ANSWER Q5: industry


### Question 6

In [54]:
# Starting point for the regularization search: no accuracy seen, no C chosen.
best_accuracy, best_C = 0, None

In [55]:
# Refit the model once per regularization strength and record the validation
# accuracy of each fit.
accuracy_by_C = {}
for C in [0.01, 0.1, 1, 10, 100]:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train, y_train)

    accuracy_reg = accuracy_score(y_val, model_reg.predict(X_val))
    accuracy_by_C[C] = accuracy_reg
    print(f"C = {C}: accuracy = {accuracy_reg:.3f}")

# Select the highest accuracy, breaking ties in favour of the smallest C.
# Choosing from the collected scores keeps the result independent of the order
# in which the C values are tried, never compares C against None, and makes
# the cell give the same answer when it is re-run on its own.
best_C = min(accuracy_by_C, key=lambda c: (-accuracy_by_C[c], c))
best_accuracy = accuracy_by_C[best_C]


C = 0.01: accuracy = 0.743
C = 0.1: accuracy = 0.743
C = 1: accuracy = 0.743
C = 10: accuracy = 0.743
C = 100: accuracy = 0.743


In [56]:
# Report the regularization strength selected by the search above.
print(f"ANSWER Q6: {best_C}")

ANSWER Q6: 0.01
