# Dataset

In [404]:
import pandas as pd

%matplotlib inline
# Read the course dataset (path is relative to the notebook's working directory)
# and preview its first five rows.
csv_path = '../datasets/course.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [405]:
# Number of missing values in each column (isna is the canonical spelling of isnull)
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [406]:
# Relative frequency of each class in the target column
df['converted'].value_counts(normalize=True)


converted
1    0.619015
0    0.380985
Name: proportion, dtype: float64

Nearly 62% of users converted to paid customers, while 38% did not convert.

In [407]:
# Split the columns by dtype. Testing with is_numeric_dtype (rather than comparing
# against 'object') keeps text columns categorical even when pandas stores them with
# a dedicated string dtype instead of object.
# The target column 'converted' is numeric, so it is excluded from the feature list.
is_numeric = pd.api.types.is_numeric_dtype
categorical_columns = [col for col in df.columns if not is_numeric(df[col])]
numerical_columns = [col for col in df.columns if is_numeric(df[col]) and col != 'converted']
print("Categorical coulmns:",categorical_columns)
print("Numerical columns:", numerical_columns)

Categorical coulmns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [408]:
# Replace missing categorical values with the literal label 'NA'
categorical_filled = df[categorical_columns].fillna('NA')
df[categorical_columns] = categorical_filled
df[categorical_columns].head(5)

Unnamed: 0,lead_source,industry,employment_status,location
0,paid_ads,,unemployed,south_america
1,social_media,retail,employed,south_america
2,events,healthcare,unemployed,australia
3,paid_ads,retail,,australia
4,referral,education,self_employed,europe


In [409]:
# Replace missing numerical values with 0
numerical_filled = df[numerical_columns].fillna(0)
df[numerical_columns] = numerical_filled
df[numerical_columns].head(n=10)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.8
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62
5,1,59904.0,6,0.83
6,0,51283.0,2,0.57
7,5,62975.0,4,0.62
8,4,38648.0,2,0.86
9,3,59866.0,3,0.43


In [410]:
# Most frequent value of the industry column
df.industry.mode()

0    retail
Name: industry, dtype: object

The most frequent value (mode) of the `industry` column is `retail`.

# Feature importance for categorical columns

In [411]:
# Overall conversion rate, used as the reference point for every category
global_converted_mean = df['converted'].mean()

for c in categorical_columns:
    print(c)
    # Conversion rate and group size for each value of the current column
    group_stats = df.groupby(c)['converted'].agg(['mean', 'count'])
    # diff: absolute gap to the overall rate; risk: ratio to the overall rate
    df_group = group_stats.assign(
        diff=group_stats['mean'] - global_converted_mean,
        risk=group_stats['mean'] / global_converted_mean,
    )
    display(df_group)
    print()

lead_source


Unnamed: 0_level_0,mean,count,diff,risk
lead_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.671875,128,0.05286,1.085394
events,0.596,250,-0.023015,0.96282
organic_search,0.617021,282,-0.001994,0.996779
paid_ads,0.44697,264,-0.172045,0.722066
referral,0.807692,260,0.188677,1.304802
social_media,0.604317,278,-0.014699,0.976255



industry


Unnamed: 0_level_0,mean,count,diff,risk
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.559701,134,-0.059314,0.904181
education,0.748663,187,0.129648,1.209442
finance,0.595,200,-0.024015,0.961204
healthcare,0.604278,187,-0.014737,0.976193
manufacturing,0.666667,174,0.047652,1.07698
other,0.611111,198,-0.007904,0.987231
retail,0.586207,203,-0.032808,0.946999
technology,0.569832,179,-0.049183,0.920547



employment_status


Unnamed: 0_level_0,mean,count,diff,risk
employment_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.56,100,-0.059015,0.904663
employed,0.689024,328,0.070009,1.113098
self_employed,0.653409,352,0.034394,1.055563
student,0.652299,348,0.033284,1.053769
unemployed,0.497006,334,-0.122009,0.802898



location


Unnamed: 0_level_0,mean,count,diff,risk
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.698413,63,0.079398,1.128264
africa,0.601064,188,-0.017951,0.971
asia,0.620513,195,0.001498,1.00242
australia,0.605405,185,-0.01361,0.978014
europe,0.652778,216,0.033763,1.054543
middle_east,0.631313,198,0.012298,1.019867
north_america,0.595556,225,-0.023459,0.962102
south_america,0.598958,192,-0.020057,0.967599





In [412]:
from sklearn.metrics import mutual_info_score
def mutual_info_converted_score(series):
    """Return the mutual information between `series` and the `converted` column of `df`."""
    target = df['converted']
    return mutual_info_score(series, target)

# Score every categorical column against the target, highest score first
mi = df[categorical_columns].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)


lead_source          0.026574
employment_status    0.011070
industry             0.007267
location             0.001427
dtype: float64

`lead_source` is a more important feature than `location`: its mutual information with `converted` is much higher (0.027 vs. 0.001).

# Feature importance of numerical columns



In [413]:
# Correlation of each numerical feature with the target
df[numerical_columns].corrwith(df['converted'])

number_of_courses_viewed    0.435914
annual_income               0.053131
interaction_count           0.374573
lead_score                  0.193673
dtype: float64

All numerical features are positively correlated with `converted`: as a feature's value increases, the conversion rate tends to increase as well.

In [414]:
# Feature pairs whose mutual correlation we want to compare
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'lead_score')
]

# Correlation coefficient for each pair, keyed by the pair itself
correlations = {}
for pair in pairs:
    first, second = pair
    correlations[pair] = df[first].corr(df[second])

print("Correlations for pairs:")
for pair, corr_value in correlations.items():
    print(f"{pair}: {corr_value}")

# Rank by absolute value so a strong negative correlation counts as "big"
max_pair = max(correlations, key=lambda key: abs(correlations[key]))
print(f"\nPair with biggest correlation: {max_pair} with correlation {correlations[max_pair]}")

Correlations for pairs:
('interaction_count', 'lead_score'): 0.009888182496913077
('number_of_courses_viewed', 'lead_score'): -0.00487899835468127
('number_of_courses_viewed', 'interaction_count'): -0.023565222882888103
('annual_income', 'lead_score'): 0.015609546050138949

Pair with biggest correlation: ('number_of_courses_viewed', 'interaction_count') with correlation -0.023565222882888103


# Split the data

In [415]:
# Split the dataset into train, validation, and test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, then take 25% of the remainder for
# validation, which yields a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

print(*(len(part) for part in (df_train, df_val, df_test)))

876 293 293


In [416]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [417]:
# Pull the target out of each split: pop() returns the column and removes it from
# the frame, so the label cannot leak into the feature matrix.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [418]:
# Mutual information between each categorical feature and the target, computed on
# the training split only and rounded to 2 decimals.
# A comprehension is used so that no loop temporary overwrites the `mi` Series that
# was computed on the full dataset earlier in the notebook.
mi_scores = {
    col: round(mutual_info_score(df_train[col], y_train), 2)
    for col in categorical_columns
}
mi_scores

{'lead_source': 0.04,
 'industry': 0.01,
 'employment_status': 0.01,
 'location': 0.0}

The biggest mutual information score is for lead_source

# One hot encoding for categorical columns

In [419]:
from sklearn.feature_extraction import DictVectorizer

# Columns fed to the model: categorical ones are one-hot encoded by the vectorizer,
# numerical ones are passed through as-is.
feature_columns = categorical_columns + numerical_columns

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse it for validation
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Train model using Logistic regression

In [420]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Hyperparameters of the baseline logistic regression
logreg_params = {
    'solver': 'liblinear',
    'C': 1.0,
    'max_iter': 1000,
    'random_state': 42,
}
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [421]:
# Learned parameters: intercept (bias) first, then the per-feature weights
(model.intercept_, model.coef_)

(array([-0.06914728]),
 array([[-1.77843869e-05, -1.47154423e-02,  3.39095225e-02,
          2.66248432e-03,  1.15238518e-02, -1.02527697e-01,
         -2.48510995e-02,  4.93604222e-02, -2.01258344e-02,
         -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
         -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,
          5.12012528e-02,  2.01511698e-02, -1.20346284e-02,
         -1.16021521e-02, -1.15251880e-01,  7.95303436e-02,
         -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
         -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,
          5.58598769e-03, -3.33967159e-02, -2.52837052e-02,
          4.53752887e-01]]))

In [422]:
# predict_proba returns one column per class; column index 1 holds the
# probability of class 1 for every validation row.
val_probabilities = model.predict_proba(X_val)
y_pred_prob = val_probabilities[:, 1]
# Hard class labels produced by the model itself
y_pred = model.predict(X_val)


In [423]:
# Put probabilities, predicted labels and true labels side by side
df_pred = pd.DataFrame({
    'predicted_probability': y_pred_prob,
    'prediction': y_pred,
    'true_label': y_val,
})
# Flag the rows where the predicted label matches the true label
df_pred['correct'] = df_pred['prediction'].eq(df_pred['true_label'])
df_pred

Unnamed: 0,predicted_probability,prediction,true_label,correct
0,0.611922,1,0,False
1,0.799826,1,1,True
2,0.530213,1,0,False
3,0.471315,0,0,True
4,0.570661,1,0,False
...,...,...,...,...
288,0.419342,0,0,True
289,0.710539,1,1,True
290,0.418185,0,0,True
291,0.744835,1,1,True


In [424]:
# Validation accuracy of the baseline model, using 0.5 as the decision threshold
val_decisions = y_pred_prob >= 0.5
baseline_accuracy = accuracy_score(y_val, val_decisions)
round(baseline_accuracy, 2)


0.7

# Least useful feature using feature elimination

In [425]:
all_features = numerical_columns + categorical_columns
feature_elimination_results = {}

for feature_to_remove in all_features:
    # Every feature except the one currently being tested
    current_features = [name for name in all_features if name != feature_to_remove]

    # Encode both splits with a vectorizer fitted on the reduced training records
    dv_reduced = DictVectorizer(sparse=False)
    X_train_reduced = dv_reduced.fit_transform(
        df_train[current_features].to_dict(orient='records')
    )
    X_val_reduced = dv_reduced.transform(
        df_val[current_features].to_dict(orient='records')
    )

    # Fresh model with the same hyperparameters as the baseline
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Validation accuracy without the feature, thresholding class-1 probability at 0.5
    y_pred_reduced = model_reduced.predict_proba(X_val_reduced)[:, 1]
    accuracy_reduced = accuracy_score(y_val, y_pred_reduced >= 0.5)

    # diff = baseline accuracy - accuracy without the feature.
    # Close to zero: the feature contributes little.
    # Negative: the model scored better without the feature.
    diff = baseline_accuracy - accuracy_reduced

    feature_elimination_results[feature_to_remove] = dict(
        accuracy_without_feature=accuracy_reduced,
        diff_from_baseline=diff,
    )

    print(f"Removed: {feature_to_remove:20} | New Acc: {accuracy_reduced:.4f} | Diff: {diff:+.4f}")


Removed: number_of_courses_viewed | New Acc: 0.5563 | Diff: +0.1433
Removed: annual_income        | New Acc: 0.8532 | Diff: -0.1536
Removed: interaction_count    | New Acc: 0.5563 | Diff: +0.1433
Removed: lead_score           | New Acc: 0.7065 | Diff: -0.0068
Removed: lead_source          | New Acc: 0.7031 | Diff: -0.0034
Removed: industry             | New Acc: 0.6997 | Diff: +0.0000
Removed: employment_status    | New Acc: 0.6962 | Diff: +0.0034
Removed: location             | New Acc: 0.7099 | Diff: -0.0102


By eliminating industry there is no change in accuracy.

In [426]:
# Applied scaling to numerical features to see if it improves the model
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Standardise the numerical features in place. The scaler is fitted on the training
# split only and then applied unchanged to the validation split.
df_train[numerical_columns] = scaler.fit_transform(df_train[numerical_columns])
df_val[numerical_columns] = scaler.transform(df_val[numerical_columns])

# The encoded matrices do not depend on C, so build them once instead of
# re-converting the frames and re-fitting the vectorizer on every iteration.
train_dict = df_train[all_features].to_dict(orient='records')
val_dict = df_val[all_features].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Try several regularization strengths and report validation accuracy for each
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Class-1 probabilities, thresholded at 0.5
    y_pred = model.predict_proba(X_val)[:,1]
    accuracy = accuracy_score(y_val, (y_pred >= 0.5))

    print(f"C={c}  => Accuracy: {accuracy}, Rounded: {accuracy:.3f}")

C=0.01  => Accuracy: 0.8395904436860068, Rounded: 0.840
C=0.1  => Accuracy: 0.856655290102389, Rounded: 0.857
C=1  => Accuracy: 0.8532423208191127, Rounded: 0.853
C=10  => Accuracy: 0.8532423208191127, Rounded: 0.853
C=100  => Accuracy: 0.8532423208191127, Rounded: 0.853


The best validation accuracy (0.857) is achieved with C=0.1.