In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import matplotlib as plt 

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv


--2025-10-13 00:32:42--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 00:32:43 (1.03 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [2]:
df = pd.read_csv('course_lead_scoring.csv')

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

# Data preparation
- Check if the missing values are presented in the features.
- If there are missing values:
    - For caterogiral features, replace them with 'NA'
    - For numerical features, replace with with 0.0


In [6]:
caterogiral = ['lead_source', 'industry','employment_status','location'] 
numerical = ['number_of_courses_viewed','annual_income','interaction_count','lead_score','converted']

In [7]:
df[caterogiral].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [8]:
df[caterogiral] = df[caterogiral].fillna('NA')

In [9]:
df[caterogiral].isnull().sum()

lead_source          0
industry             0
employment_status    0
location             0
dtype: int64

In [10]:
df[numerical].isnull().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [11]:
df[numerical] = df[numerical].fillna(0)
df[numerical].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

## Question 1
- What is the most frequent observation (mode) for the column industry?

   -  NA
   -  technology
   -  healthcare
   -  retail

In [12]:
df['industry'].mode()[0]

'retail'

## Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

    - What are the two features that have the biggest correlation?
    
        - interaction_count and lead_score
        -  number_of_courses_viewed and lead_score
        - number_of_courses_viewed and interaction_count
        - annual_income and interaction_count
        - Only consider the pairs above when answering this question.

Only consider the pairs above when answering this question.

    - Split the data
    - Split your data in train/val/test sets with 60%/20%/20% distribution.
    - Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
    - Make sure that the target value y is not in your dataframe.



In [13]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [14]:
from sklearn.model_selection import train_test_split


In [15]:
df_full_trian , df_test = train_test_split(df,test_size=0.2, random_state=42)
df_train , df_val = train_test_split(df_full_trian,test_size=0.25, random_state=42)
len(df_train),len(df_test),len(df_val)

(876, 293, 293)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [18]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [19]:

df_full_trian = df_full_trian.reset_index(drop=True)

In [20]:
df_full_trian.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [21]:
global_score = df_full_trian.converted.mean()
global_score

np.float64(0.6073567151411463)

In [22]:
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

In [23]:
df_full_trian[numerical].corrwith(df_full_trian.converted)

number_of_courses_viewed    0.442068
annual_income               0.029612
interaction_count           0.378482
lead_score                  0.225641
converted                   1.000000
dtype: float64

Ans : number_of_courses_viewed and interaction_count have the biggest correlation

## Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

   - industry
   -  location
   - lead_source
   - employment_status

In [54]:
from sklearn.metrics import mutual_info_score

In [57]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_trian.converted)

In [60]:
mi_score = df_full_trian[caterogiral].apply(mutual_info_churn_score)
round(mi_score.sort_values(ascending=False),2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

Ans:lead_source variables has the biggest mutual information score

## Question 4
    Now let's train a logistic regression.
    Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
    Fit the model on the training dataset.
    To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
    What accuracy did you get?

        - 0.64
        - 0.74
        - 0.84
        - 0.94

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer 

In [64]:
categorical = ['lead_source', 'industry','employment_status','location'] 
numerical = ['number_of_courses_viewed','annual_income','interaction_count','lead_score']
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [66]:
dv =DictVectorizer(sparse=False)

In [67]:
x_train = dv.fit_transform(train_dicts)
x_val = dv.transform(val_dicts)

In [68]:
def sigmoid(z): 
    return 1 / (1 + np.exp(-z))

In [79]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [59]:

df = pd.read_csv('course_lead_scoring.csv')

cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna("NA")

num_cols = df.select_dtypes(exclude=['object']).columns
df[num_cols] = df[num_cols].fillna(0)


y = df['converted']
X = df.drop(columns=['converted'])

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), X_train.select_dtypes(exclude=['object']).columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), X_train.select_dtypes(include=['object']).columns)
])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_val_pred), 2)
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.84


### Q4 ans: Accuracy is 0.84

## Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model using the same features and parameters as in Q4 (without rounding).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

    - 'industry'
    - 'employment_status'
    - 'lead_score'

In [60]:

df = pd.read_csv('course_lead_scoring.csv')
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna("NA")
num_cols = df.select_dtypes(exclude=['object']).columns
df[num_cols] = df[num_cols].fillna(0)
y = df['converted']
X = df.drop(columns=['converted'])

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# --- Step 3: Define preprocessing ---
def build_pipeline(df):
    preprocessor = ColumnTransformer([
    ('num', StandardScaler(), df.select_dtypes(exclude=['object']).columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), df.select_dtypes(include=['object']).columns)
])

# --- Step 6: Logistic Regression model ---
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])
    return model

# --- Step 4: Train base model ---
base_model = build_pipeline(X_train)
base_model.fit(X_train, y_train)
y_val_pred = base_model.predict(X_val)
base_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Base model accuracy: {base_accuracy:.4f}")


Base model accuracy: 0.8356


In [49]:
X_train_t = X_train.copy()
X_val_t = X_val.copy()
y_train

624     0
1088    1
1196    1
6       0
1327    1
       ..
973     0
968     0
608     1
364     0
226     1
Name: converted, Length: 877, dtype: int64

In [54]:
accuracy_differences = {}


In [55]:
cols_list = ['industry', 'number_of_courses_viewed', 'annual_income','employment_status', 'location', 'interaction_count', 'lead_score']
X_train_reduced = X_train_t[cols_list]
X_val_reduced = X_val_t[cols_list]
model = build_pipeline(X_train_reduced)
model.fit(X_train_reduced, y_train)
y_val_pred = model.predict(X_val_reduced)
new_acc = accuracy_score(y_val, y_val_pred)
accuracy_differences['lead_source'] = base_accuracy - new_acc
accuracy_differences

{'lead_source': 0.0273972602739726}

In [56]:
for col in X_train.columns: 
    X_train_reduced = X_train.drop(columns=[col],errors='ignore')
    X_val_reduced = X_val.drop(columns=[col],errors='ignore')
    model = build_pipeline(X_train_reduced)
    model.fit(X_train_reduced, y_train)
    y_val_pred = model.predict(X_val_reduced)
    new_acc = accuracy_score(y_val, y_val_pred)
    accuracy_differences[col] = base_accuracy - new_acc
accuracy_differences

{'lead_source': 0.0273972602739726,
 'industry': 0.020547945205479423,
 'number_of_courses_viewed': 0.1095890410958904,
 'annual_income': -0.003424657534246589,
 'employment_status': 0.0273972602739726,
 'location': 0.003424657534246589,
 'interaction_count': 0.06506849315068497,
 'lead_score': 0.023972602739726012}

In [57]:
sorted_dict = dict(sorted(accuracy_differences.items(), key=lambda item: item[1]))
print(sorted_dict)

{'annual_income': -0.003424657534246589, 'location': 0.003424657534246589, 'industry': 0.020547945205479423, 'lead_score': 0.023972602739726012, 'lead_source': 0.0273972602739726, 'employment_status': 0.0273972602739726, 'interaction_count': 0.06506849315068497, 'number_of_courses_viewed': 0.1095890410958904}


### Q5 Ans : industry has smallest differece among the  'industry' 'employment_status' 'lead_score'

## Question 6
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?

    - 0.01
    - 0.1
    - 1
    - 10
    - 100


In [65]:
def build_pipeline(df,c):
    preprocessor = ColumnTransformer([
    ('num', StandardScaler(), df.select_dtypes(exclude=['object']).columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), df.select_dtypes(include=['object']).columns)
])

# --- Step 6: Logistic Regression model ---
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42))
    ])
    return model

In [66]:
c_list = [0.01, 0.1, 1, 10, 100]

In [68]:
for c_val in c_list: 
    base_model = build_pipeline(X_train,c_val)
    base_model.fit(X_train, y_train)
    y_val_pred = base_model.predict(X_val)
    base_accuracy = accuracy_score(y_val, y_val_pred)
    print(f" model accuracy for {c_val} : {base_accuracy:.4f}")
    

 model accuracy for 0.01 : 0.8048
 model accuracy for 0.1 : 0.8151
 model accuracy for 1 : 0.8356
 model accuracy for 10 : 0.8425
 model accuracy for 100 : 0.8459


### Q6 Ans : C -> 100 leads to the best accuracy, 0.8459, on the validation set