This notebook does two things:
* Train the model
* Make predictions

In [1]:
import pandas as pd
import numpy as np
import sklearn
import pickle

In [2]:
# Record the library versions this notebook was run with, one per line,
# in requirements-file style (name==version).
print(
    f'pandas=={pd.__version__}',
    f'numpy=={np.__version__}',
    f'sklearn=={sklearn.__version__}',
    sep='\n',
)

pandas==2.2.2
numpy==1.26.4
sklearn==1.5.1


In [3]:
pip install uv

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [5]:
# Location of the lead-scoring CSV.
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

# Read the CSV, then normalise the headers: lower-case, spaces -> underscores.
df = pd.read_csv(data_url)
df.columns = pd.Index([header.lower().replace(' ', '_') for header in df.columns])

In [6]:
# Display the frame as loaded, with the normalised column names.
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [7]:
# Inspect the column dtypes; fill_null_values selects columns by dtype
# (object vs. int64/float64), so this shows which fill value each column gets.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [8]:
def fill_null_values(df, cat_fill_value, num_fill_value):
    """Return a copy of ``df`` with missing values replaced by constants.

    Columns are selected by dtype:

    * ``object`` columns are treated as categorical and filled with
      ``cat_fill_value``;
    * ``int64`` / ``float64`` columns are treated as numerical and filled
      with ``num_fill_value``.

    Columns of any other dtype are left untouched.

    The input frame is NOT modified: filling happens on a copy, so re-running
    the calling cell (or reusing the raw frame elsewhere) is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.
    cat_fill_value : object
        Replacement for missing values in ``object`` columns.
    num_fill_value : object
        Replacement for missing values in ``int64``/``float64`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    filled = df.copy()

    cat_columns = filled.select_dtypes(include=['object']).columns
    num_columns = filled.select_dtypes(include=['int64', 'float64']).columns

    # Fill NaNs for categorical columns with the provided value
    filled[cat_columns] = filled[cat_columns].fillna(cat_fill_value)

    # Fill NaNs for numerical columns with the provided value
    filled[num_columns] = filled[num_columns].fillna(num_fill_value)

    return filled

In [9]:
# Replace missing categorical values with the literal 'NA' and missing
# numerical values with 0.0, then display the result.
df = fill_null_values(
    df=df,
    cat_fill_value='NA',
    num_fill_value=0.0,
)
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [10]:
# Feature frame: every column except the target `converted`.
# `drop` returns a new frame, so `df` itself keeps the target column.
df_train = df.drop(columns=['converted'])

In [11]:
# Target vector: the `converted` column of the full frame.
y_train = df['converted']
y_train

0       1
1       0
2       1
3       0
4       1
       ..
1457    1
1458    1
1459    1
1460    1
1461    1
Name: converted, Length: 1462, dtype: int64

In [12]:
# Features fed to the model. The lists are hand-picked rather than derived
# from the column dtypes, so only these three columns are used for training.
categorical = [
    'lead_source',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
]

# Pipeline

In [13]:
from sklearn.pipeline import make_pipeline

In [14]:
# Bundle the vectorizer and the classifier into a single estimator so that
# one object can be fitted, pickled and used for prediction.
pipeline = make_pipeline(DictVectorizer(), LogisticRegression(solver='liblinear'))

In [15]:
# NOTE(review): `dv` is a standalone, unfitted vectorizer. No live code visible
# in this notebook uses it (the pipeline owns its own DictVectorizer); it is
# kept only so the name stays defined.
dv = DictVectorizer()

# One {feature_name: value} dict per row, restricted to the selected features.
train_dict = df.loc[:, categorical + numerical].to_dict(orient='records')

# Fit the whole pipeline (vectorizer, then logistic regression) in one call.
pipeline.fit(train_dict, y_train)

In [16]:
# Peek at the first training record to see the dict layout passed to the pipeline.
train_dict[0]

{'lead_source': 'paid_ads',
 'number_of_courses_viewed': 1,
 'annual_income': 79450.0}

# Save model in pickle

In [17]:
# Persist the fitted pipeline to disk ...
with open('model.bin', 'wb') as sink:
    pickle.dump(pipeline, sink)

# ... and read it straight back, so the cells below run against the
# deserialised object. Only unpickle files you trust: pickle.load can
# execute arbitrary code.
with open('model.bin', 'rb') as source:
    pipeline = pickle.load(source)

# Model for a customer

In [18]:
# Example lead to score. The pipeline was fitted on `lead_source`,
# `number_of_courses_viewed` and `annual_income` only; the two extra keys are
# presumably ignored by the vectorizer (TODO confirm).
customer = {
    'lead_source': 'paid_ads',
    'number_of_courses_viewed': 1,
    'annual_income': 79450.0,
    'interaction_count': 4,
    'lead_score': 0.94,
}

# Conversion probability for this lead: first (only) row, second column of
# predict_proba.
converted = pipeline.predict_proba(customer)[0, 1]

print('Prob of convert: ', converted)

# Decision rule: contact the lead when conversion is at least as likely as not.
print("send email with promo" if converted >= 0.5 else "don't do anything")

Prob of convert:  0.5950564440312289
send email with promo


In [None]:
"""
{
  "lead_source": "paid_ads",
  "number_of_courses_viewed": 1,
  "annual_income": 79450.0,
  "interaction_count": 4,
  "lead_score": 0.94
}


"""