This is a starter notebook for an updated module 5 of ML Zoomcamp

The code is based on modules 3 and 4. We use the same dataset: [Telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

In [39]:
import pandas as pd
import numpy as np
import sklearn
import pickle

In [19]:
# Print the installed version of each core library in `name==version` form
for name, module in (('pandas', pd), ('numpy', np), ('sklearn', sklearn)):
    print(f'{name}=={module.__version__}')

pandas==2.2.2
numpy==1.26.4
sklearn==1.5.1


In [20]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [21]:
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw churn CSV directly from the URL
df = pd.read_csv(data_url)

# Column names: lower-case, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column
categorical_columns = [col for col, dtype in df.dtypes.items() if dtype == 'object']
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Values that cannot be parsed as numbers become NaN (errors='coerce') and are then set to 0
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 where churn is 'yes', 0 otherwise
df['churn'] = (df['churn'] == 'yes').astype(int)

In [22]:
# Target vector: the 0/1 churn column of the full dataset (no train/validation split is made here)
y_train = df['churn']

In [23]:
# Numeric feature columns
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

# Categorical feature columns.
# Note: seniorcitizen holds 0/1 integers in the data while the other columns hold strings,
# so DictVectorizer passes it through as a number instead of one-hot encoding it.
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines',
    'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]

## Model

In [24]:
# One dictionary per row, restricted to the selected feature columns
train_dict = df[[*categorical, *numerical]].to_dict(orient='records')

# Vectorizer that will encode those dictionaries as a feature matrix (fitted in a later cell)
dv = DictVectorizer()

# Show the first three records
train_dict[:3]

[{'gender': 'female',
  'seniorcitizen': 0,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'no',
  'multiplelines': 'no_phone_service',
  'internetservice': 'dsl',
  'onlinesecurity': 'no',
  'onlinebackup': 'yes',
  'deviceprotection': 'no',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'month-to-month',
  'paperlessbilling': 'yes',
  'paymentmethod': 'electronic_check',
  'tenure': 1,
  'monthlycharges': 29.85,
  'totalcharges': 29.85},
 {'gender': 'male',
  'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'one_year',
  'paperlessbilling': 'no',
  'paymentmethod': 'mailed_check',
  'tenure': 34,
  'monthlycharges': 56.95,
  'totalcharges': 1889.5},
 {'gender': 'male',
  'seniorc

In [25]:
# Fit the DictVectorizer on the training records and encode them as a feature matrix
X_train = dv.fit_transform(train_dict)

# Logistic regression with the liblinear solver, trained on the encoded records
model = LogisticRegression(solver='liblinear')
model.fit(X=X_train, y=y_train)

In [26]:
# Dense view of the first ten rows of the encoded feature matrix
X_train[:10, :].todense()

matrix([[1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 2.98500e+01, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 2.98500e+01],
        [0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
         0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 5.69500e+01, 1.00000e+00,
         0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
         0.00000e+00, 0.00000e+00, 1.

## Save model with pickle

In [43]:
# Store the fitted vectorizer and model together as a single (dv, model) tuple
with open('model.bin', 'wb') as f_out:
    pickle.Pickler(f_out).dump((dv, model))

In [44]:
# Read the (dv, model) tuple back from model.bin, rebinding both names.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [45]:
# List the working directory (long format, human-readable sizes) to check that model.bin was written
!ls -lh

total 340K
-rw-r--r-- 1 rlz-98 rlz-98  337 Oct 25 21:56 Dockerfile
-rw-r--r-- 1 rlz-98 rlz-98  483 Oct 25 21:56 fly.toml
-rw-r--r-- 1 rlz-98 rlz-98 2.5K Oct 27 13:33 model.bin
-rw-r--r-- 1 rlz-98 rlz-98  197 Oct 25 21:56 ping.py
-rw-r--r-- 1 rlz-98 rlz-98  730 Oct 25 21:56 predict_old.py
-rw-r--r-- 1 rlz-98 rlz-98 1.8K Oct 25 21:56 predict.py
-rw-r--r-- 1 rlz-98 rlz-98  295 Oct 25 21:56 pyproject.toml
-rw-r--r-- 1 rlz-98 rlz-98  16K Oct 25 21:56 README.md
-rw-r--r-- 1 rlz-98 rlz-98  37K Oct 27 13:32 starter.ipynb
-rw-r--r-- 1 rlz-98 rlz-98  864 Oct 25 21:56 test.py
-rw-r--r-- 1 rlz-98 rlz-98 2.0K Oct 25 21:56 train.py
-rw-r--r-- 1 rlz-98 rlz-98  57K Oct 25 21:56 uv.lock
-rw-r--r-- 1 rlz-98 rlz-98  81K Oct 25 21:56 workshop-uv-fastapi.ipynb


# Model for a customer

In [46]:
# Hand-written feature record for a single customer
customer = dict(
    gender='male',
    seniorcitizen=0,
    partner='no',
    dependents='yes',
    phoneservice='no',
    multiplelines='no_phone_service',
    internetservice='dsl',
    onlinesecurity='no',
    onlinebackup='yes',
    deviceprotection='no',
    techsupport='no',
    streamingtv='no',
    streamingmovies='no',
    contract='month-to-month',
    paperlessbilling='yes',
    paymentmethod='electronic_check',
    tenure=6,  # 6 months
    monthlycharges=29.85,
    totalcharges=129.85,
)

# Encode the record with the already-fitted vectorizer
X = dv.transform(customer)

# Column 1 of predict_proba is the probability of the positive class (churn == 1);
# the recorded run gave roughly 0.54 for this customer
churn = model.predict_proba(X)[0, 1]
churn

0.5415539262282826

In [48]:
# Decide on an action: a churn probability of 0.5 or more triggers the promo email
message = "send email with promo" if churn >= 0.5 else "don't do anything"
print(message)

send email with promo


# Pipeline 

In [49]:
from sklearn.pipeline import make_pipeline

In [50]:
# Single estimator chaining both steps: records are encoded by DictVectorizer,
# and the encoded matrix is passed on to LogisticRegression
pipeline = make_pipeline(DictVectorizer(), LogisticRegression(solver='liblinear'))

In [51]:
# Convert the selected feature columns to a list of per-row dictionaries
train_dict = df[categorical + numerical].to_dict(orient='records')

# The pipeline vectorizes the dictionaries and trains the logistic regression in a
# single call, so separate dv.fit_transform(...) and model.fit(...) steps are not
# needed in this cell. (A stray paste here had produced a syntax error.)
pipeline.fit(train_dict, y_train)

# References
* Workshop: How to Deploy Machine Learning Models with FastAPI, Docker, and Fly.io
    * Video: https://www.youtube.com/watch?v=jzGzw98Eikk
    * Github: https://github.com/alexeygrigorev/workshops/tree/main/mlzoomcamp-fastapi-uv
* Python - Pickle: https://www.geeksforgeeks.org/python/understanding-python-pickling-example/