This is a starter notebook for an updated module 5 of ML Zoomcamp.

The code is based on modules 3 and 4. We use the same dataset: [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

In [39]:
import pandas as pd
import numpy as np
import sklearn
import pickle

In [19]:
# Record the library versions this notebook was run with
for name, module in (('pandas', pd), ('numpy', np), ('sklearn', sklearn)):
    print(f'{name}=={module.__version__}')

pandas==2.2.2
numpy==1.26.4
sklearn==1.5.1


In [20]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [21]:
# Telco customer churn CSV
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

df = pd.read_csv(data_url)

# Normalize column names: lowercase, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalization to the values of every string (object) column
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# Entries of totalcharges that cannot be parsed as numbers become NaN, then 0
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 where churn == 'yes', 0 otherwise
df['churn'] = (df['churn'] == 'yes').astype(int)

In [54]:
# Preview the cleaned frame (the rendered output is truncated to the first and last rows)
df

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.50,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.30,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-resvb,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,yes,mailed_check,84.80,1990.50,0
7039,2234-xaduh,female,0,yes,yes,72,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),103.20,7362.90,0
7040,4801-jzazl,female,0,yes,yes,11,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic_check,29.60,346.45,0
7041,8361-ltmkd,male,1,yes,no,4,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,mailed_check,74.40,306.60,1


In [22]:
# Target: the 0/1 churn flag for every row of df
y_train = df['churn']

In [55]:
# Inspect the target series (1 = churn was 'yes', 0 = otherwise)
y_train

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int64

In [23]:
# Numeric features, passed to the model as-is
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

# Categorical features, grouped by topic (the order is unchanged)
categorical = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone
    'phoneservice', 'multiplelines',
    # internet and add-on services
    'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

## Model

In [24]:
dv = DictVectorizer()

# One dictionary per row of df, restricted to the model's feature columns
feature_columns = categorical + numerical
train_dict = df[feature_columns].to_dict(orient='records')
train_dict[:3]

[{'gender': 'female',
  'seniorcitizen': 0,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'no',
  'multiplelines': 'no_phone_service',
  'internetservice': 'dsl',
  'onlinesecurity': 'no',
  'onlinebackup': 'yes',
  'deviceprotection': 'no',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'month-to-month',
  'paperlessbilling': 'yes',
  'paymentmethod': 'electronic_check',
  'tenure': 1,
  'monthlycharges': 29.85,
  'totalcharges': 29.85},
 {'gender': 'male',
  'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'one_year',
  'paperlessbilling': 'no',
  'paymentmethod': 'mailed_check',
  'tenure': 34,
  'monthlycharges': 56.95,
  'totalcharges': 1889.5},
 {'gender': 'male',
  'seniorc

In [25]:
# DictVectorizer learns the feature mapping from the row dictionaries and
# encodes them as the feature matrix used for training
dv.fit(train_dict)
X_train = dv.transform(train_dict)

# Logistic regression trained on the encoded features
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [26]:
# First 10 rows of the feature matrix, converted to a dense matrix for display
X_train[:10].todense()

matrix([[1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 2.98500e+01, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
         0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 2.98500e+01],
        [0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
         0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
         1.00000e+00, 0.00000e+00, 0.00000e+00, 5.69500e+01, 1.00000e+00,
         0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
         0.00000e+00, 0.00000e+00, 1.

# Pipeline 

In [49]:
from sklearn.pipeline import make_pipeline

In [50]:
# Handling two separate objects (`dv` and `model`) is inconvenient, so both
# steps are combined into one pipeline: vectorize the dictionaries, then
# apply logistic regression.
pipeline = make_pipeline(DictVectorizer(), LogisticRegression(solver='liblinear'))

In [56]:
# Convert the feature columns of df to a list of per-row dictionaries
train_dict = df[categorical + numerical].to_dict(orient='records')

# The pipeline runs both steps in one call: its DictVectorizer encodes the
# dictionaries and its LogisticRegression is trained on the result.
# The standalone `dv` and `model` were already fitted earlier in the notebook,
# so they are not re-fitted here.
pipeline.fit(train_dict, y_train)

# Save model in pickle

In [52]:
# Serialize the fitted pipeline (vectorizer + model) into a single file
with open('model.bin', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Read it back to confirm the saved artifact is usable.
# NOTE: only unpickle files from a trusted source; pickle.load can execute arbitrary code.
with open('model.bin', 'rb') as model_file:
    pipeline = pickle.load(model_file)

# Model for a customer

In [57]:
# Hand-written example customer, using the same feature keys as the training dictionaries
customer = {
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'yes',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 6,  # months
    'monthlycharges': 29.85,
    'totalcharges': 129.85,
}

# Score through the pipeline: it vectorizes the dictionary and applies the
# model in one call, so no separately computed `X` is needed (the previous
# version referenced an `X` that was never defined in this notebook).
# Column 1 of predict_proba is the probability of the positive class (churn == 1).
churn = pipeline.predict_proba([customer])[0, 1]

print(churn)

# Customers with a churn probability of at least 50% get a promo email
if churn >= 0.5:
    print("send email with promo")
else:
    print("don't do anything")

0.5415539262282826
send email with promo


# Turning the notebook into a script

We can now turn this notebook into a training script:

```bash
jupyter nbconvert --to=script workshop-uv-fastapi.ipynb
mv workshop-uv-fastapi.py train.py
```

# References
* Workshop: How to Deploy Machine Learning Models with FastAPI, Docker, and Fly.io
    * Video: https://www.youtube.com/watch?v=jzGzw98Eikk
    * Github: https://github.com/alexeygrigorev/workshops/tree/main/mlzoomcamp-fastapi-uv
* Python - Pickle: https://www.geeksforgeeks.org/python/understanding-python-pickling-example/