In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [10]:
# Load the churn dataset; the path is relative, so the CSV must sit in the working directory.
df = pd.read_csv('Telco-Customer-Churn.csv')

### Model

In [21]:
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every string column except the ID.
str_cols = list(df.dtypes[df.dtypes == 'object'].keys())
str_cols.remove('customerid')
for col in str_cols:
    df[col] = df[col].str.lower().str.replace(" ", "_")

# Encode the target as 0/1. Guarded so that re-running this cell is harmless:
# once churn is already integer, comparing it with 'yes' would zero out every label.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

# errors='coerce' turns non-numeric totalcharges entries into NaN, which are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Sanity check: number of missing totalcharges values left after the fill.
df['totalcharges'].isna().sum()

0

In [17]:
# Feature columns fed to the model, split by how they are typed in the frame.
numeric_cols = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

categorical_cols = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone and internet subscription
    'phoneservice', 'multiplelines', 'internetservice',
    # add-on services
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [117]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% for test, then 25% of the remainder for validation.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Give each split a fresh 0..n-1 index (df_train_full keeps its original index).
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Display the three split sizes.
tuple(len(part) for part in (df_train, df_val, df_test))

(4225, 1409, 1409)

In [119]:
# Fit the vectorizer here, on the training split only, so this cell runs on a fresh
# kernel (no `dv` exists yet at this point) and the encoding is learned without
# looking at validation or test rows.
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(df_train[categorical_cols + numeric_cols].to_dict(orient='records'))
y_train = df_train.churn

# Validation and test are only transformed with the already-fitted vectorizer.
X_val = dv.transform(df_val[categorical_cols + numeric_cols].to_dict(orient='records'))
y_val = df_val.churn

X_test = dv.transform(df_test[categorical_cols + numeric_cols].to_dict(orient='records'))
y_test = df_test.churn

In [120]:
def train(df, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on ``df``.

    Args:
        df: DataFrame containing the columns listed in ``categorical_cols``
            and ``numeric_cols``.
        y_train: Target labels for the rows of ``df``.
        C: Value forwarded to ``LogisticRegression(C=...)``.

    Returns:
        Tuple ``(dv, model)`` with the fitted vectorizer and the fitted model.
    """
    feature_cols = categorical_cols + numeric_cols
    records = df[feature_cols].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    features = vectorizer.fit_transform(records)

    classifier = LogisticRegression(max_iter=1000, C=C)
    classifier.fit(features, y_train)

    return vectorizer, classifier

In [123]:
def predict(df: pd.DataFrame, dv: DictVectorizer, model: LogisticRegression):
    """Score every row of ``df`` with an already fitted vectorizer/model pair.

    Args:
        df: DataFrame containing the columns listed in ``categorical_cols``
            and ``numeric_cols``.
        dv: Fitted vectorizer used to turn row dicts into a feature matrix.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        Column 1 of ``model.predict_proba`` for each row of ``df``.
    """
    feature_cols = categorical_cols + numeric_cols
    records = df[feature_cols].to_dict(orient='records')

    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [125]:
# Value handed to train(), which forwards it to LogisticRegression(C=...).
C = 1

In [127]:
# Final model: fit on the full training portion (train + validation rows).
dv, model = train(df_train_full, df_train_full['churn'], C=C)

# Score the held-out test split.
y_pred = predict(df_test, dv=dv, model=model)

# ROC AUC of the test-set scores against the true labels.
roc_auc_score(y_true=y_test, y_score=y_pred)

0.862495730123076

### Save the model

In [130]:
import pickle

In [132]:
# File name embeds the C value used for training, so different settings do not overwrite each other.
output_file = f"model_C={C}.bin"
output_file

'model_C=1.bin'

In [134]:
# Manual open/close variant. The try/finally guarantees the handle is closed
# even when pickling raises, which the bare open()/close() sequence did not.
f_out = open(output_file, 'wb')
try:
    # Vectorizer and model are stored together as one tuple.
    pickle.dump((dv, model), f_out)
finally:
    f_out.close()

In [139]:
# Context-manager variant: the file is closed automatically when the block exits.
with open(output_file, mode='wb') as model_file:
    pickle.dump((dv, model), model_file)

### Load the model

In [1]:
import pickle

In [3]:
# Restore the (vectorizer, model) tuple written by the save step.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only unpickle model files that come from a trusted source.
with open('model_C=1.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [5]:
# Display the unpickled model object.
model

In [7]:
# Display the unpickled vectorizer object.
dv

In [125]:
from random import randint
import json

# random.randint includes BOTH endpoints, so the upper bound has to be len(df) - 1;
# with len(df) the draw can land one past the last row and df.iloc raises IndexError.
ci = randint(0, len(df) - 1)

# Pull one row as a plain feature dict, plus its true label for comparison.
customer = df.iloc[ci][numeric_cols + categorical_cols].to_dict()
y_actual = df.iloc[ci].churn
print(json.dumps(customer))
print()
print(y_actual)

{"tenure": 8, "monthlycharges": 81.25, "totalcharges": 585.95, "gender": "male", "seniorcitizen": 1, "partner": "no", "dependents": "no", "phoneservice": "yes", "multiplelines": "yes", "internetservice": "fiber_optic", "onlinesecurity": "no", "onlinebackup": "no", "deviceprotection": "yes", "techsupport": "no", "streamingtv": "no", "streamingmovies": "no", "contract": "month-to-month", "paperlessbilling": "yes", "paymentmethod": "electronic_check"}

1


In [127]:
# transform is given a list of record dicts, so the single customer is wrapped in a list.
X = dv.transform([customer])

In [129]:
# Column 1 of predict_proba for the single customer row (the churn == 1 class).
churn_proba = model.predict_proba(X)[0, 1]

# Hard decision at a 0.5 cut-off. The earlier `< 1` comparison returned 1 for
# every probability below 1.0, i.e. it flagged practically every customer as churning.
y_pred = (churn_proba >= 0.5).astype(int)

churn_proba, y_pred

(0.7144085079834862, 1)