In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC

In [2]:
# Load the customer-churn dataset, then separate the four predictor columns
# from the 'Churn' target column.
data = pd.read_csv('../../datasets/exercises/customer-churn.csv')
X = data[['tenure', 'Contract', 'PaymentMethod', 'MonthlyCharges']]
y = data['Churn']

In [3]:
# Column groups that are routed to different preprocessing steps.
categorical_features = ['Contract', 'PaymentMethod']
numerical_features = ['tenure', 'MonthlyCharges']
# Backward-compatible alias for the original, misspelled variable name.
categorical_featues = categorical_features

# One-hot encode the categorical columns, dropping the first level of each
# feature so the encoded columns are not perfectly collinear.
categorical_transformer = Pipeline([
    ('onehotencoder', OneHotEncoder(drop='first')),
])

# Rescale the numeric columns with MinMaxScaler.
numerical_transformer = Pipeline([
    ('scaler', MinMaxScaler()),
])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer([
    ('categorical_transformer', categorical_transformer, categorical_features),
    ('numerical_transformer', numerical_transformer, numerical_features),
])

# Benchmark model: preprocessing followed by an SVC that keeps its default
# C and kernel (only gamma and the random seed are set explicitly).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(random_state=1909, gamma='auto')),
])

In [4]:
# Scorer that reports recall for the 'Yes' label, i.e. the share of rows
# labelled 'Yes' that the model also predicts as 'Yes'.
custom_scorer = make_scorer(score_func=recall_score, pos_label='Yes')

In [5]:
# 10-fold cross-validation of the benchmark pipeline; train scores are kept
# so the train/test recall gap can be inspected.
res_cv = cross_validate(
    pipeline,
    X,
    y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)
# Average the per-fold recalls and express them as percentages.
res_recall_tr = res_cv['train_score'].mean() * 100
res_recall_te = res_cv['test_score'].mean() * 100
f'Average Recall on Training and Test Sets (Benchmark): {res_recall_tr:.2f}%/{res_recall_te:.2f}%'

'Average Recall on Training and Test Sets (Benchmark): 88.39%/88.39%'

In [6]:
# Search space for the SVC step of the pipeline (hence the 'model__' prefix).
# randint(1, 100) samples integers from 1 to 99: the upper bound is exclusive.
C = randint(1, 100)
degree = randint(1, 100)
# Candidate kernels and the single gamma setting are sampled from plain lists.
kernel = ['rbf', 'poly']
gamma = ['auto']
param_distributions = dict(
    model__C=C,
    model__kernel=kernel,
    model__degree=degree,
    model__gamma=gamma,
)

In [7]:
# Randomized search over the SVC hyper-parameters: 5 sampled candidates, each
# scored by 10-fold cross-validated recall, evaluated in parallel on all cores.
# The former `iid=False` argument is intentionally not passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError), and
# unweighted averaging across folds is now the only behaviour.
rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring=custom_scorer,
    n_jobs=-1,
    cv=10,
    random_state=1909,
)
rs = rs.fit(X, y)
f'Optimal parameters: {rs.best_params_}'

"Optimal parameters: {'model__C': 57, 'model__degree': 61, 'model__gamma': 'auto', 'model__kernel': 'poly'}"

In [8]:
# Rebuild the pipeline with the hyper-parameters selected by the randomized
# search. They are taken from `rs.best_params_` rather than copied by hand so
# the final model always matches the search result; the previous hand-copied
# values used degree=51 although the search reported degree=61.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(random_state=1909)),
])
pipeline = pipeline.set_params(**rs.best_params_)

# Fit the tuned pipeline on the full dataset.
pipeline = pipeline.fit(X, y)

In [9]:
# Single hypothetical customer, using the same four columns as the training
# features, to try the fitted pipeline on.
prediction_set = pd.DataFrame(
    {
        'tenure': [20],
        'Contract': ['One year'],
        'PaymentMethod': ['Credit card (automatic)'],
        'MonthlyCharges': [50],
    }
)

In [10]:
# Predict the churn label for the single-row `prediction_set` frame with the
# fitted pipeline; the returned array is displayed as the cell output.
pipeline.predict(prediction_set)

array(['Yes'], dtype=object)

In [11]:
# 10-fold cross-validation of the current `pipeline`, scored with the same
# recall scorer; train scores are kept for a train/test comparison.
res_cv = cross_validate(
    pipeline,
    X,
    y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)
# Average the per-fold recalls and express them as percentages.
res_recall_tr = res_cv['train_score'].mean() * 100
res_recall_te = res_cv['test_score'].mean() * 100
f'Average Recall on Training and Test Sets: {res_recall_tr:.2f}%/{res_recall_te:.2f}%'

'Average Recall on Training and Test Sets: 100.00%/100.00%'