In [1]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from acquire import Acquire
from prepare import Prepare
import pandas as pd
# Show up to 50 columns when a DataFrame is rendered, so wide frames are not truncated.
pd.set_option("display.max_columns", 50)

#### Note: GridSearchCV is used here because the data set is small enough to check every possible parameter combination. With a sufficiently large data set, RandomizedSearchCV is preferred, because its n_iter argument controls how many combinations are checked instead of trying all of them as GridSearchCV does.

In [2]:
# Acquire and Prepare are the project-local helper classes imported from the
# acquire / prepare modules at the top of the notebook.
a = Acquire()
p = Prepare()
# Load the Telco data set (presumably the raw customer churn table; confirm in acquire).
telco = a.get_telco_data()
# prep_telco returns three objects, unpacked here as the train / validation / test splits.
# NOTE(review): modeling=True appears to yield fully numeric-encoded columns
# (see the preview below); confirm against prepare.
train, val, test = p.prep_telco(telco, modeling=True)
# Preview the first rows of the training split.
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,no_phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,steaming_movies
0,1.0,0.0,0.0,1.0,32.0,1.0,0.0,20.5,696.8,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,0.0,55.0,1.0,1.0,113.6,6292.7,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,25.0,1.0,0.0,25.5,630.6,0.0,2.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,12.0,1.0,1.0,98.1,1060.2,1.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,63.0,1.0,0.0,102.6,6296.75,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [29]:
# train.info()

In [4]:
# Separate the "churn" target from the predictor columns of the training split.
y_train = train["churn"]
X_train = train.drop(columns="churn")
# Sanity-check the dimensions: feature matrix first, then target, one shape per line.
print(X_train.shape, y_train.shape, sep="\n")

(5088, 20)
(5088,)


In [14]:
def grid_search(X, y, model, params_dic, **search_kwargs):
    """Tune ``model`` with an exhaustive, cross-validated grid search.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Target values passed to ``GridSearchCV.fit``.
    model : estimator
        Unfitted scikit-learn estimator to tune.
    params_dic : dict or list of dict
        Parameter grid; every combination is evaluated.
    **search_kwargs
        Optional keyword arguments forwarded unchanged to ``GridSearchCV``
        (for example ``cv``, ``scoring`` or ``n_jobs``). When omitted, the
        search uses GridSearchCV's defaults, exactly as before.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_, best_params_)`` from the fitted search.

    Notes
    -----
    ``best_estimator_`` only exists when the search refits the best model
    (GridSearchCV's default), so passing ``refit=False`` makes the return
    statement raise ``AttributeError``.
    """
    grid = GridSearchCV(model, params_dic, **search_kwargs)
    grid.fit(X, y)
    return grid.best_estimator_, grid.best_score_, grid.best_params_

In [15]:
# Hyperparameter grids, one per candidate model; every combination in a grid is tried.
rf_grid = {
    "n_estimators": [5, 10, 25, 50],
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 10],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [2, 5, 10],
    # "sqrt" replaces the former "auto" alias: for RandomForestClassifier both
    # meant sqrt(n_features), but "auto" was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes every fit in the search fail.
    "max_features": [None, "sqrt", "log2"],
    "bootstrap": [True, False],
}
knn_grid = {"n_neighbors": [2, 5, 10]}
# Both penalties are searched; the liblinear solver set on the estimator below supports l1 and l2.
lr_grid = {"penalty": ["l1", "l2"], "C": [0.25, 0.50, 0.75, 1.0, 10]}
# Order matters: later cells index this list positionally
# (0 = random forest, 1 = k-nearest neighbours, 2 = logistic regression).
# random_state is fixed where the estimator accepts one so reruns are reproducible.
models = [
    RandomForestClassifier(random_state=7),
    KNeighborsClassifier(),
    LogisticRegression(solver="liblinear", random_state=7),
]

In [16]:
# rf = RandomForestClassifier(n_estimators=5, criterion="gini", max_depth=5, min_samples_split=5, min_samples_leaf=5, max_features="auto", bootstrap=True)
# rf.fit(X_train, y_train)

In [17]:
# Tune the random forest (models[0]) over every combination in rf_grid.
# rf_grid is by far the largest of the three grids (1,296 combinations, each
# cross-validated), so expect this cell to take the longest to run.
rf_best_est, rf_best_score, rf_best_params = grid_search(X_train, y_train, models[0], rf_grid)

In [19]:
# Tune k-nearest neighbours (models[1]) over the n_neighbors values in knn_grid.
knn_best_est, knn_best_score, knn_best_params = grid_search(X_train, y_train, models[1], knn_grid)

In [20]:
# Tune logistic regression (models[2]) over the penalty / C combinations in lr_grid.
lr_best_est, lr_best_score, lr_best_params = grid_search(X_train, y_train, models[2], lr_grid)

In [27]:
# Best cross-validated score found by the random forest grid search (GridSearchCV.best_score_).
rf_best_score

0.8032674397714299

In [25]:
# Best cross-validated score found by the k-nearest neighbours grid search (GridSearchCV.best_score_).
knn_best_score

0.7790913990646243

In [26]:
# Best cross-validated score found by the logistic regression grid search (GridSearchCV.best_score_).
lr_best_score

0.802676889731152