In [1]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from acquire import Acquire
from prepare import Prepare
import pandas as pd
# Show up to 50 columns so wide frames are not truncated in cell output.
pd.set_option("display.max_columns", 50)

#### Note: GridSearchCV is used here because the data set is small enough to check every parameter combination. With a larger data set, RandomizedSearchCV is preferred because n_iter caps how many combinations are sampled instead of trying all of them as GridSearchCV does.

In [2]:
# Pull the raw Telco data and run it through the project's preparation step.
a = Acquire()
p = Prepare()
telco = a.get_telco_data()
# prep_telco(..., modeling=True) returns three frames, unpacked here as train / val / test.
# NOTE(review): presumably modeling=True also encodes the categorical columns numerically
# (the preview shown for this cell is all floats) — confirm in prepare.py.
train, val, test = p.prep_telco(telco, modeling=True)
# Quick look at the first rows of the training split.
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,no_phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,steaming_movies
0,1.0,0.0,0.0,1.0,32.0,1.0,0.0,20.5,696.8,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,0.0,55.0,1.0,1.0,113.6,6292.7,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,25.0,1.0,0.0,25.5,630.6,0.0,2.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,12.0,1.0,1.0,98.1,1060.2,1.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,63.0,1.0,0.0,102.6,6296.75,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [None]:
# train.info()

In [4]:
# Training split: "churn" is the target, every other column is a predictor.
y_train = train["churn"]
X_train = train.drop(columns="churn")
# Sanity-check the shapes (features first, then target), one per line.
print(X_train.shape, y_train.shape, sep="\n")

(5088, 20)
(5088,)


In [36]:
# Validation split: "churn" is the target, every other column is a predictor.
y_val = val["churn"]
X_val = val.drop(columns="churn")
# Sanity-check the shapes (features first, then target), one per line.
print(X_val.shape, y_val.shape, sep="\n")

(898, 20)
(898,)


In [37]:
# Test split: "churn" is the target, every other column is a predictor.
y_test = test["churn"]
X_test = test.drop(columns="churn")
# Sanity-check the shapes (features first, then target), one per line.
print(X_test.shape, y_test.shape, sep="\n")

(1057, 20)
(1057,)


In [5]:
def grid_search(X, y, model, params_dic, cv=None, scoring=None, n_jobs=-1):
    """Run an exhaustive cross-validated grid search and return the fitted search.

    Parameters
    ----------
    X
        Feature matrix handed to ``GridSearchCV.fit``.
    y
        Target values handed to ``GridSearchCV.fit``.
    model
        Unfitted estimator whose hyperparameters are being tuned.
    params_dic
        Parameter grid: parameter names mapped to the candidate values to try.
    cv
        Cross-validation strategy forwarded to ``GridSearchCV``. ``None``
        (the default) keeps the library's default splitting.
    scoring
        Scoring strategy forwarded to ``GridSearchCV``. ``None`` (the default)
        uses the estimator's own ``score`` method; pass e.g. ``"recall"`` or
        ``"f1"`` when plain accuracy is not the metric of interest.
    n_jobs
        Number of parallel jobs; ``-1`` (the default) uses every available core.

    Returns
    -------
    The value returned by ``GridSearchCV.fit`` (the fitted search object, which
    exposes ``best_params_``, ``best_estimator_`` and ``cv_results_``).
    """
    grid = GridSearchCV(model, params_dic, scoring=scoring, n_jobs=n_jobs, cv=cv)
    return grid.fit(X, y)

In [6]:
# Hyperparameter grids handed to grid_search, one per model family.
# "sqrt" is used instead of "auto": for classifiers the two select the same
# number of features, but "auto" was deprecated in scikit-learn 1.1 and removed
# in 1.3, so every candidate using it would fail to fit on current versions.
rf_grid = {
    "n_estimators": [5, 10, 25, 50],
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 10],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [2, 5, 10],
    "max_features": [None, "sqrt", "log2"],
    "bootstrap": [True, False],
}
knn_grid = {"n_neighbors": [2, 5, 10]}
# Searching over "l1" requires a solver that supports it; the estimator this
# grid is paired with is created with solver="liblinear".
lr_grid = {"penalty": ["l1", "l2"], "C": [0.25, 0.50, 0.75, 1.0, 10]}
# Base estimators, addressed by position in the search cells:
# 0 = random forest, 1 = k-nearest neighbors, 2 = logistic regression.
models = [
    RandomForestClassifier(random_state=7),
    KNeighborsClassifier(),
    LogisticRegression(random_state=7, solver="liblinear"),
]

In [8]:
# Grid search over the random forest grid on the training split.
rf_fit_model = grid_search(
    X=X_train,
    y=y_train,
    model=models[0],
    params_dic=rf_grid,
)

In [9]:
# Grid search over the k-nearest-neighbors grid on the training split.
knn_fit_model = grid_search(
    X=X_train,
    y=y_train,
    model=models[1],
    params_dic=knn_grid,
)

In [10]:
# Grid search over the logistic regression grid on the training split.
lr_fit_model = grid_search(
    X=X_train,
    y=y_train,
    model=models[2],
    params_dic=lr_grid,
)

In [53]:
# rf_fit_model.best_estimator_
# rf_fit_model.best_params_
# rf_fit_model.cv_results_
# rf_fit_model.n_features_in_
# rf_fit_model.feature_names_in_

#### Best Random Forest Classifier Model
- bootstrap = True
- criterion = entropy
- max_depth=5
- max_features=None
- min_samples_leaf=10
- min_samples_split=5
- n_estimators=50
- random_state=7

In [23]:
# knn_fit_model.best_estimator_
# knn_fit_model.best_params_

#### Best K Nearest Neighbors Classifier Model
- n_neighbors = 10

In [26]:
# lr_fit_model.best_estimator_
# lr_fit_model.best_params_

#### Best Logistic Regression Model
- C = 0.75
- penalty = l1
- random_state = 7
- solver = liblinear

In [29]:
# Rebuild the random forest with the best hyperparameters listed in the
# "Best Random Forest Classifier Model" notes, then fit it on the training split.
rf_best_params = {
    "bootstrap": True,
    "criterion": "entropy",
    "max_depth": 5,
    "max_features": None,
    "min_samples_leaf": 10,
    "min_samples_split": 5,
    "n_estimators": 50,
}
rfc = RandomForestClassifier(random_state=7, **rf_best_params)
rfc.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=5, max_features=None,
                       min_samples_leaf=10, min_samples_split=5,
                       n_estimators=50, random_state=7)

In [31]:
# Mean accuracy of the tuned random forest on the training split.
rfc.score(X_train, y_train)

0.8123034591194969

In [46]:
# Mean accuracy of the tuned random forest on the validation split.
rfc.score(X_val, y_val)

0.7984409799554566

In [38]:
# Validation predictions from the random forest, kept for the per-class report.
rfc_y_pred = rfc.predict(X_val)

In [39]:
# Per-class precision / recall / F1 for the random forest on the validation split.
print(classification_report(y_val, rfc_y_pred))

              precision    recall  f1-score   support

         0.0       0.84      0.90      0.87       660
         1.0       0.65      0.51      0.57       238

    accuracy                           0.80       898
   macro avg       0.74      0.71      0.72       898
weighted avg       0.79      0.80      0.79       898



In [32]:
# Rebuild KNN with the best n_neighbors listed in the notes above and fit it on
# the training split. fit() is left as the last expression so the cell shows the model.
knnc = KNeighborsClassifier(n_neighbors=10)
knnc.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [33]:
# Mean accuracy of the tuned KNN model on the training split.
knnc.score(X_train, y_train)

0.813875786163522

In [45]:
# Mean accuracy of the tuned KNN model on the validation split.
knnc.score(X_val, y_val)

0.7750556792873051

In [40]:
# Validation predictions from KNN, kept for the per-class report.
knnc_y_pred = knnc.predict(X_val)

In [47]:
# Per-class precision / recall / F1 for KNN on the validation split.
print(classification_report(y_val, knnc_y_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86       660
         1.0       0.62      0.39      0.48       238

    accuracy                           0.78       898
   macro avg       0.71      0.65      0.67       898
weighted avg       0.76      0.78      0.76       898



In [34]:
# Rebuild logistic regression with the best hyperparameters listed in the
# "Best Logistic Regression Model" notes, then fit it on the training split.
lrc = LogisticRegression(
    penalty="l1",
    C=0.75,
    solver="liblinear",
    random_state=7,
)
lrc.fit(X_train, y_train)

LogisticRegression(C=0.75, penalty='l1', random_state=7, solver='liblinear')

In [35]:
# Mean accuracy of the tuned logistic regression on the training split.
lrc.score(X_train, y_train)

0.8024764150943396

In [44]:
# Mean accuracy of the tuned logistic regression on the validation split.
lrc.score(X_val, y_val)

0.7951002227171492

In [42]:
# Validation predictions from logistic regression, kept for the per-class report.
lrc_y_pred = lrc.predict(X_val)

In [43]:
# Per-class precision / recall / F1 for logistic regression on the validation split.
print(classification_report(y_val, lrc_y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.88      0.86       660
         1.0       0.62      0.57      0.59       238

    accuracy                           0.80       898
   macro avg       0.74      0.72      0.73       898
weighted avg       0.79      0.80      0.79       898



### Model Performance

#### Random Forest Classifier
- X_train, y_train score: 0.8123034591194969
- X_val, y_val score: 0.7984409799554566

#### K Nearest Neighbors Classifier
- X_train, y_train score: 0.813875786163522
- X_val, y_val score: 0.7750556792873051

#### Logistic Regression
- X_train, y_train score: 0.8024764150943396
- X_val, y_val score: 0.7951002227171492

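#### Will use Logistic Regression to run the test set: it has the smallest gap between train and validation accuracy, and the best churn-class (1.0) recall (0.57) and f1-score (0.59) on validation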

In [54]:
# Mean accuracy of the selected logistic regression on the held-out test split.
lrc.score(X_test, y_test)

0.8098391674550615

In [55]:
# Test-set predictions from logistic regression, kept for the per-class report.
lrc_y_pred_test = lrc.predict(X_test)

In [57]:
# Per-class precision / recall / F1 for logistic regression on the test split.
print(classification_report(y_test, lrc_y_pred_test))

              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87       777
         1.0       0.66      0.58      0.62       280

    accuracy                           0.81      1057
   macro avg       0.76      0.74      0.75      1057
weighted avg       0.80      0.81      0.81      1057



## The Logistic Regression model performed reasonably well on the test set (accuracy 0.81, versus 0.80 on both train and validation), and I don't see signs of overfitting
### Ready to copy over to README and report.ipynb