In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

from ipynb.fs.full.Cleaning_FE import data_cleaning, min_max_scaling, apr

pd.options.display.max_columns = None

In [24]:
df = pd.read_csv("swan_data.csv")

In [25]:
df =  data_cleaning(df)

In [26]:
features = list(df.columns)
features.remove("Latitude")
features.remove("Longitude")
features.remove("Churn Value")
X = df[features]
y = df["Churn Value"]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [28]:
def feature_engineering(df):
    # OHE Payment Method and Internet Service columns
    df = pd.get_dummies(df, 
                        columns = ['Payment Method', 'Internet Service'], 
                        prefix = ['pay','is'], 
                        drop_first = True, 
                        dtype = int)
        
    return df

In [29]:
X_train = feature_engineering(X_train)
X_test = feature_engineering(X_test)

In [30]:
clf = RandomForestClassifier()

In [31]:
rf_params = {
    'class_weight':['balanced'],
    'criterion':['entropy'],
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7, 10, 15],
}

gs = GridSearchCV(clf, param_grid=rf_params, cv=5, verbose=3, scoring='f1')

In [32]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=100;, score=0.629 total time=   0.2s
[CV 2/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=100;, score=0.636 total time=   0.2s
[CV 3/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=100;, score=0.608 total time=   0.3s
[CV 4/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=100;, score=0.683 total time=   0.2s
[CV 5/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=100;, score=0.649 total time=   0.3s
[CV 1/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=150;, score=0.621 total time=   0.5s
[CV 2/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=150;, score=0.639 total time=   0.5s
[CV 3/5] END class_weight=balanced, criterion=entropy, max_depth=3, n_estimators=150;, score=0.622 total time=   

In [33]:
gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 7,
 'n_estimators': 200}

In [34]:
y_pred = gs.predict(X_train)
apr(y_pred, y_train)

Accuracy:0.7978345757898474
Precision:0.5862851952770209
Recall:0.8499012508229098
F1:0.6938994893845741


(0.7978345757898474,
 0.5862851952770209,
 0.8499012508229098,
 0.6938994893845741)

In [35]:
y_test_pred = gs.predict(X_test)
apr(y_test_pred, y_test)

Accuracy:0.7615330021291696
Precision:0.5131578947368421
Recall:0.78
F1:0.6190476190476191


(0.7615330021291696, 0.5131578947368421, 0.78, 0.6190476190476191)

In [37]:
train_results = X_train.copy()
train_results["Churn Probability"] = gs.predict_proba(X_train)[:,1]
train_results["Churn Value"] = y_train

test_results = X_test.copy()
test_results["Churn Probability"] = gs.predict_proba(X_test)[:,1]
test_results["Churn Value"] = y_test

results = pd.concat([train_results, test_results])

In [38]:
my_file = open("most_likely_churners.txt", "w")
my_file.write(f"{list(results[results['Churn Value'] == 0].sort_values('Churn Probability', ascending=False).index[:50])}")
my_file.close()