In [1]:
# Importing dependencies
import math
import os
from getpass import getpass
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [None]:
# Connection settings for the PostgreSQL database on localhost that holds the churn data.
protocol = 'postgresql'
username = 'postgres'
# The password is deliberately not stored in the notebook: it is read from the
# CHURN_DB_PASSWORD environment variable, with an interactive prompt as fallback.
password = os.environ.get('CHURN_DB_PASSWORD') or getpass('PostgreSQL password: ')
host = 'localhost'
port = 5432
database_name = 'churn_pred'
# quote_plus escapes characters such as '@' or '/' that would otherwise corrupt the URL.
rds_connection_string = (
    f'{protocol}://{username}:{quote_plus(password)}@{host}:{port}/{database_name}'
)
engine = create_engine(rds_connection_string)
connection = engine.connect()

In [None]:
# Read every row of the fact_churn table through the open connection,
# then preview the first rows of the resulting DataFrame.
df = pd.read_sql(sql="SELECT * FROM fact_churn", con=connection)
df.head()

In [None]:
# Separate the 'churn' target from the feature columns, hold out 25% of the rows
# for testing, then standardise the features with a scaler fitted on the training rows only.
y = df['churn'].values
X = df.drop(columns=['churn'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# (rows, columns) of the training feature frame returned by train_test_split
X_train.shape

In [None]:
# (rows, columns) of the held-out test feature frame returned by train_test_split
X_test.shape

In [None]:
# Show five randomly chosen feature rows; the seed is fixed so that re-running
# the notebook displays the same sample each time.
X.sample(5, random_state=42)

In [None]:
# Baseline model: a RandomForestClassifier left at its default hyper-parameters.
# Only random_state is set, so that repeated runs build the same forest.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Fit the baseline forest on the scaled training data, then report its
# score on both the training and the test split.
rfc.fit(X_train_scaled, y_train)
baseline_train_score = rfc.score(X_train_scaled, y_train)
baseline_test_score = rfc.score(X_test_scaled, y_test)
print(f"Training Data Score: {baseline_train_score}")
print(f"Testing Data Score: {baseline_test_score}")

In [None]:
# Second forest with explicit settings: entropy split criterion, tree depth
# capped at 10 and balanced class weights; same seed as the baseline model.
rfc_optimised = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)

In [None]:
# Fit the tuned forest and report its own scores on the training and test splits.
# The scores must come from rfc_optimised (the model fitted here), not from the
# baseline rfc, otherwise the numbers simply repeat the baseline cell.
rfc_optimised.fit(X_train_scaled, y_train)
print(f"Training Data Score: {rfc_optimised.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rfc_optimised.score(X_test_scaled, y_test)}")

In [None]:
# Predict on the scaled test set and tabulate the predictions against the true labels.
# NOTE(review): the result is stored under the name `confusion_matrix`, which rebinds
# the function imported from sklearn.metrics in the imports cell. The call below still
# works on re-run because it goes through `metrics.confusion_matrix`.
y_pred = rfc_optimised.predict(X_test_scaled)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
confusion_matrix

In [None]:
# Creating confusion matrix diagram: open a new figure/axes pair and place one
# tick per class label (0 and 1) on both axes of the current axes.
# NOTE(review): with the inline backend a figure is typically rendered and closed
# when its cell finishes, so these ticks may not carry over to a plot drawn in a
# later cell — confirm.
class_names=[0,1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# sns.heatmap returns the Axes it actually drew on; all further styling goes through
# that object so it is guaranteed to apply to the heatmap, whichever figure is current
# when this cell runs.
ax = sns.heatmap(pd.DataFrame(confusion_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Run the layout last so it accounts for the title and axis labels set above.
ax.figure.tight_layout()

In [None]:
# Build the per-class precision/recall/F1 report for the test predictions and print it.
# The display names are applied to the classes in order
# (presumably 0 = no churn, 1 = churn — verify against the data).
target_names = ['churn: no', 'churn: yes']
report_text = classification_report(y_test, y_pred, target_names=target_names)
print(report_text)

In [None]:
# ROC curve and AUC for the tuned model on the test split.
# Column 1 of predict_proba is the probability of the second class in
# rfc_optimised.classes_ (presumably churn = 1 — verify against the data).
y_pred_proba = rfc_optimised.predict_proba(X_test_scaled)[:, 1]
# The thresholds are kept under a real name: assigning to `_` would shadow
# IPython's last-output variable.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label='data 1, auc=' + str(auc))
# Label the axes and title the figure so it can be read on its own.
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc=4)
plt.show()