# MRP PROJECT

   Customer churn prediction models have been shown to give companies the ability to determine whether a customer will churn based on a variety of factors. However, this space still has room for improvement, and this project aims to determine whether customer churn prediction accuracy can be improved through customer segmentation, i.e. by clustering customers into groups based on similarities in their buying patterns and company engagement behaviours. Through this process, companies will also gain the ability to better understand which variables lead to a higher churn rate, while also being able to use these clusters for subsequent targeted marketing initiatives.

### Importing packages and dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn import metrics

In [None]:
# Load the training split from train.csv in the working directory and display it.
df_train = pd.read_csv('train.csv')
df_train

In [None]:
# Load the test split from test.csv in the working directory and display it.
df_test = pd.read_csv('test.csv')
df_test

In [None]:
# Name of the label column.
# NOTE(review): the cells visible in this notebook spell 'churn_risk_score' out
# literally rather than reading this variable.
targetFeature = 'churn_risk_score'

### EDA

In [None]:
# Count of each feedback category, split by churn risk score.
# NOTE(review): final_df_train is only created in the Data Cleaning section further
# down, so this cell raises NameError on a fresh top-to-bottom run; execute the
# cleaning cells first.
fig, ax = plt.subplots(figsize=(15, 8))
sns.set_style('white')
# The previous bare sns.color_palette(...) call was dropped: its return value was
# discarded, so it had no effect. The palette is selected via `palette=` below.
sns.countplot(data=final_df_train.sort_values(by='feedback'), x='feedback',
              hue='churn_risk_score', palette='Spectral_r', ax=ax)
plt.xlabel('Feedback', labelpad=15, fontsize=14)
plt.xticks(rotation=45)
plt.ylabel('Total Feedback', labelpad=15, fontsize=14)
plt.title('Count of Feedback Type based on Churn Risk Score', y=1.02, fontsize=18, fontweight='bold')

In [None]:
# Count of customers per churn risk score, split by gender.
# NOTE(review): final_df_train is only created in the Data Cleaning section further
# down, so this cell raises NameError on a fresh top-to-bottom run; execute the
# cleaning cells first.
fig, ax = plt.subplots(figsize=(15, 8))
sns.set_style('white')
# The previous bare sns.color_palette(...) call was dropped: its return value was
# discarded, so it had no effect. Colours are given explicitly via `palette=`.
sns.countplot(data=final_df_train.sort_values(by='churn_risk_score'), x='churn_risk_score',
              hue='gender', palette=['#f699cd', "#6495ED"], ax=ax)
plt.xlabel('Churn Risk Score', labelpad=15, fontsize=14)
plt.ylabel('Total Count ', labelpad=15, fontsize=14)
plt.title('Count of Gender based on Churn Risk Score', y=1.02, fontsize=18, fontweight='bold')

In [None]:
# Count of customers per membership category, split by churn risk score.
# NOTE(review): final_df_train is only created in the Data Cleaning section further
# down, so this cell raises NameError on a fresh top-to-bottom run; execute the
# cleaning cells first.
fig, ax = plt.subplots(figsize=(15, 8))
sns.set_style('white')
# The previous bare sns.color_palette(...) call was dropped: its return value was
# discarded, so it had no effect. The palette is selected via `palette=` below.
sns.countplot(data=final_df_train.sort_values(by='membership_category'), x='membership_category',
              hue='churn_risk_score', palette='Spectral_r', ax=ax)
plt.xlabel('Membership Type', labelpad=15, fontsize=14)
plt.ylabel('Total Count of Memberships', labelpad=15, fontsize=14)
plt.title('Count of Membership Type based on Churn Risk Score', y=1.02, fontsize=18, fontweight='bold')

In [None]:
# Pairwise correlation heatmap of the numeric training columns.
fig, ax = plt.subplots(figsize=(15, 8))
sns.set_style('white')
# Select numeric columns explicitly: DataFrame.corr() no longer silently drops
# non-numeric columns on pandas >= 2.0 and raises on the string columns instead.
corr_input = df_train.select_dtypes(include=[np.number])
sns.heatmap(corr_input.corr(), vmax=1, vmin=-1, annot=True, cmap='vlag', ax=ax)
plt.title('Correlation Plot for Numerical Variables', y=1.02, fontsize=18, fontweight='bold')

### Data Cleaning

In [None]:
# Show the dtype pandas inferred for every training column before cleaning.
print(df_train.dtypes)

In [None]:
# Map the placeholder tokens found in the training data to NaN so that the
# imputation cells below can fill them in.
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
train_placeholders = {
    'gender': 'Unknown',
    'joined_through_referral': '?',
    'referral_id': 'xxxxxxxx',
    'medium_of_operation': '?',
    'days_since_last_login': -999,
    'avg_frequency_login_days': 'Error',
}
for column, token in train_placeholders.items():
    df_train[column] = df_train[column].replace(token, np.nan)

# 'Error' forced this column to be read as text; convert it once the token is gone.
df_train['avg_frequency_login_days'] = df_train['avg_frequency_login_days'].astype('float')

# Blank out every value that is not strictly positive. The mask `~(x > 0)` covers
# negative values and zeros (and leaves existing NaN as NaN).
for column in ('avg_time_spent', 'points_in_wallet', 'avg_frequency_login_days'):
    df_train.loc[~(df_train[column] > 0), column] = np.nan

In [None]:
## Mode imputation is used for the categorical variables because each of them has
## only a small number of unique values.
cat_var_imputing = df_train[['gender','region_category','joined_through_referral','preferred_offer_types','medium_of_operation',]]
for variable in cat_var_imputing.columns:
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # df_train[variable] selection: that is chained assignment, which warns on
    # pandas 2.x and does not modify df_train under Copy-on-Write.
    df_train[variable] = df_train[variable].fillna(df_train[variable].mode()[0])

In [None]:
## KNN imputation is used for the numerical variables: each missing value is replaced
## by the mean of that feature over the K nearest rows (K = n_neighbors), where
## nearness is measured on the features that are present. Implemented by KNNImputer.

knn_cols_train = ['points_in_wallet','avg_time_spent','days_since_last_login','avg_frequency_login_days']
num_imputing = df_train[knn_cols_train]
imp = KNNImputer(n_neighbors=2)
imputed_vals = imp.fit_transform(num_imputing)

# Carry df_train's index over to the imputed frame so the concat below aligns row
# for row even if df_train does not have a default RangeIndex.
temp_dataset = pd.DataFrame(imputed_vals, columns=knn_cols_train, index=df_train.index)

# Replace the original (partly missing) columns with their imputed versions; the
# imputed columns end up as the last four columns of final_df_train.
df_train.drop(knn_cols_train, axis=1, inplace=True)

final_df_train = pd.concat([df_train, temp_dataset], axis=1)

In [None]:
# Remaining missing values per column in the training frame after imputation.
final_df_train.isnull().sum()

In [None]:
# Missing values per column in the raw test frame (before placeholder cleaning).
df_test.isnull().sum()

In [None]:
# Map the placeholder tokens found in the test data to NaN so that the imputation
# cells below can fill them in (same rules as for the training frame).
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
test_placeholders = {
    'gender': 'Unknown',
    'joined_through_referral': '?',
    'referral_id': 'xxxxxxxx',
    'medium_of_operation': '?',
    'days_since_last_login': -999,
    'avg_frequency_login_days': 'Error',
}
for column, token in test_placeholders.items():
    df_test[column] = df_test[column].replace(token, np.nan)

# 'Error' forced this column to be read as text; convert it once the token is gone.
df_test['avg_frequency_login_days'] = df_test['avg_frequency_login_days'].astype('float')

# Blank out every value that is not strictly positive. The mask `~(x > 0)` covers
# negative values and zeros (and leaves existing NaN as NaN).
for column in ('avg_time_spent', 'points_in_wallet', 'avg_frequency_login_days'):
    df_test.loc[~(df_test[column] > 0), column] = np.nan

In [None]:
## Mode imputation is used for the categorical variables because each of them has
## only a small number of unique values.
## NOTE(review): the mode is taken from the test frame itself rather than reused from
## the training frame — confirm this is intended.
cat_var_imputing = df_test[['gender','region_category','joined_through_referral','preferred_offer_types','medium_of_operation',]]
for variable in cat_var_imputing.columns:
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # df_test[variable] selection: that is chained assignment, which warns on
    # pandas 2.x and does not modify df_test under Copy-on-Write.
    df_test[variable] = df_test[variable].fillna(df_test[variable].mode()[0])

In [None]:
## KNN imputation is used for the numerical variables: each missing value is replaced
## by the mean of that feature over the K nearest rows (K = n_neighbors), where
## nearness is measured on the features that are present. Implemented by KNNImputer.
## NOTE(review): a separate imputer is fitted on the test frame instead of reusing the
## one fitted on the training frame — confirm this is intended.

knn_cols_test = ['points_in_wallet','avg_time_spent','days_since_last_login','avg_frequency_login_days']
num_imputing_test = df_test[knn_cols_test]
imp_test = KNNImputer(n_neighbors=2)
imputed_vals_test = imp_test.fit_transform(num_imputing_test)

# Carry df_test's index over to the imputed frame so the concat below aligns row
# for row even if df_test does not have a default RangeIndex.
temp_dataset2 = pd.DataFrame(imputed_vals_test, columns=knn_cols_test, index=df_test.index)

# Replace the original (partly missing) columns with their imputed versions; the
# imputed columns end up as the last four columns of final_df_test.
df_test.drop(knn_cols_test, axis=1, inplace=True)

final_df_test = pd.concat([df_test, temp_dataset2], axis=1)

In [None]:
# Remaining missing values per column in the test frame after imputation.
final_df_test.isnull().sum()

In [None]:
# The integer cast below is disabled for the training frame, so its
# days_since_last_login column keeps the float values produced by the KNN imputer.
#final_df_train['days_since_last_login'] = final_df_train['days_since_last_login'].astype('int64')
# Summarise column dtypes and non-null counts of the cleaned training frame.
final_df_train.info()

In [None]:
# Cast the imputed day counts to int64 (any fractional imputed value is truncated).
# NOTE(review): the matching cast on the training frame is commented out, so this
# column is integer-valued in test but float in train — confirm which is intended.
final_df_test['days_since_last_login'] = final_df_test['days_since_last_login'].astype('int64')
# Summarise column dtypes and non-null counts of the cleaned test frame.
final_df_test.info()

In [None]:
# Remove the rows whose churn_risk_score is -1 (negative churn risk).
# The rows are selected by index *label*: DataFrame.drop is label-based, so feeding
# it the positional output of np.where is only correct for a default RangeIndex.
negative_churn = final_df_train.index[final_df_train['churn_risk_score'] == -1]
final_df_train.drop(negative_churn, inplace=True)
# Renumber the remaining rows 0..n-1 so later positional logic stays aligned.
final_df_train.reset_index(drop=True, inplace=True)

In [None]:
# Add a 'year' column to both frames: 2021 minus the leading year component of
# joining_date (the text before the first '-').
for frame in (final_df_train, final_df_test):
    frame['year'] = frame.joining_date.apply(lambda k: 2021 - int(k.split('-')[0]))

### Outlier Detection

In [None]:
# One horizontal boxplot per numeric training column, laid out on a 3x3 grid.
final_df_outliers = final_df_train.select_dtypes(include=[np.number])

fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
box_style = dict(orient="h", flierprops={"marker": "x"},
                 medianprops={"color": "#f03a2e"}, dodge=False, palette=['#4296f5'])
for subplot, variable in zip(ax.flatten(), final_df_outliers.columns):
    z = sns.boxplot(x=final_df_outliers[variable], ax=subplot, **box_style)
    z.set_xlabel(variable, fontsize=14, fontweight='bold')
# The bottom-right panel is removed unconditionally (the grid has nine slots).
fig.delaxes(ax[2][2])
fig.suptitle('Boxplot for Numerical Variables before Outlier Removal', y=0.91, fontsize=18, fontweight='bold')
plt.show()

From the above, it can be seen that avg_transaction_value, points_in_wallet, avg_time_spent and avg_frequency_login_days all have outliers.

In [None]:
# Quartiles and inter-quartile range of the numeric training columns.
# Numeric columns are selected explicitly: DataFrame.quantile no longer drops
# non-numeric columns by default on pandas >= 2.0 and fails on the string columns.
iqr_input = final_df_train.select_dtypes(include=[np.number])
Q1 = iqr_input.quantile(0.25)
Q3 = iqr_input.quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Keep only the rows where no numeric value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The comparison is restricted to the columns the quartiles were computed for, so
# string columns are never compared against numbers.
iqr_values = final_df_train[IQR.index]
outlier_mask = ((iqr_values < (Q1 - 1.5 * IQR)) | (iqr_values > (Q3 + 1.5 * IQR))).any(axis=1)
# reset_index(drop=True) renumbers the rows without the in-place edit of a slice that
# the earlier reset_index(inplace=True) / drop('index') pair performed.
final_data_train_iqr = final_df_train[~outlier_mask].reset_index(drop=True)
# NOTE(review): the cells visible in this notebook keep modelling from final_df_train;
# final_data_train_iqr is only used by the boxplot that follows — confirm intended.
final_data_train_iqr

In [None]:
## AFTER REMOVING OUTLIERS:
# Same 3x3 boxplot grid as before, drawn from the IQR-filtered frame.
final_df_outliers2 = final_data_train_iqr.select_dtypes(include=[np.number])

fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
box_style = dict(orient="h", flierprops={"marker": "x"},
                 medianprops={"color": "#f03a2e"}, dodge=False, palette=['#4296f5'])
for subplot, variable in zip(ax.flatten(), final_df_outliers2.columns):
    z = sns.boxplot(x=final_df_outliers2[variable], ax=subplot, **box_style)
    z.set_xlabel(variable, fontsize=14, fontweight='bold')
# The bottom-right panel is removed unconditionally (the grid has nine slots).
fig.delaxes(ax[2][2])
fig.suptitle('Boxplot for Numerical Variables after Outlier Removal', y=0.91, fontsize=18, fontweight='bold')
plt.show()

### Categorical Encoding and Balancing Dataset

In [None]:
# Class distribution of the target; motivates the oversampling step further below.
final_df_train['churn_risk_score'].value_counts()

In [None]:
# Categorical columns that will be one-hot encoded in the next cell.
# 'used_special_discount' is listed here and again in the numeric selection below;
# the raw copy is dropped after the frames are concatenated, leaving only its dummies.
cats = ['gender','region_category','membership_category','joined_through_referral',
      'preferred_offer_types','medium_of_operation','internet_option','used_special_discount',
       'offer_application_preference','past_complaint','complaint_status','feedback']
df_train_balanced = final_df_train[cats]
df_test_balanced = final_df_test[cats]

In [None]:
# One-hot encode the categorical columns of the training and test frames.
df_train_balanced = pd.get_dummies(df_train_balanced)
# Encode the test frame, then force it onto the training dummy columns: categories
# missing from test become all-zero columns, categories unseen in training are
# dropped. This keeps both frames the same width and column order for the scaler and
# models fitted on the training data.
df_test_balanced = pd.get_dummies(df_test_balanced).reindex(
    columns=df_train_balanced.columns, fill_value=0
)

In [None]:
# Columns carried over unencoded. The training selection is the test selection plus
# the target column appended at the end.
nums_test = ['days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
             'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount']
nums_train = nums_test + ['churn_risk_score']
df_train_balanced_num = final_df_train[nums_train]
df_test_balanced_num = final_df_test[nums_test]

In [None]:
# Rebuild the modelling frames: unencoded columns first, one-hot columns after.
final_df_train = pd.concat([df_train_balanced_num, df_train_balanced], axis=1)
final_df_test = pd.concat([df_test_balanced_num, df_test_balanced], axis=1)
# The raw 'used_special_discount' column is redundant next to its dummy columns.
for frame in (final_df_train, final_df_test):
    frame.drop(columns='used_special_discount', inplace=True)

In [None]:
# Split the modelling frame into a feature matrix and a target vector.
X = final_df_train.drop(['churn_risk_score'], axis=1).values
Y = final_df_train['churn_risk_score'].values

# Applying SMOTE Over Sampling Strategy for balancing dataset.
# random_state is fixed so the synthetic samples (and every score derived from them)
# are reproducible between runs, matching the seeded split used for feature importance.
# NOTE(review): oversampling happens before the train/test split performed in the
# Data Modeling section, so synthetic points interpolated from hold-out rows can end
# up in the training split; consider splitting first and resampling only the
# training part.
oversample = SMOTE(random_state=42)
X_ov, Y_ov = oversample.fit_resample(X, Y)

In [None]:
# Fit a RobustScaler on the oversampled features and apply the same fitted scaling
# to the competition test frame.
scaler = RobustScaler()
X_ov = scaler.fit_transform(X_ov)
X_test = scaler.transform(final_df_test.values)

### Feature Importance and Identifying Parameters

In [None]:
# Target series and feature frame (column names kept) for the importance model.
Target = final_df_train['churn_risk_score']
Features = final_df_train.drop(columns=['churn_risk_score'])

In [None]:
# Seeded 80/20 split of the (not oversampled) frame, used only to rank features.
X_train1, X_test1, y_train1, y_test1 = train_test_split(Features, Target, test_size=0.20, random_state=42)
# Default-parameter random forest; its feature_importances_ are read in the next cell.
model =  RandomForestClassifier(random_state = 0)
model.fit(X_train1, y_train1)

In [None]:
# Pair each feature with its forest importance, keep those at or above 0.01 and
# order them from most to least important.
important_features = pd.DataFrame({'Features': X_train1.columns, 'Importance': model.feature_importances_})
selected_features = important_features[important_features["Importance"] >= 0.01]
important_features = selected_features.sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar chart of the retained feature importances.
plt.rcParams["figure.figsize"] = (20, 20)
sns.set(font_scale=1.5)
sns.set_style('white')
sns.barplot(data=important_features, x='Importance', y='Features', palette='Spectral')
plt.xlabel('Importance', labelpad=15, fontsize=18)
plt.ylabel('Features', labelpad=15, fontsize=18)
plt.title('Feature Importance', y=1.02, fontsize=25, fontweight='bold')

plt.show()

### Data Modeling

In [None]:
# 70/30 split of the oversampled, scaled data.
# random_state is fixed so the reported scores are reproducible, consistent with the
# seeded split used for the feature-importance model.
x_train, x_test, y_train, y_test = train_test_split(X_ov, Y_ov, train_size=0.7, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
## RANDOM FOREST CLASSIFIER
# Fit on the training split, then print the per-class report and macro F1 on the
# hold-out split.

rf = RandomForestClassifier(max_depth=25, n_estimators=1000)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

In [None]:
# Confusion matrix of the most recent predictions in y_pred (Random Forest).
fig, ax = plt.subplots(figsize=(10, 8))
sns.set_style('white')
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='g')
# The heatmap shows a confusion matrix, so it is titled as one (was "Correlation Plot").
plt.title('Confusion Matrix for Random Forest Classifier', y=1.02, fontsize=18, fontweight='bold')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
plt.show()

In [None]:
## MLP Classifier with 3 hidden layers of decreasing nodes
# Fit on the training split, then print the per-class report and macro F1 on the
# hold-out split.

mlp = MLPClassifier(hidden_layer_sizes=(120, 95, 70))
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

In [None]:
# Confusion matrix of the most recent predictions in y_pred (MLP).
fig, ax = plt.subplots(figsize=(10, 8))
sns.set_style('white')
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='g')
# The heatmap shows a confusion matrix, so it is titled as one (was "Correlation Plot").
plt.title('Confusion Matrix for MLP Classifier', y=1.02, fontsize=18, fontweight='bold')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
plt.show()

In [None]:
## Gradient Boosting Classifier
# Fit on the training split, then print the per-class report and macro F1 on the
# hold-out split.

gb = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=4,
)
gb.fit(x_train, y_train)
y_pred = gb.predict(x_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

In [None]:
# Confusion matrix of the most recent predictions in y_pred (Gradient Boosting).
fig, ax = plt.subplots(figsize=(10, 8))
sns.set_style('white')
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='g')
# The heatmap shows a confusion matrix, so it is titled as one (was "Correlation Plot").
plt.title('Confusion Matrix for Gradient Boosting Classifier', y=1.02, fontsize=18, fontweight='bold')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
plt.show()

### Model Performance

In [None]:
# Predictions of the three baseline models, in the order RF, MLP, GB.
baseline_models = (rf, mlp, gb)

# Training
y_pred2tr, y_pred3tr, y_pred4tr = [clf.predict(x_train) for clf in baseline_models]

# Testing
y_pred2, y_pred3, y_pred4 = [clf.predict(x_test) for clf in baseline_models]


In [None]:
def accuracy(y_test , ypred):
    """Return the accuracy_score of the predictions ``ypred`` against ``y_test``."""
    return accuracy_score(y_test, ypred)

def f1(y_test , ypred):
    """Return the macro-averaged f1_score of ``ypred`` against ``y_test``."""
    return f1_score(y_test, ypred, average='macro')

In [None]:
# Final Comparison Table
# accuracy() and f1() return fractions in [0, 1]; they are scaled by 100 here so the
# values agree with the "(%)" column headers.
train_preds = [y_pred2tr, y_pred3tr, y_pred4tr]
test_preds = [y_pred2, y_pred3, y_pred4]
results = pd.DataFrame({
    'Prediction Model': ['Random Forest', 'MLP', 'Gradient Boosting'],
    'Training Accuracy (%)': [100 * accuracy(y_train, pred) for pred in train_preds],
    'Testing Accuracy (%)': [100 * accuracy(y_test, pred) for pred in test_preds],
    'Testing f1-score (%)': [100 * f1(y_test, pred) for pred in test_preds],
})
# Highlight the model with the best hold-out accuracy.
results.style.highlight_max(color = 'lightgreen', subset = 'Testing Accuracy (%)')

## --- Churn Prediction with Segmentation Method # 2 -  K Means / Mini Batch K Means --- 

In [None]:
# Use standard scaler to scale and standardize variables before PCA / clustering.
# 'feedback_Quality Customer Care' used to be listed twice, which selected the column
# twice and gave that one feature double weight; it now appears once.
# NOTE(review): 'churn_risk_score' (the target) is part of the clustering features for
# the training frame but not for the test frame — confirm this is intended.
col_names = ['days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days',
            'points_in_wallet', 'churn_risk_score', 'gender_F', 'gender_M', 'region_category_City',
            'region_category_Town', 'region_category_Village', 'membership_category_Basic Membership',
            'membership_category_Gold Membership', 'membership_category_No Membership', 'membership_category_Platinum Membership',
            'membership_category_Premium Membership', 'membership_category_Silver Membership', 'joined_through_referral_No',
            'joined_through_referral_Yes', 'preferred_offer_types_Credit/Debit Card Offers',
            'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers',
            'medium_of_operation_Both', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone',
            'internet_option_Fiber_Optic', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_No',
            'used_special_discount_Yes', 'offer_application_preference_No', 'offer_application_preference_Yes',
            'past_complaint_No', 'past_complaint_Yes', 'complaint_status_No Information Available',
            'complaint_status_Not Applicable', 'complaint_status_Solved', 'complaint_status_Solved in Follow-up',
            'complaint_status_Unsolved', 'feedback_No reason specified', 'feedback_Poor Customer Service',
            'feedback_Poor Product Quality', 'feedback_Poor Website', 'feedback_Products always in Stock',
            'feedback_Quality Customer Care', 'feedback_Reasonable Price',
            'feedback_Too many ads', 'feedback_User Friendly Website']
sd = StandardScaler()
features = final_df_train[col_names]
# fit() returns the scaler itself; note that this rebinds the notebook-level name
# `scaler`, which previously referred to the RobustScaler.
scaler = sd.fit(features.values)
features = scaler.transform(features.values)
scaled_features = pd.DataFrame(features, columns=col_names)
scaled_features.head()

In [None]:
# Project the standardized training features onto five principal components and
# plot the share of variance each component explains.
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(scaled_features)
features = range(pca.n_components_)
bar_colors = ['tab:green', 'tab:green', 'tab:green', 'tab:orange', 'tab:blue']

plt.figure(figsize=(16, 8))
plt.bar(features, pca.explained_variance_ratio_, color=bar_colors)
plt.xlabel('PCA Features', labelpad=15, fontsize=14)
plt.ylabel('Variance Percent (%)', labelpad=15, fontsize=14)
plt.title('PCA Features Variance Plot', y=1.02, fontsize=18, fontweight='bold')
plt.xticks(features, fontsize=14)
plt.yticks(fontsize=14)

# Component scores as a DataFrame for the clustering cells below.
PCA_components = pd.DataFrame(principalComponents)


In [None]:
# Using KElbowVisualizer package from yellowbricks to create elbow plot
# Note: this rebinds the notebook-level name `model`, which earlier held the
# feature-importance random forest.
# NOTE(review): the elbow is evaluated on the first three principal components, while
# the KMeans in the next cell is fitted on the first two — confirm intended.
model = KMeans()
elbowplot = KElbowVisualizer(model, k=(1,50),size=(1080, 500))
elbowplot.fit(PCA_components.iloc[:,:3])
elbowplot.show()

In [None]:
# Fit the final 6-cluster KMeans on the first two principal components.
kmeans_model = KMeans(n_clusters=6, init='k-means++', random_state=42)
kmeans_model.fit(PCA_components.iloc[:, :2])

# Score the labels of the model fitted above. The previous code read `model.labels_`,
# i.e. the estimator handed to the elbow visualizer, not this 6-cluster model.
print("silhouette_score is :",silhouette_score(PCA_components.iloc[:, :2], kmeans_model.labels_, metric='euclidean'))

In [None]:
# Assign every training row to one of the six clusters of kmeans_model, using the
# same two principal components it was fitted on. The previous code called
# `model.predict` on three components, i.e. the estimator left over from the elbow
# plot, so the labels did not come from the 6-cluster model.
kmeans = kmeans_model.predict(PCA_components.iloc[:, :2])
cluster_df = pd.DataFrame(final_df_train)
cluster_df['cluster'] = kmeans
cluster_df.head()

In [None]:
# Mean of every column per cluster.
# Group cluster_df, the frame the 'cluster' column was actually added to.
# final_df_train only exposes that column when pandas happens to share data between
# the two frames, which is not the case under Copy-on-Write.
avg_df = cluster_df.groupby(['cluster'], as_index=False).mean()
avg_df

In [None]:
# Per-cluster means of four variables, one bar chart per variable.
f, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 8))
sns.set_style('white')
sns.set(font_scale=1.5)
panel_columns = ['avg_transaction_value', 'points_in_wallet',
                 'feedback_Products always in Stock', 'feedback_Quality Customer Care']
for axis, column in zip(ax, panel_columns):
    sns.barplot(x='cluster', y=column, data=avg_df, ax=axis, palette='Spectral')
# avg_df holds per-cluster means, so the title says so (was "Total Count ...").
plt.suptitle('Mean Value per Variable Based on Cluster', y=0.9, fontsize=18, fontweight='bold')
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.8,
                    wspace=0.4,
                    hspace=0.4)

# The former `fig.tight_layout()` call was removed: `fig` is a figure left over from
# an earlier cell (this cell's figure is `f`), so the call never affected this plot.
# The spacing is set explicitly by subplots_adjust above.
plt.show()

In [None]:
# Per-cluster means of four further variables, one bar chart per variable.
sns.set_style('white')
f, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 8))
sns.set(font_scale=1.5)
panel_columns = ['avg_frequency_login_days', 'churn_risk_score',
                 'feedback_Poor Customer Service', 'feedback_Poor Website']
for axis, column in zip(ax, panel_columns):
    sns.barplot(x='cluster', y=column, data=avg_df, ax=axis, palette='Spectral')
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.8,
                    wspace=0.4,
                    hspace=0.4)

# The former `fig.tight_layout()` call was removed: `fig` is a figure left over from
# an earlier cell (this cell's figure is `f`), so the call never affected this plot.
# The spacing is set explicitly by subplots_adjust above.
plt.show()

In [None]:
# Standardize the test features before PCA / clustering.
# 'feedback_Quality Customer Care' used to be listed twice, which selected the column
# twice and gave that one feature double weight; it now appears once.
# NOTE(review): a separate StandardScaler is fitted on the test frame instead of
# reusing the one fitted on the training frame — confirm this is intended.
col_names_test = ['days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days',
            'points_in_wallet', 'gender_F', 'gender_M', 'region_category_City',
            'region_category_Town', 'region_category_Village', 'membership_category_Basic Membership',
            'membership_category_Gold Membership', 'membership_category_No Membership', 'membership_category_Platinum Membership',
            'membership_category_Premium Membership', 'membership_category_Silver Membership', 'joined_through_referral_No',
            'joined_through_referral_Yes', 'preferred_offer_types_Credit/Debit Card Offers',
            'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers',
            'medium_of_operation_Both', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone',
            'internet_option_Fiber_Optic', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_No',
            'used_special_discount_Yes', 'offer_application_preference_No', 'offer_application_preference_Yes',
            'past_complaint_No', 'past_complaint_Yes', 'complaint_status_No Information Available',
            'complaint_status_Not Applicable', 'complaint_status_Solved', 'complaint_status_Solved in Follow-up',
            'complaint_status_Unsolved', 'feedback_No reason specified', 'feedback_Poor Customer Service',
            'feedback_Poor Product Quality', 'feedback_Poor Website', 'feedback_Products always in Stock',
            'feedback_Quality Customer Care', 'feedback_Reasonable Price',
            'feedback_Too many ads', 'feedback_User Friendly Website']
sd_test = StandardScaler()
features_test = final_df_test[col_names_test]
# fit() returns the scaler itself, so scaler_test and sd_test are the same object.
scaler_test = sd_test.fit(features_test.values)
features_test = scaler_test.transform(features_test.values)
scaled_features_test = pd.DataFrame(features_test, columns=col_names_test)
scaled_features_test.head()

In [None]:
# Project the standardized test features onto five principal components and plot
# the share of variance each component explains.
pca_test = PCA(n_components=5)
principalComponents_test = pca_test.fit_transform(scaled_features_test)
features_test = range(pca_test.n_components_)
plt.figure(figsize=(16, 8))
bar_colors = ['tab:green', 'tab:green', 'tab:green', 'tab:orange', 'tab:blue']
plt.bar(features_test, pca_test.explained_variance_ratio_, color=bar_colors)
plt.title('PCA Features Variance Plot', y=1.02, fontsize=18, fontweight='bold')
plt.xlabel('PCA Features', labelpad=15, fontsize=14)
plt.ylabel('Variance Percent (%)', labelpad=15, fontsize=14)
# Tick positions come from this cell's own component range; the previous code used
# `features`, a variable left over from the training PCA cell.
plt.xticks(features_test, fontsize=14)
plt.yticks(fontsize=14)

# Component scores as a DataFrame for the clustering cells below.
PCA_components_test = pd.DataFrame(principalComponents_test)

In [None]:
# Elbow plot for the test frame, evaluated on the first three principal components.
kmeans_model_test = KMeans()
elbow_input_test = PCA_components_test.iloc[:, :3]
elbowplot_test = KElbowVisualizer(kmeans_model_test, size=(1080, 500), k=(1, 50))
elbowplot_test.fit(elbow_input_test)
elbowplot_test.show()

In [None]:
# Fit a 5-cluster KMeans on the first two principal components of the test frame and
# report its silhouette score.
# NOTE(review): the training frame is clustered with n_clusters=6 and a separately
# fitted KMeans, so cluster ids presumably do not correspond between train and test —
# confirm intended.
model_test = KMeans(n_clusters=5, init='k-means++',random_state=42)
model_test.fit(PCA_components_test.iloc[:,:2])
print("silhouette_score is :",silhouette_score(PCA_components_test.iloc[:,:2], model_test.labels_, metric='euclidean'))

In [None]:
# Assign every test row to a cluster of model_test (same two components it was fitted on).
kmeans_test = model_test.predict(PCA_components_test.iloc[:,:2])
# pd.DataFrame(final_df_test) wraps the existing frame; the 'cluster' column is added
# through cluster_df_test.
cluster_df_test = pd.DataFrame(final_df_test)
cluster_df_test['cluster'] = kmeans_test
cluster_df_test.head()

In [None]:
# creating new datasets based on each cluster (subset datasets)
# One DataFrame per cluster label 0..5, bound to df_grouped_0 .. df_grouped_5.

grouped = cluster_df.groupby('cluster')

(df_grouped_0, df_grouped_1, df_grouped_2,
 df_grouped_3, df_grouped_4, df_grouped_5) = [
    pd.DataFrame(grouped.get_group(label)) for label in range(6)
]

In [None]:
# Features / target for the rows of cluster 2.
# NOTE(review): the variables carry a _0 suffix and the results table below is
# labelled "Cluster 0", but the data comes from df_grouped_2 — confirm which cluster
# was intended.
X_0 = df_grouped_2.drop(['churn_risk_score'], axis=1).values
Y_0 = df_grouped_2['churn_risk_score'].values

# random_state is fixed so the synthetic samples are reproducible between runs.
oversample = SMOTE(random_state=42)
X_ov_0, Y_ov_0 = oversample.fit_resample(X_0, Y_0)

In [None]:
# Fit a fresh RobustScaler on this cluster's oversampled features and apply the same
# fitted scaling to the whole clustered test frame.
scaler = RobustScaler()
X_ov_0 = scaler.fit_transform(X_ov_0)
X_test_0 = scaler.transform(cluster_df_test.values)

In [None]:
# 70/30 split of this cluster's oversampled, scaled data.
# random_state is fixed so the reported scores are reproducible, consistent with the
# seeded split used for the feature-importance model.
x_train2, x_test2, y_train2, y_test2 = train_test_split(X_ov_0, Y_ov_0, train_size=0.7, random_state=42)
print(x_train2.shape, y_train2.shape)
print(x_test2.shape, y_test2.shape)

In [None]:
## RANDOM FOREST CLASSIFIER
# Fit on this cluster's training split, then print the per-class report and macro F1
# on its hold-out split. Note that this rebinds y_pred2.

rf2 = RandomForestClassifier(max_depth=25, n_estimators=1000)
rf2.fit(x_train2, y_train2)
y_pred2 = rf2.predict(x_test2)
print(classification_report(y_test2, y_pred2))
print(f1_score(y_test2, y_pred2, average='macro'))

In [None]:
## MLP Classifier with decreasing node sizes
# Fit on this cluster's training split, then print the per-class report and macro F1
# on its hold-out split. Note that this rebinds y_pred2.

mlp2 = MLPClassifier(hidden_layer_sizes=(120, 95, 70))
mlp2.fit(x_train2, y_train2)
y_pred2 = mlp2.predict(x_test2)
print(classification_report(y_test2, y_pred2))
print(f1_score(y_test2, y_pred2, average='macro'))

In [None]:
## Gradient Boosting Classifier
# Fit on this cluster's training split, then print the per-class report and macro F1
# on its hold-out split. Note that this rebinds y_pred2.

gb2 = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=4,
)
gb2.fit(x_train2, y_train2)
y_pred2 = gb2.predict(x_test2)
print(classification_report(y_test2, y_pred2))
print(f1_score(y_test2, y_pred2, average='macro'))

In [None]:
# Predictions of this cluster's three models, in the order RF, MLP, GB.
cluster_models_2 = (rf2, mlp2, gb2)

# Training
y_pred2tr2, y_pred3tr2, y_pred4tr2 = [clf.predict(x_train2) for clf in cluster_models_2]

# Testing
y_pred2_2, y_pred3_2, y_pred4_2 = [clf.predict(x_test2) for clf in cluster_models_2]

In [None]:
# Final Comparison Table - Cluster 0
# NOTE(review): x_train2 / x_test2 are built from df_grouped_2 (cluster label 2), not
# cluster 0 — confirm which cluster this table is meant to describe.
# accuracy() and f1() return fractions in [0, 1]; they are scaled by 100 here so the
# values agree with the "(%)" column headers.

train_preds2 = [y_pred2tr2, y_pred3tr2, y_pred4tr2]
test_preds2 = [y_pred2_2, y_pred3_2, y_pred4_2]
results2 = pd.DataFrame({
    'Prediction Model': ['Random Forest', 'MLP', 'Gradient Boosting'],
    'Training Accuracy (%)': [100 * accuracy(y_train2, pred) for pred in train_preds2],
    'Testing Accuracy (%)': [100 * accuracy(y_test2, pred) for pred in test_preds2],
    'Testing f1-score (%)': [100 * f1(y_test2, pred) for pred in test_preds2],
})

# Highlight the model with the best hold-out accuracy.
results2.style.highlight_max(color = 'lightgreen', subset = 'Testing Accuracy (%)')

In [None]:
# FOR CLUSTER 4
# Features / target for the rows of cluster 4 (the variables carry a _1 suffix).
X_1 = df_grouped_4.drop(['churn_risk_score'], axis=1).values
Y_1 = df_grouped_4['churn_risk_score'].values

# random_state is fixed so the synthetic samples are reproducible between runs.
oversample = SMOTE(random_state=42)
X_ov_1, Y_ov_1 = oversample.fit_resample(X_1, Y_1)

In [None]:
# Fit a fresh RobustScaler on this cluster's oversampled features and apply the same
# fitted scaling to the whole clustered test frame.
scaler = RobustScaler()
X_ov_1 = scaler.fit_transform(X_ov_1)
X_test_1 = scaler.transform(cluster_df_test.values)

In [None]:
# 70/30 split of this cluster's oversampled, scaled data.
# random_state is fixed so the reported scores are reproducible, consistent with the
# seeded split used for the feature-importance model.
x_train3, x_test3, y_train3, y_test3 = train_test_split(X_ov_1, Y_ov_1, train_size=0.7, random_state=42)
print(x_train3.shape, y_train3.shape)
print(x_test3.shape, y_test3.shape)

In [None]:
## RANDOM FOREST CLASSIFIER
# Fit on this cluster's training split, then print the per-class report and macro F1
# on its hold-out split. Note that this rebinds y_pred3.

rf3 = RandomForestClassifier(max_depth=25, n_estimators=1000)
rf3.fit(x_train3, y_train3)
y_pred3 = rf3.predict(x_test3)
print(classification_report(y_test3, y_pred3))
print(f1_score(y_test3, y_pred3, average='macro'))

In [None]:
## MLP Classifier with 3 hidden layers of decreasing nodes
# Fit on this cluster's training split, then print the per-class report and macro F1
# on its hold-out split. Note that this rebinds y_pred3.

from sklearn.neural_network import MLPClassifier
mlp3 = MLPClassifier(hidden_layer_sizes=(120, 95, 70))
mlp3.fit(x_train3, y_train3)
y_pred3 = mlp3.predict(x_test3)
print(classification_report(y_test3, y_pred3))
print(f1_score(y_test3, y_pred3, average='macro'))

In [None]:
## Gradient Boosting Classifier
# Same hyper-parameters as the other 1000-estimator runs in this notebook

gb3 = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=25,
    min_samples_leaf=4,
    min_samples_split=5,
)
gb3.fit(x_train3, y_train3)
y_pred3 = gb3.predict(x_test3)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test3, y_pred3))
print(f1_score(y_test3, y_pred3, average='macro'))

In [None]:
# Training-split predictions, in the order Random Forest, MLP, Gradient Boosting
y_pred2tr3, y_pred3tr3, y_pred4tr3 = [clf.predict(x_train3) for clf in (rf3, mlp3, gb3)]

# Held-out-split predictions, same model order
y_pred2_3, y_pred3_3, y_pred4_3 = [clf.predict(x_test3) for clf in (rf3, mlp3, gb3)]

In [None]:
# Final comparison table - cluster 4
# One row per model; `accuracy` and `f1` are helpers defined elsewhere in the notebook.
results3 = pd.DataFrame({
    'Prediction Model': ['Random Forest', 'MLP', 'Gradient Boosting'],
    'Training Accuracy (%)': [accuracy(y_train3, pred) for pred in (y_pred2tr3, y_pred3tr3, y_pred4tr3)],
    'Testing Accuracy (%)': [accuracy(y_test3, pred) for pred in (y_pred2_3, y_pred3_3, y_pred4_3)],
    'Testing f1-score (%)': [f1(y_test3, pred) for pred in (y_pred2_3, y_pred3_3, y_pred4_3)],
})
# Highlight the best testing accuracy in the rendered table
results3.style.highlight_max(color='lightgreen', subset='Testing Accuracy (%)')


In [None]:
# FOR CLUSTER 3
# Target is churn_risk_score; features are every other column of the subset.
Y_3 = df_grouped_3['churn_risk_score'].values
X_3 = df_grouped_3.drop(columns=['churn_risk_score']).values

In [None]:
# Oversample the cluster-3 data with SMOTE (default settings).
# NOTE(review): resampling happens before the train/test split done in a later
# cell, so synthetic rows may be derived from rows that land in the test split.
oversample = SMOTE()
X_ov_3, Y_ov_3 = oversample.fit_resample(X_3, Y_3)

In [None]:
# Fit a RobustScaler on the oversampled cluster-3 features and rescale them.
scaler = RobustScaler()
X_ov_3 = scaler.fit_transform(X_ov_3)

# Apply the same fitted scaler to the raw values of the test frame.
# NOTE(review): requires cluster_df_test to have the same column layout as the
# training features - confirm.
X_test_3 = scaler.transform(cluster_df_test.values)

In [None]:
# 70 % of the resampled cluster-3 data goes to training, the rest is held out
x_train4, x_test4, y_train4, y_test4 = train_test_split(X_ov_3, Y_ov_3, train_size=0.7)

# Shapes of (features, target) for the training split, then the held-out split
print(*(part.shape for part in (x_train4, y_train4)))
print(*(part.shape for part in (x_test4, y_test4)))

In [None]:
## RANDOM FOREST CLASSIFIER
# 1000 trees, depth capped at 25, trained on the cluster-3 training split

rf4 = RandomForestClassifier(n_estimators=1000, max_depth=25).fit(x_train4, y_train4)
y_pred4 = rf4.predict(x_test4)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test4, y_pred4))
print(f1_score(y_test4, y_pred4, average='macro'))

In [None]:
## MLP classifier with three hidden layers of decreasing width (120 -> 95 -> 70)

mlp4 = MLPClassifier(hidden_layer_sizes=(120, 95, 70)).fit(x_train4, y_train4)
y_pred4 = mlp4.predict(x_test4)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test4, y_pred4))
print(f1_score(y_test4, y_pred4, average='macro'))

In [None]:
## Gradient Boosting Classifier
# NOTE(review): this run uses 100 estimators / depth 10, whereas the other
# gradient-boosting cells use 1000 / 25 - confirm the difference is intended.

gb4 = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=5,
)
gb4.fit(x_train4, y_train4)
y_pred4 = gb4.predict(x_test4)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test4, y_pred4))
print(f1_score(y_test4, y_pred4, average='macro'))

In [None]:
# Training-split predictions, in the order Random Forest, MLP, Gradient Boosting
y_pred2tr4, y_pred3tr4, y_pred4tr4 = [clf.predict(x_train4) for clf in (rf4, mlp4, gb4)]

# Held-out-split predictions, same model order
y_pred2_4, y_pred3_4, y_pred4_4 = [clf.predict(x_test4) for clf in (rf4, mlp4, gb4)]

In [None]:
# Final comparison table - cluster 3
# One row per model; `accuracy` and `f1` are helpers defined elsewhere in the notebook.
results4 = pd.DataFrame({
    'Prediction Model': ['Random Forest', 'MLP', 'Gradient Boosting'],
    'Training Accuracy (%)': [accuracy(y_train4, pred) for pred in (y_pred2tr4, y_pred3tr4, y_pred4tr4)],
    'Testing Accuracy (%)': [accuracy(y_test4, pred) for pred in (y_pred2_4, y_pred3_4, y_pred4_4)],
    'Testing f1-score (%)': [f1(y_test4, pred) for pred in (y_pred2_4, y_pred3_4, y_pred4_4)],
})

# Highlight the best testing accuracy in the rendered table
results4.style.highlight_max(color='lightgreen', subset='Testing Accuracy (%)')

## --- Churn Prediction with Segmentation Method # 3 -  DBSCAN --- 

In [None]:
# DBSCAN on the first two columns of PCA_components (training data)
dbscan = DBSCAN(eps=0.8, min_samples=9).fit(PCA_components.iloc[:, :2])
db_clusters = dbscan.labels_
print(db_clusters)

# Store each row's DBSCAN label alongside the existing columns
cluster_df['dbscan_cluster'] = db_clusters
cluster_df.head()

In [None]:
# Column means per DBSCAN cluster, with the cluster label kept as a regular column
avg_df_db = cluster_df.groupby('dbscan_cluster', as_index=False).mean()
avg_df_db

In [None]:
# DBSCAN on the first two columns of PCA_components_test, same eps/min_samples
# as the training run.
# NOTE(review): this is an independent fit, so label k here need not describe
# the same group as label k on the training data - confirm before comparing.
dbscan_test = DBSCAN(eps=0.8, min_samples=9).fit(PCA_components_test.iloc[:, :2])
db_clusters_test = dbscan_test.labels_
print(db_clusters_test)

# Store each test row's DBSCAN label alongside the existing columns
cluster_df_test['dbscan_cluster'] = db_clusters_test
cluster_df_test.head()

In [None]:
# Bar charts of four engagement variables from avg_df_db (per-cluster means),
# one panel per variable, x-axis = DBSCAN cluster label.
sns.set_style('white')
f, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 5))
sns.set(font_scale=1.5)
engagement_columns = (
    'days_since_last_login',
    'avg_time_spent',
    'avg_transaction_value',
    'avg_frequency_login_days',
)
for panel, column in zip(ax, engagement_columns):
    sns.barplot(x='dbscan_cluster', y=column, data=avg_df_db, ax=panel, palette='Spectral')
plt.suptitle('Total Count per Variable Based on Cluster', y = 0.9,  fontsize=18, fontweight = 'bold')
# Manual spacing; top=0.8 leaves room for the suptitle placed at y=0.9.
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.8,
                    wspace=0.4,
                    hspace=0.4)

# The former `fig.tight_layout()` call was dropped: this cell's figure is bound
# to `f`, so `fig` referred to a figure from another cell (NameError on a fresh
# kernel) and never affected this plot. The explicit spacing above is kept.
plt.show()


In [None]:
# Bar charts of wallet points, churn score and two membership indicators from
# avg_df_db (per-cluster means), one panel per variable, x-axis = DBSCAN label.
sns.set_style('white')
f, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 5))
sns.set(font_scale=1.5)
profile_columns = (
    'points_in_wallet',
    'churn_risk_score',
    'membership_category_Basic Membership',
    'membership_category_No Membership',
)
for panel, column in zip(ax, profile_columns):
    sns.barplot(x='dbscan_cluster', y=column, data=avg_df_db, ax=panel, palette='Spectral')
# Manual spacing, identical to the previous figure so the two rows line up.
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.8,
                    wspace=0.4,
                    hspace=0.4)

# The former `fig.tight_layout()` call was dropped: this cell's figure is bound
# to `f`, so `fig` referred to a figure from another cell (NameError on a fresh
# kernel) and never affected this plot. The explicit spacing above is kept.
plt.show()

In [None]:
# Creating new datasets based on each DBSCAN cluster (subset datasets).
# Fix: the subsets are now taken from `grouped3` (grouped by 'dbscan_cluster').
# The original read from `grouped`, a groupby object created for an earlier
# segmentation, so the "DBSCAN" subsets did not reflect the DBSCAN labels.
# NOTE(review): get_group raises KeyError if DBSCAN produced fewer than six
# clusters (labels 0-5); noise points (label -1) are not extracted.

grouped3 = cluster_df.groupby('dbscan_cluster')

df_grouped_00 = pd.DataFrame(grouped3.get_group(0))

df_grouped_11 = pd.DataFrame(grouped3.get_group(1))

df_grouped_22 = pd.DataFrame(grouped3.get_group(2))

df_grouped_33 = pd.DataFrame(grouped3.get_group(3))

df_grouped_44 = pd.DataFrame(grouped3.get_group(4))

df_grouped_55 = pd.DataFrame(grouped3.get_group(5))

In [None]:
# Features/target for the DBSCAN subset df_grouped_11 (the variables carry a
# "00" suffix even though the source subset is group 1).
Y_00 = df_grouped_11['churn_risk_score'].values
X_00 = df_grouped_11.drop(columns=['churn_risk_score']).values

# Oversample with SMOTE (default settings).
# NOTE(review): resampling happens before the train/test split done in a later
# cell, so synthetic rows may be derived from rows that land in the test split.
oversample = SMOTE()
X_ov_00, Y_ov_00 = oversample.fit_resample(X_00, Y_00)

In [None]:
# Fit a RobustScaler on the oversampled DBSCAN-subset features and rescale them.
scaler = RobustScaler()
X_ov_00 = scaler.fit_transform(X_ov_00)

# Apply the same fitted scaler to the raw values of the test frame.
# NOTE(review): requires cluster_df_test to have the same column layout as the
# training features - confirm.
X_test_00 = scaler.transform(cluster_df_test.values)

In [None]:
# 70 % of the resampled DBSCAN-subset data goes to training, the rest is held out
x_train00, x_test00, y_train00, y_test00 = train_test_split(X_ov_00, Y_ov_00, train_size=0.7)

# Shapes of (features, target) for the training split, then the held-out split
print(*(part.shape for part in (x_train00, y_train00)))
print(*(part.shape for part in (x_test00, y_test00)))

In [None]:
## RANDOM FOREST CLASSIFIER
# 1000 trees, depth capped at 25, trained on the DBSCAN-subset training split

rf00 = RandomForestClassifier(n_estimators=1000, max_depth=25).fit(x_train00, y_train00)
y_pred00 = rf00.predict(x_test00)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test00, y_pred00))
print(f1_score(y_test00, y_pred00, average='macro'))

In [None]:
## MLP classifier with three hidden layers of decreasing width (120 -> 95 -> 70)

mlp00 = MLPClassifier(hidden_layer_sizes=(120, 95, 70)).fit(x_train00, y_train00)
y_pred00 = mlp00.predict(x_test00)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test00, y_pred00))
print(f1_score(y_test00, y_pred00, average='macro'))

In [None]:
## Gradient Boosting Classifier
# 1000 estimators, depth 25, trained on the DBSCAN-subset training split

gb00 = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=25,
    min_samples_leaf=4,
    min_samples_split=5,
)
gb00.fit(x_train00, y_train00)
y_pred00 = gb00.predict(x_test00)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test00, y_pred00))
print(f1_score(y_test00, y_pred00, average='macro'))

In [None]:
# Training-split predictions, in the order Random Forest, MLP, Gradient Boosting
y_pred2tr00, y_pred3tr00, y_pred4tr00 = [clf.predict(x_train00) for clf in (rf00, mlp00, gb00)]

# Held-out-split predictions, same model order
y_pred20, y_pred30, y_pred40 = [clf.predict(x_test00) for clf in (rf00, mlp00, gb00)]


In [None]:
# Final comparison table - cluster 1 - DBSCAN
# One row per model; `accuracy` and `f1` are helpers defined elsewhere in the notebook.

from sklearn.metrics import accuracy_score

results9 = pd.DataFrame({
    'Prediction Model': ['Random Forest', 'MLP', 'Gradient Boosting'],
    'Training Accuracy (%)': [accuracy(y_train00, pred) for pred in (y_pred2tr00, y_pred3tr00, y_pred4tr00)],
    'Testing Accuracy (%)': [accuracy(y_test00, pred) for pred in (y_pred20, y_pred30, y_pred40)],
    'Testing f1-Score (%)': [f1(y_test00, pred) for pred in (y_pred20, y_pred30, y_pred40)],
})

# Highlight the best testing accuracy in the rendered table
results9.style.highlight_max(color='lightgreen', subset='Testing Accuracy (%)')

## --- Churn Prediction with Segmentation Method # 4 -  GMM --- 

In [None]:
# Grid search over GMM settings on the first two columns of PCA_components:
# every covariance type x 1..11 components, each scored with the silhouette
# and Davies-Bouldin indices. The ten best rows by silhouette are displayed.
cov_type = ['full', 'tied', 'diag', 'spherical']
num_clusters = np.arange(1, 12)
result_columns = ['Covariance Type', '# of Clusters', 'Silhouette Score', 'Davies Bouldin Score']
gmm_features = PCA_components.iloc[:, :2]
result_rows = []
for i in cov_type:
    for x in num_clusters:
        gmm_cluster = GaussianMixture(n_components=x, covariance_type=i, random_state=5)
        clusters = gmm_cluster.fit_predict(gmm_features)
        # Both scores need at least two distinct labels, so skip degenerate fits.
        if len(np.unique(clusters)) >= 2:
            result_rows.append({
                'Covariance Type': i,
                '# of Clusters': x,
                'Silhouette Score': metrics.silhouette_score(gmm_features, clusters),
                'Davies Bouldin Score': metrics.davies_bouldin_score(gmm_features, clusters),
            })
# DataFrame.append was removed in pandas 2.0, so the rows are collected in a
# list and the frame is built once (also avoids re-copying on every iteration).
results_ = pd.DataFrame(result_rows, columns=result_columns)
display(results_.sort_values(by=["Silhouette Score"], ascending=False)[:10])

In [None]:
# Six-component spherical GMM fitted on the first two columns of PCA_components
gmm2 = GaussianMixture(
    n_components=6,
    covariance_type='spherical',
    max_iter=2000,
    random_state=5,
)
gmm2.fit(PCA_components.iloc[:, :2])
labels = gmm2.predict(PCA_components.iloc[:, :2])

# Rebuild cluster_df from final_df_train and attach the GMM label per row.
# NOTE(review): pd.DataFrame(frame) may share data with final_df_train depending
# on the pandas version, so columns added here or earlier could appear in both.
cluster_df = pd.DataFrame(final_df_train)
cluster_df['gmm_cluster'] = labels
cluster_df.head()

In [None]:
# Column means per GMM cluster, with the cluster label kept as a regular column
avg_df_gmm = cluster_df.groupby('gmm_cluster', as_index=False).mean()
avg_df_gmm

In [None]:
# Bar charts of four engagement variables from avg_df_gmm (per-cluster means),
# one panel per variable, x-axis = GMM cluster label.
sns.set_style('white')
f, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 5))
sns.set(font_scale=1.5)
engagement_columns = (
    'days_since_last_login',
    'avg_time_spent',
    'avg_transaction_value',
    'avg_frequency_login_days',
)
for panel, column in zip(ax, engagement_columns):
    sns.barplot(x='gmm_cluster', y=column, data=avg_df_gmm, ax=panel, palette='Spectral')
plt.suptitle('Total Count per Variable Based on Cluster', y = 0.9,  fontsize=18, fontweight = 'bold')
# Manual spacing; top=0.8 leaves room for the suptitle placed at y=0.9.
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.8,
                    wspace=0.4,
                    hspace=0.4)

# The former `fig.tight_layout()` call was dropped: this cell's figure is bound
# to `f`, so `fig` referred to a figure from another cell (NameError on a fresh
# kernel) and never affected this plot. The explicit spacing above is kept.
plt.show()


In [None]:
# Bar charts of wallet points, churn score and two membership indicators from
# avg_df_gmm (per-cluster means), one panel per variable, x-axis = GMM label.
sns.set_style('white')
f, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 5))
sns.set(font_scale=1.5)
profile_columns = (
    'points_in_wallet',
    'churn_risk_score',
    'membership_category_Basic Membership',
    'membership_category_No Membership',
)
for panel, column in zip(ax, profile_columns):
    sns.barplot(x='gmm_cluster', y=column, data=avg_df_gmm, ax=panel, palette='Spectral')
# Manual spacing, identical to the previous figure so the two rows line up.
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.8,
                    wspace=0.4,
                    hspace=0.4)

# The former `fig.tight_layout()` call was dropped: this cell's figure is bound
# to `f`, so `fig` referred to a figure from another cell (NameError on a fresh
# kernel) and never affected this plot. The explicit spacing above is kept.
plt.show()

In [None]:
# Grid search over GMM settings on the first two columns of PCA_components_test:
# every covariance type x 1..11 components, each scored with the silhouette
# and Davies-Bouldin indices. The ten best rows by silhouette are displayed.
cov_type = ['full', 'tied', 'diag', 'spherical']
num_clusters = np.arange(1, 12)
result_columns = ['Covariance Type', '# of Clusters', 'Silhouette Score', 'Davies Bouldin Score']
gmm_features_test = PCA_components_test.iloc[:, :2]
result_rows = []
for i in cov_type:
    for x in num_clusters:
        # Fix: n_components must follow the loop variable `x`; the original
        # passed `n`, which is not defined in this cell.
        gmm_cluster = GaussianMixture(n_components=x, covariance_type=i, random_state=5)
        clusters = gmm_cluster.fit_predict(gmm_features_test)
        # Both scores need at least two distinct labels, so skip degenerate fits.
        if len(np.unique(clusters)) >= 2:
            result_rows.append({
                # Fix: key now matches the declared 'Covariance Type' column
                # (the original 'Covariance type' created a second column).
                'Covariance Type': i,
                '# of Clusters': x,
                'Silhouette Score': metrics.silhouette_score(gmm_features_test, clusters),
                'Davies Bouldin Score': metrics.davies_bouldin_score(gmm_features_test, clusters),
            })
# DataFrame.append was removed in pandas 2.0, so the rows are collected in a
# list and the frame is built once.
results_ = pd.DataFrame(result_rows, columns=result_columns)
display(results_.sort_values(by=["Silhouette Score"], ascending=False)[:10])

In [None]:
# Six-component full-covariance GMM fitted on the first two columns of
# PCA_components_test.
# NOTE(review): the training-side model uses covariance_type='spherical' and is
# fitted separately, so labels here need not correspond to the training labels.
gmm3 = GaussianMixture(
    n_components=6,
    covariance_type='full',
    max_iter=2000,
    random_state=5,
)
gmm3.fit(PCA_components_test.iloc[:, :2])
labels_test = gmm3.predict(PCA_components_test.iloc[:, :2])

# Rebuild cluster_df_test from final_df_test and attach the GMM label per row
cluster_df_test = pd.DataFrame(final_df_test)
cluster_df_test['gmm_cluster'] = labels_test
cluster_df_test

In [None]:
# One subset frame per GMM cluster label (0-5), taken from cluster_df
grouped3 = cluster_df.groupby('gmm_cluster')

df2_grouped_0 = pd.DataFrame(grouped3.get_group(0))

df2_grouped_1 = pd.DataFrame(grouped3.get_group(1))

df2_grouped_2 = pd.DataFrame(grouped3.get_group(2))

df2_grouped_3 = pd.DataFrame(grouped3.get_group(3))

df2_grouped_4 = pd.DataFrame(grouped3.get_group(4))

df2_grouped_5 = pd.DataFrame(grouped3.get_group(5))


In [None]:
# Features/target for GMM cluster 0: target is churn_risk_score, features are
# every other column of the subset.
# NOTE(review): that includes 'gmm_cluster' and any other cluster-label columns
# still present in cluster_df - confirm they are meant to be model inputs.
Y_10 = df2_grouped_0['churn_risk_score'].values
X_10 = df2_grouped_0.drop(columns=['churn_risk_score']).values

# Oversample with SMOTE (default settings).
# NOTE(review): resampling happens before the train/test split done in a later
# cell, so synthetic rows may be derived from rows that land in the test split.
oversample = SMOTE()
X_ov_10, Y_ov_10 = oversample.fit_resample(X_10, Y_10)

In [None]:
# 70 % of the resampled GMM cluster-0 data goes to training, the rest is held out.
# NOTE(review): unlike the earlier sections, no RobustScaler step precedes this
# split - confirm that is intended.
x_train5, x_test5, y_train5, y_test5 = train_test_split(X_ov_10, Y_ov_10, train_size=0.7)

# Shapes of (features, target) for the training split, then the held-out split
print(*(part.shape for part in (x_train5, y_train5)))
print(*(part.shape for part in (x_test5, y_test5)))

In [None]:
## RANDOM FOREST CLASSIFIER
# 1000 trees, depth capped at 25, trained on the GMM cluster-0 training split

rf10 = RandomForestClassifier(n_estimators=1000, max_depth=25).fit(x_train5, y_train5)
y_pred5 = rf10.predict(x_test5)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test5, y_pred5))
print(f1_score(y_test5, y_pred5, average='macro'))

In [None]:
## MLP classifier with three hidden layers of decreasing width (120 -> 95 -> 70)

mlp10 = MLPClassifier(hidden_layer_sizes=(120, 95, 70)).fit(x_train5, y_train5)
y_pred5 = mlp10.predict(x_test5)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test5, y_pred5))
print(f1_score(y_test5, y_pred5, average='macro'))

In [None]:
## Gradient Boosting Classifier
# 1000 estimators, depth 25, trained on the GMM cluster-0 training split

gb10 = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=25,
    min_samples_leaf=4,
    min_samples_split=5,
)
gb10.fit(x_train5, y_train5)
y_pred5 = gb10.predict(x_test5)

# Per-class report followed by the macro-averaged F1 on the held-out split
print(classification_report(y_test5, y_pred5))
print(f1_score(y_test5, y_pred5, average='macro'))

In [None]:
# Confusion-matrix heatmap for the gradient boosting model on the held-out
# GMM cluster-0 split (rows = actual label, columns = predicted label).
fig, ax = plt.subplots(figsize = (10,8))
sns.set_style('white')
# Predict with gb10 explicitly: `y_pred5` is overwritten by every model cell,
# so relying on it would plot whichever model happened to run last.
cm = confusion_matrix(y_true=y_test5, y_pred=gb10.predict(x_test5))
sns.heatmap(cm, annot=True, ax = ax, cmap = 'Blues', fmt = 'g')
# Title corrected: the plot is a confusion matrix, not a correlation plot.
plt.title('Confusion Matrix for Gradient Boosting Classifier', y = 1.02,  fontsize=18, fontweight = 'bold')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
plt.show()

In [None]:
# Training-split predictions, in the order Random Forest, MLP, Gradient Boosting
y_pred2tr10, y_pred3tr10, y_pred4tr10 = [clf.predict(x_train5) for clf in (rf10, mlp10, gb10)]

# Held-out-split predictions, same model order
y_pred2_10, y_pred3_10, y_pred4_10 = [clf.predict(x_test5) for clf in (rf10, mlp10, gb10)]

In [None]:
# Final comparison table - cluster 0 (GMM segmentation)
# One row per model; `accuracy` and `f1` are helpers defined elsewhere in the notebook.
results10 = pd.DataFrame({
    'Prediction Model': ['Random Forest', 'MLP', 'Gradient Boosting'],
    'Training Accuracy (%)': [accuracy(y_train5, pred) for pred in (y_pred2tr10, y_pred3tr10, y_pred4tr10)],
    'Testing Accuracy (%)': [accuracy(y_test5, pred) for pred in (y_pred2_10, y_pred3_10, y_pred4_10)],
    'Testing f1-score (%)': [f1(y_test5, pred) for pred in (y_pred2_10, y_pred3_10, y_pred4_10)],
})
# Highlight the best testing accuracy in the rendered table
results10.style.highlight_max(color='lightgreen', subset='Testing Accuracy (%)')
