In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Remote CSV locations of the customer-churn training and testing splits.
trainDatasetfilePath = 'https://raw.githubusercontent.com/shekharmnnit/ML/main/Customer%20Churn%20Dataset/customer_churn_dataset-training-master.csv'
testDatasetPath= 'https://raw.githubusercontent.com/shekharmnnit/ML/main/Customer%20Churn%20Dataset/customer_churn_dataset-testing-master.csv'


def _load_churn_csv(csv_path):
    """Read one churn CSV and normalise its columns.

    Drops the ``CustomerID`` column and replaces spaces in the remaining
    column names with underscores.

    Parameters
    ----------
    csv_path : str
        Path or URL understood by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the identifier removed and columns renamed.
    """
    frame = pd.read_csv(csv_path).drop(columns='CustomerID')
    renamed = {col: col.replace(' ', '_') for col in frame.columns}
    return frame.rename(columns=renamed)


training_df = _load_churn_csv(trainDatasetfilePath)
testing_df = _load_churn_csv(testDatasetPath)

print("----Training Data set----------")
print(training_df.head(5).to_string())
print(f"Number of observation in training dataset: {training_df.shape[0]}")
print("Null value count")
print(training_df.isna().sum())
# Count rows holding at least one null; isna().sum().sum() would count null cells instead.
print(f"Training: Total number of rows with null value = {training_df.isna().any(axis=1).sum()}")

print("----Testing Data set----------")
print(testing_df.head(5).to_string())
print(f"Number of observation in test dataset: {testing_df.shape[0]}")
print("Null value count")
print(testing_df.isna().sum())
print(f"Testing: Total number of rows with null value = {testing_df.isna().any(axis=1).sum()}")

print('-------------------Train Data cleaning-------------------')
# Rows whose Age is missing are shown and then removed from the training data.
ageMissing = training_df['Age'].isna()
print(training_df[ageMissing].to_string()) # row detail with na value
print("199295 row has null value for all the columns, so removing 199295")
training_df = training_df.drop(training_df[ageMissing].index)
print(training_df.isna().sum())

HTTPError: HTTP Error 404: Not Found

In [None]:
# Count fully duplicated rows once per dataset and reuse the counts below.
trainDuplicateCount = training_df.duplicated().sum()
testDuplicateCount = testing_df.duplicated().sum()
print(f"Train data duplicated= {trainDuplicateCount}")
print(f"Test data duplicated= {testDuplicateCount}")
# Only claim that nothing needs removing when both counts really are zero.
if trainDuplicateCount == 0 and testDuplicateCount == 0:
    print("no need to remove duplicate data as duplicate data is 0")
else:
    print("duplicate rows found; review them before modelling")

In [None]:
# Down sampling
# Class counts of the Churn target in both datasets.
train_class_distribution = training_df['Churn'].value_counts()
print("Churn of Train data")
print(train_class_distribution)
test_class_distribution = testing_df['Churn'].value_counts()
print('Churn of test data')
print(test_class_distribution)

# Share of each class; .get(label, 0) falls back to 0 when a class is absent.
train_proportion_0 = train_class_distribution.get(0, 0) / len(training_df['Churn'])
train_proportion_1 = train_class_distribution.get(1, 0) / len(training_df['Churn'])
test_proportion_0 = test_class_distribution.get(0, 0) / len(testing_df['Churn'])
test_proportion_1 = test_class_distribution.get(1, 0) / len(testing_df['Churn'])
# Built-in round() works for NumPy scalars and for the plain float produced by
# the 0 fallback, whereas the .round method exists only on NumPy scalars.
print(f'train-churn-0=   {round(train_proportion_0, 2)}\n train-churn-1=  {round(train_proportion_1, 2)}\n test-churn-0=   {round(test_proportion_0, 2)}\n test-churn-1=   {round(test_proportion_1, 2)}')


In [None]:
# Bar chart of how many training rows fall in each Churn class.
sns.set(style="whitegrid")
trainChurnAx = sns.countplot(x="Churn", data=training_df)

# Title and axis labels are applied on the axes seaborn drew into.
trainChurnAx.set_title("traning- Churn Count Plot")
trainChurnAx.set_xlabel("Churn")
trainChurnAx.set_ylabel("Count")
plt.show()


In [None]:
# Remove a seeded random subset of the churn == 1 rows from the training data.
churn_1_rows = training_df.loc[training_df['Churn'] == 1]
random_sample = churn_1_rows.sample(random_state=5508, n=59166)
training_df = training_df.drop(index=random_sample.index)

# Re-draw the training class counts after the removal.
sns.set(style="whitegrid")
trainChurnAfterAx = sns.countplot(x="Churn", data=training_df)
trainChurnAfterAx.set_title("traning- Churn Count Plot")
trainChurnAfterAx.set_xlabel("Churn")
trainChurnAfterAx.set_ylabel("Count")
plt.show()

In [None]:
# Bar chart of how many test rows fall in each Churn class.
sns.set(style="whitegrid")
testChurnAx = sns.countplot(x="Churn", data=testing_df)

# Title and axis labels are applied on the axes seaborn drew into.
testChurnAx.set_title("testing- Churn Count Plot")
testChurnAx.set_xlabel("Churn")
testChurnAx.set_ylabel("Count")
plt.show()


In [None]:
# Remove a seeded random subset of the churn == 0 rows from the test data.
churn_0_rows = testing_df[testing_df['Churn'] == 0]
random_sample = churn_0_rows.sample(n=3388, random_state=5508)
testing_df = testing_df.drop(random_sample.index)

# Test-data count plot after the removal.
sns.set(style="whitegrid")
sns.countplot(data=testing_df, x="Churn")

# The figure shows the test data, so it is titled accordingly.
plt.title("testing- Churn Count Plot")
plt.xlabel("Churn")
plt.ylabel("Count")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box plot per numeric training feature, laid out side by side in a single
# row, to inspect each feature's spread and outliers.
outlierFeatures = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']

plt.figure(figsize=(20, 20))
for panelNumber, feature in enumerate(outlierFeatures, start=1):
    # Panels are numbered from 1 within the 1 x 7 grid.
    plt.subplot(1, 7, panelNumber)
    sns.boxplot(x=training_df[feature])
plt.show()

In [None]:
# One box plot per numeric training feature, split by Churn class, laid out
# side by side in a single row.
plt.figure(figsize=(20, 20))
for i,feature in enumerate(['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']):
    plt.subplot(1,7,i+1)
    # Pass the frame by keyword: the position of `data` in seaborn's signature
    # differs between releases, the keyword form does not.
    sns.boxplot(data=training_df, x='Churn', y=feature)
plt.show()

In [None]:
# Covariance Matrix display
def Standardized(dataframe):
    """Return the z-scored version of a pandas DataFrame or Series.

    Each column (or the Series itself) has its mean subtracted and is then
    divided by its standard deviation as computed by ``.std()``.
    """
    centered = dataframe - dataframe.mean()
    spread = dataframe.std()
    return centered / spread
# Columns that get standardized versus columns that are appended unchanged.
heatmapNumericCols = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']
heatmapPassThroughCols = ['Gender_Male','Subscription_Type_Premium','Subscription_Type_Standard','Contract_Length_Monthly','Contract_Length_Quarterly','Churn']

# One-hot encode the categorical columns (first level dropped), then place the
# standardized numeric columns next to the dummy columns and the Churn target.
encodingTrainDFForHitmap = pd.get_dummies(
    training_df,
    columns=['Gender', 'Subscription_Type', 'Contract_Length'],
    drop_first=True,
)
standardizedTrainDfForHitmap = pd.concat(
    [
        Standardized(encodingTrainDFForHitmap[heatmapNumericCols]),
        encodingTrainDFForHitmap[heatmapPassThroughCols],
    ],
    axis=1,
)

# Annotated heatmap of the sample covariance matrix of that combined frame.
covariance_matrix = standardizedTrainDfForHitmap.cov()
covarianceAx = sns.heatmap(covariance_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
covarianceAx.set_title('Sample Covariance Matrix Heatmap')
plt.show()


In [None]:
# Annotated heatmap of the pairwise correlation coefficients of the same
# combined frame used for the covariance heatmap.
Correlation_matrix = standardizedTrainDfForHitmap.corr()
correlationAx = sns.heatmap(Correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
correlationAx.set_title('Sample Correlation Matrix Heatmap')
plt.show()

In [None]:
# Random Forest Analysis
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
import pandas as pd
# def Standardized(dataframe):
#     return (dataframe - dataframe.mean()) / dataframe.std()

# Column groups shared by the training and testing feature matrices.
rfCategoricalCols = ['Gender', 'Subscription_Type', 'Contract_Length']
rfNumericCols = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']
rfDummyCols = ['Gender_Male','Subscription_Type_Premium','Subscription_Type_Standard','Contract_Length_Monthly','Contract_Length_Quarterly']

# Training features: standardized numeric columns plus the dummy columns.
encodingTrainDF = pd.get_dummies(training_df, columns=rfCategoricalCols, drop_first=True)
trainNumeric = encodingTrainDF[rfNumericCols]
standardizedTrainDf = Standardized(trainNumeric)
standardizedTrainDf = pd.concat([standardizedTrainDf, encodingTrainDF[rfDummyCols]], axis=1)

X_train = standardizedTrainDf
y_train = training_df['Churn']

# Testing features: scaled with the TRAINING mean and standard deviation so
# that both matrices live on the same scale; re-estimating the statistics on
# the test set would shift and rescale the test features differently.
encodingTestDF = pd.get_dummies(testing_df, columns=rfCategoricalCols, drop_first=True)
standardizedTestDf = (encodingTestDF[rfNumericCols] - trainNumeric.mean()) / trainNumeric.std()
standardizedTestDf = pd.concat([standardizedTestDf, encodingTestDF[rfDummyCols]], axis=1)

X_test= standardizedTestDf
y_test = testing_df['Churn']

# print(X_test.head(5).to_string())

In [None]:
# Fit a seeded random forest regressor on the training features and keep its
# per-feature importances.
rf = RandomForestRegressor(random_state=5805)
rf.fit(X_train, y_train)
featureImportancs = rf.feature_importances_

# Ascending order of importance, applied to both the scores and the names.
indices = featureImportancs.argsort()
sortedFeatureImportancs = featureImportancs[indices]
sortedFeatureName = X_train.columns[indices]

In [None]:
# Horizontal bar chart of the sorted random-forest feature importances, one
# bar per feature, labelled with the feature name.
barPositions = range(sortedFeatureImportancs.size)
plt.figure(figsize=(12,8))
plt.barh(barPositions, sortedFeatureImportancs)
plt.yticks(range(sortedFeatureName.size), sortedFeatureName)
plt.title('Feature Importance vs Feature')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.grid(True)
plt.show()

In [None]:
# Split the features into kept / eliminated by comparing each random-forest
# importance with a fixed threshold, and print every importance.
threshold = 0.0015
selectedFeatures = [
    featureName
    for featureName, importance in zip(sortedFeatureName, sortedFeatureImportancs)
    if importance >= threshold
]
eliminatedFeatures = [
    featureName
    for featureName, importance in zip(sortedFeatureName, sortedFeatureImportancs)
    if importance < threshold
]
for featureName, importance in zip(sortedFeatureName, sortedFeatureImportancs):
    print(f'{featureName}= {importance.round(4)}')
print("selected features= ", selectedFeatures)
print("eliminated Features= ", eliminatedFeatures)

In [None]:
# PCA-Principal Component Analysis
# X_train is standardized
from sklearn.decomposition import PCA
import copy
# Work on an independent float64 copy of the training features.
pcaDatasetDF = copy.deepcopy(X_train).astype(np.float64)
print(f'original data conditional number={np.linalg.cond(pcaDatasetDF)}')

# Fit PCA with every component kept, then project the data.
pca = PCA()
pca.fit(pcaDatasetDF)
pcaCoordinate = pca.transform(pcaDatasetDF)

# Smallest number of leading components whose cumulative explained-variance
# ratio reaches 0.95 (argmax returns the first True position, zero-based).
EVR = pca.explained_variance_ratio_
CVR = EVR.cumsum() # cumulative explained variance
numComponents = (CVR >= 0.95).argmax() + 1
print(f"number of Principal Component = {numComponents}")

In [None]:
# Refit PCA keeping only the selected number of components and project the
# training features onto them.
pca = PCA(n_components=numComponents)
pca.fit(pcaDatasetDF)
pcaCoordinate = pca.transform(pcaDatasetDF)


In [None]:
# Wrap the projected coordinates in a DataFrame whose columns are named
# Component-1, Component-2, ... and report its condition number.
newColumnNames = {position: 'Component-' + str(position + 1) for position in range(pcaCoordinate.shape[1])}
newPCAdf = pd.DataFrame(pcaCoordinate).rename(columns=newColumnNames)
print(f'transform data conditional number={np.linalg.cond(newPCAdf)}')
print(newPCAdf.head(5).to_string())

In [None]:
# Cumulative explained variance against the number of components, with the
# 0.95 threshold and the selected component count marked.
pcaComponentCounts = np.arange(1, pcaDatasetDF.shape[1] + 1)
pcaFig, pcaAx = plt.subplots(figsize=(8, 6))
pcaAx.plot(pcaComponentCounts, CVR, marker='o')
pcaAx.set_xlabel('Number of Features')
pcaAx.set_ylabel('Cumulative Explained Variance')
pcaAx.axhline(y=0.95, color='red', label='95% Threshold')
pcaAx.axvline(x=numComponents, color='green', label=f'{numComponents} Features')
pcaAx.legend()
pcaAx.grid(True)
plt.show()

In [None]:
# SVD-Singular Value Decomposition Analysis
# X_train is standardized
from sklearn.decomposition import TruncatedSVD
# Work on an independent copy of the training features.
svdDatasetDF=copy.deepcopy(X_train)
# Seed the decomposition so the component count and coordinates are the same
# on every run; TruncatedSVD's default solver is randomized.
svd=TruncatedSVD(n_components=svdDatasetDF.shape[1], random_state=5805)
svd.fit(svdDatasetDF)
svdCoordinate=svd.transform(svdDatasetDF)

# Smallest number of leading components whose cumulative explained-variance
# ratio reaches 0.95 (argmax returns the first True position, zero-based).
EVR= svd.explained_variance_ratio_
CVR = np.cumsum(EVR) # cumulative explained variance
numComponents = np.argmax(CVR >= 0.95)+1
print("number of Component = "+ str(numComponents))

In [None]:
# SVD-Singular Value Decomposition Analysis
# Refit keeping only the selected number of components; seeded so the
# randomized solver returns the same coordinates on every run.
svd=TruncatedSVD(n_components=numComponents, random_state=5805)
svd.fit(svdDatasetDF)
svdCoordinate=svd.transform(svdDatasetDF)

In [None]:
# Wrap the SVD coordinates in a DataFrame whose columns are named
# Component-1, Component-2, ... and show the first rows.
newColumnNames = {position: 'Component-' + str(position + 1) for position in range(svdCoordinate.shape[1])}
newSVDdf = pd.DataFrame(svdCoordinate).rename(columns=newColumnNames)
print(newSVDdf.head().to_string())

In [None]:
# Cumulative explained variance against the number of SVD components, with
# the 0.95 threshold and the selected component count marked.
svdComponentCounts = np.arange(1, svdDatasetDF.shape[1] + 1)
svdFig, svdAx = plt.subplots(figsize=(8, 6))
svdAx.plot(svdComponentCounts, CVR, marker='o')
svdAx.set_xlabel('Number of Features')
svdAx.set_ylabel('Cumulative Explained Variance')
svdAx.axhline(y=0.95, color='red', label='95% Threshold')
svdAx.axvline(x=numComponents, color='green', label=f'{numComponents} Features')
svdAx.legend()
svdAx.grid(True)
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor (VIF) of every column of X_train.
# Cast to float, not int: the numeric columns of X_train are standardized, so
# an integer cast would truncate them toward zero and distort every VIF.
p_train = X_train.astype(float)

# One VIF per column, computed against all the other columns.
vif_data = pd.DataFrame({
    "Feature": p_train.columns,
    "VIF": [variance_inflation_factor(p_train.values, i) for i in range(p_train.shape[1])],
})

# Sort the features by VIF in descending order
vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True)

# Print the VIF values
print(vif_data)

In [None]:
# Print the VIF values
# print(vif_data[Feature])

In [None]:
# VIF
# Horizontal bar chart of the variance inflation factor of each feature.
plt.figure(figsize=(12,8))
plt.barh(range(vif_data['VIF'].size), vif_data['VIF'])
plt.yticks(range(vif_data['Feature'].size), vif_data['Feature'])
# The bars are VIF values, so the axis and title say so.
plt.xlabel('VIF')
plt.ylabel('Features')
plt.title('VIF vs Feature')
plt.grid(True)
plt.show()

In [None]:
# T-test analysis - OLS
# Design matrices for a regression whose target is Support_Calls; Churn is
# used here as one of the predictors.
# Columns are selected with lists: pandas 2.x rejects a set as an indexer, and
# a list also fixes the column order.
olsCategoricalCols = ['Gender', 'Subscription_Type', 'Contract_Length']
olsNumericCols = ['Age','Tenure','Usage_Frequency','Payment_Delay','Last_Interaction','Total_Spend']
olsPassThroughCols = ['Gender_Male','Subscription_Type_Premium','Subscription_Type_Standard','Contract_Length_Monthly','Contract_Length_Quarterly','Churn']

OLSencodingTrainDF = pd.get_dummies(training_df, columns=olsCategoricalCols, drop_first=True)
OLSstandardizedTrainDf = Standardized(OLSencodingTrainDF[olsNumericCols])
# Dummy columns are cast to float so the whole design matrix is numeric
# whatever dtype get_dummies produced for them.
OLSstandardizedTrainDf = pd.concat([OLSstandardizedTrainDf, OLSencodingTrainDF[olsPassThroughCols].astype(float)], axis=1)

X_train_ols = OLSstandardizedTrainDf
y_train_ols = Standardized(training_df['Support_Calls'])


OLSencodingTestDF = pd.get_dummies(testing_df, columns=olsCategoricalCols, drop_first=True)
OLSstandardizedTestDf = Standardized(OLSencodingTestDF[olsNumericCols])
OLSstandardizedTestDf = pd.concat([OLSstandardizedTestDf, OLSencodingTestDF[olsPassThroughCols].astype(float)], axis=1)

X_test_ols= OLSstandardizedTestDf
y_test_ols = Standardized(testing_df['Support_Calls'])

# add const
X_train_ols = sm.add_constant(X_train_ols)
X_test_ols = sm.add_constant(X_test_ols)

In [None]:
# T-test analysis - OLS
import statsmodels.api as sm

# Fit the full model.
OLSmodel = sm.OLS(y_train_ols,X_train_ols).fit()

# Start the elimination log with this model: the coefficient with the largest
# p-value plus the model's AIC, BIC and adjusted R^2.
# Built directly from the record because DataFrame.append no longer exists in
# pandas 2.x.
results = pd.DataFrame(
    [{'Features': OLSmodel.pvalues.idxmax(),
      'AIC': OLSmodel.aic,
      'BIC': OLSmodel.bic,
      'Adj_R2': OLSmodel.rsquared_adj,
      'P-value': OLSmodel.pvalues.max()}],
    columns=['Features', 'AIC', 'BIC', 'Adj_R2', 'P-value'],
)


In [None]:
# T-test analysis - OLS
# Print the statsmodels summary of the most recently fitted OLS model.
print(OLSmodel.summary())


In [None]:
# T-test analysis - OLS
# Backward elimination step: remove Subscription_Type_Premium from the
# training design matrix before the next fit.
X_train_ols = X_train_ols.drop('Subscription_Type_Premium', axis=1)

In [None]:
# T-test analysis - OLS
# Refit on the current training design matrix and log the model.
OLSmodel = sm.OLS(y_train_ols,X_train_ols).fit()
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
results = pd.concat([results, pd.DataFrame([{'Features': OLSmodel.pvalues.idxmax(),
      'AIC': OLSmodel.aic,
      'BIC': OLSmodel.bic,
      'Adj_R2': OLSmodel.rsquared_adj,
      'P-value': OLSmodel.pvalues.max()}])], ignore_index=True)
print(OLSmodel.summary())

In [None]:
# T-test analysis - OLS
# Backward elimination step: drop Subscription_Type_Standard, refit, log.
X_train_ols= X_train_ols.drop(columns=['Subscription_Type_Standard'])
OLSmodel = sm.OLS(y_train_ols,X_train_ols).fit()
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
results = pd.concat([results, pd.DataFrame([{'Features': OLSmodel.pvalues.idxmax(),
      'AIC': OLSmodel.aic,
      'BIC': OLSmodel.bic,
      'Adj_R2': OLSmodel.rsquared_adj,
      'P-value': OLSmodel.pvalues.max()}])], ignore_index=True)
print(OLSmodel.summary())


In [None]:
# T-test analysis - OLS
# Backward elimination step: drop Contract_Length_Quarterly, refit, log.
X_train_ols= X_train_ols.drop(columns=['Contract_Length_Quarterly'])
OLSmodel = sm.OLS(y_train_ols,X_train_ols).fit()
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
results = pd.concat([results, pd.DataFrame([{'Features': OLSmodel.pvalues.idxmax(),
      'AIC': OLSmodel.aic,
      'BIC': OLSmodel.bic,
      'Adj_R2': OLSmodel.rsquared_adj,
      'P-value': OLSmodel.pvalues.max()}])], ignore_index=True)
print(OLSmodel.summary())

In [None]:
# T-test analysis - OLS
# Backward elimination step: drop Tenure, refit, log.
X_train_ols= X_train_ols.drop(columns=['Tenure'])
OLSmodel = sm.OLS(y_train_ols,X_train_ols).fit()
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
results = pd.concat([results, pd.DataFrame([{'Features': OLSmodel.pvalues.idxmax(),
      'AIC': OLSmodel.aic,
      'BIC': OLSmodel.bic,
      'Adj_R2': OLSmodel.rsquared_adj,
      'P-value': OLSmodel.pvalues.max()}])], ignore_index=True)
print(OLSmodel.summary())

In [None]:
# T-test analysis - OLS
# Last backward elimination step: remove Usage_Frequency and refit.
# This final model is not added to the `results` log.
X_train_ols = X_train_ols.drop('Usage_Frequency', axis=1)
OLSmodel = sm.OLS(y_train_ols, X_train_ols).fit()
print(OLSmodel.summary())

In [None]:
# T-test analysis - OLS
print('dropped feature')
print(results.round(6))
# Keep in the test design matrix exactly the columns, in the same order, that
# the final training design matrix has. Selecting by the training columns can
# be re-run safely and does not depend on the names logged in `results`
# matching the columns that were actually dropped from the training matrix.
X_test_ols = X_test_ols[X_train_ols.columns]

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error

# Predict the standardized Support_Calls target for the test design matrix.
OLStargetPred= OLSmodel.predict(X_test_ols)
originalTestTaarget= testing_df['Support_Calls'] # non Standardized Target test

# Undo the standardization with the mean / standard deviation of the original
# test target so both series are on the Support_Calls scale.
deStandardizedTargetTest = y_test_ols * originalTestTaarget.std() + originalTestTaarget.mean()
deStandardizedTargetPred= OLStargetPred * originalTestTaarget.std() + originalTestTaarget.mean()

# Replace the original row labels by 0..n-1 so both series share one x axis.
deStandardizedTargetTest= deStandardizedTargetTest.reset_index(drop=True)
deStandardizedTargetPred= deStandardizedTargetPred.reset_index(drop=True)

# plot -- the target is Support_Calls, so the labels name it.
plt.plot(deStandardizedTargetTest, label='Actual Support Calls')
plt.plot(deStandardizedTargetPred, label='Predicted Support Calls')
plt.xlabel("Observation")
plt.ylabel("Support Calls")
plt.title(' Actual vs predicted')
plt.legend()
plt.grid(True)
plt.show()
# accuracy
mse2 = mean_squared_error(deStandardizedTargetTest, deStandardizedTargetPred)
print("Mean Squared Error:", round(mse2, 3))

In [None]:
# Shrink the training data to reduce the load on model building: a seeded
# random subset of the same size is removed from each Churn class in turn.
rowsRemovedPerClass = 165833

churn_1_rows = training_df.loc[training_df['Churn'] == 1]
random_sample = churn_1_rows.sample(random_state=5508, n=rowsRemovedPerClass)
training_df = training_df.drop(index=random_sample.index)

churn_0_rows = training_df.loc[training_df['Churn'] == 0]
random_sample = churn_0_rows.sample(random_state=5508, n=rowsRemovedPerClass)
training_df = training_df.drop(index=random_sample.index)

print(training_df.shape)

In [None]:
# DecisionTreeClassifier
import pandas as pd
import seaborn as sbn
from sklearn.tree import DecisionTreeClassifier
# Feature columns fed to the decision tree, shared by train and test.
dtCategoricalCols = ['Gender', 'Subscription_Type', 'Contract_Length']
dtFeatureCols = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]

# One-hot encode (first level dropped), then split into features and target.
encodingTrainDFForDT = pd.get_dummies(training_df, columns=dtCategoricalCols, drop_first=True)
XTrain_DT = encodingTrainDFForDT[dtFeatureCols]
yTrain_DT = encodingTrainDFForDT['Churn']

encodingTestDFForDT = pd.get_dummies(testing_df, columns=dtCategoricalCols, drop_first=True)
XTest_DT = encodingTestDFForDT[dtFeatureCols]
yTest_DT = encodingTestDFForDT['Churn']

In [None]:
# DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, roc_curve
from prettytable import PrettyTable

# Seeded tree fitted on every feature; report its accuracy on the test data.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(XTrain_DT, yTrain_DT)
yTrainPredicted_DT = DTclf.predict(XTrain_DT)
yTestPredicted_DT = DTclf.predict(XTest_DT)
baselineTestAccuracy_DT = accuracy_score(yTest_DT, yTestPredicted_DT)
print(f'DecisionTree Test accuracy {round(baselineTestAccuracy_DT, 5)}')

# Table of the tree's importance for each feature, sorted on the importance column.
table = PrettyTable()
table.field_names = ["feature", "Feature Importances"]
featureColumns = XTrain_DT.columns
for featureName, importance in zip(featureColumns, DTclf.feature_importances_):
    table.add_row([featureName, importance.round(5)])
table.sortby = "Feature Importances"
print(table)

In [None]:
# DecisionTreeClassifier
# Reduced feature matrices without Contract_Length_Quarterly.
newXTrain_DT = XTrain_DT.drop(columns=['Contract_Length_Quarterly'])
newXTest_DT = XTest_DT.drop(columns=['Contract_Length_Quarterly'])

In [None]:
# DecisionTreeClassifier
from prettytable import PrettyTable

# Seeded tree fitted on the reduced feature matrices; report test accuracy.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(newXTrain_DT, yTrain_DT)
yTrainPredicted_DT = DTclf.predict(newXTrain_DT)
yTestPredicted_DT = DTclf.predict(newXTest_DT)
reducedTestAccuracy_DT = accuracy_score(yTest_DT, yTestPredicted_DT)
print(f'DecisionTree Test accuracy {round(reducedTestAccuracy_DT, 5)}')

# Table of the tree's importance for each remaining feature.
table = PrettyTable()
table.field_names = ["feature", "Feature Importances"]
featureColumns = newXTrain_DT.columns
for featureName, importance in zip(featureColumns, DTclf.feature_importances_):
    table.add_row([featureName, importance.round(5)])
table.sortby = "Feature Importances"
print(table)

In [None]:
# DecisionTreeClassifier
from prettytable import PrettyTable

# Features left out of this round of the elimination.
droppedFeatures_DT = ['Contract_Length_Quarterly','Usage_Frequency']
newXTrain_DT = XTrain_DT.drop(columns=droppedFeatures_DT)
newXTest_DT = XTest_DT.drop(columns=droppedFeatures_DT)

# Seeded tree fitted on the reduced feature matrices; report test accuracy.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(newXTrain_DT, yTrain_DT)
yTrainPredicted_DT = DTclf.predict(newXTrain_DT)
yTestPredicted_DT = DTclf.predict(newXTest_DT)
reducedTestAccuracy_DT = accuracy_score(yTest_DT, yTestPredicted_DT)
print(f'DecisionTree Test accuracy {round(reducedTestAccuracy_DT, 5)}')

# Table of the tree's importance for each remaining feature.
table = PrettyTable()
table.field_names = ["feature", "Feature Importances"]
featureColumns = newXTrain_DT.columns
for featureName, importance in zip(featureColumns, DTclf.feature_importances_):
    table.add_row([featureName, importance.round(5)])
table.sortby = "Feature Importances"
print(table)

In [None]:
# DecisionTreeClassifier
from prettytable import PrettyTable

# Features left out of this round of the elimination.
droppedFeatures_DT = ['Contract_Length_Quarterly','Usage_Frequency','Tenure']
newXTrain_DT = XTrain_DT.drop(columns=droppedFeatures_DT)
newXTest_DT = XTest_DT.drop(columns=droppedFeatures_DT)

# Seeded tree fitted on the reduced feature matrices; report test accuracy.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(newXTrain_DT, yTrain_DT)
yTrainPredicted_DT = DTclf.predict(newXTrain_DT)
yTestPredicted_DT = DTclf.predict(newXTest_DT)
reducedTestAccuracy_DT = accuracy_score(yTest_DT, yTestPredicted_DT)
print(f'DecisionTree Test accuracy {round(reducedTestAccuracy_DT, 5)}')

# Table of the tree's importance for each remaining feature.
table = PrettyTable()
table.field_names = ["feature", "Feature Importances"]
featureColumns = newXTrain_DT.columns
for featureName, importance in zip(featureColumns, DTclf.feature_importances_):
    table.add_row([featureName, importance.round(5)])
table.sortby = "Feature Importances"
print(table)

In [None]:
# DecisionTreeClassifier
from prettytable import PrettyTable

# Features left out of this round of the elimination.
droppedFeatures_DT = ['Contract_Length_Quarterly','Usage_Frequency','Tenure','Subscription_Type_Premium']
newXTrain_DT = XTrain_DT.drop(columns=droppedFeatures_DT)
newXTest_DT = XTest_DT.drop(columns=droppedFeatures_DT)

# Seeded tree fitted on the reduced feature matrices; report test accuracy.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(newXTrain_DT, yTrain_DT)
yTrainPredicted_DT = DTclf.predict(newXTrain_DT)
yTestPredicted_DT = DTclf.predict(newXTest_DT)
reducedTestAccuracy_DT = accuracy_score(yTest_DT, yTestPredicted_DT)
print(f'DecisionTree Test accuracy {round(reducedTestAccuracy_DT, 5)}')

# Table of the tree's importance for each remaining feature.
table = PrettyTable()
table.field_names = ["feature", "Feature Importances"]
featureColumns = newXTrain_DT.columns
for featureName, importance in zip(featureColumns, DTclf.feature_importances_):
    table.add_row([featureName, importance.round(5)])
table.sortby = "Feature Importances"
print(table)

In [None]:
# DecisionTreeClassifier
from prettytable import PrettyTable

# Features left out of this round of the elimination.
droppedFeatures_DT = ['Contract_Length_Quarterly','Usage_Frequency','Tenure','Subscription_Type_Premium','Subscription_Type_Standard']
newXTrain_DT = XTrain_DT.drop(columns=droppedFeatures_DT)
newXTest_DT = XTest_DT.drop(columns=droppedFeatures_DT)

# Seeded tree fitted on the reduced feature matrices; report test accuracy.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(newXTrain_DT, yTrain_DT)
yTrainPredicted_DT = DTclf.predict(newXTrain_DT)
yTestPredicted_DT = DTclf.predict(newXTest_DT)
reducedTestAccuracy_DT = accuracy_score(yTest_DT, yTestPredicted_DT)
print(f'DecisionTree Test accuracy {round(reducedTestAccuracy_DT, 5)}')

# Table of the tree's importance for each remaining feature.
table = PrettyTable()
table.field_names = ["feature", "Feature Importances"]
featureColumns = newXTrain_DT.columns
for featureName, importance in zip(featureColumns, DTclf.feature_importances_):
    table.add_row([featureName, importance.round(5)])
table.sortby = "Feature Importances"
print(table)

In [None]:
# State the outcome of the decision-tree feature elimination carried out above.
print('Hence feature to removed from Decision tree classifier are:- Contract_Length_Quarterly ,Usage_Frequency, Tenure, Subscription_Type_Premium')

In [None]:
# DecisionTreeClassifier Pre-Pruned
from sklearn.model_selection import GridSearchCV
from sklearn import tree
import warnings

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Grid of pre-pruning hyper-parameters. Each list holds a single value; the
# trailing comments list other candidate values.
tuned_parameters = dict(
    max_depth=[20],            # [None, 5, 10, 20]
    min_samples_split=[2],     # [2, 5, 10]
    min_samples_leaf=[3],      # [1, 2,3,4]
    max_features=['sqrt'],     # ['auto', 'sqrt', 'log']
    splitter=['best'],         # ['best', 'random']
    criterion=['entropy'],     # ['gini', 'entropy','log_loss']
)

# Cross-validated search over the grid using a seeded tree on all features.
DTclf = DecisionTreeClassifier(random_state=5805)
gridSearch = GridSearchCV(estimator=DTclf, param_grid=tuned_parameters)
gridSearch.fit(XTrain_DT, yTrain_DT)
print("Best parameters found: ", gridSearch.best_params_)

In [None]:
# DecisionTreeClassifier Pre-Pruned
# Fit the best estimator found by the grid search on the training data, then
# score it on the test data.
prePrunedTree = gridSearch.best_estimator_
prePrunedTree.fit(XTrain_DT, yTrain_DT)
# Last column of predict_proba: probability of the last class in classes_.
yTestProbPrePruned = prePrunedTree.predict_proba(XTest_DT)[:, -1]
yTestPredPrePruned = prePrunedTree.predict(XTest_DT)
prePrunedAccuracy = accuracy_score(yTest_DT, yTestPredPrePruned)
print(f'Pre-Pruned Test accuracy {round(prePrunedAccuracy, 5)}')

In [None]:
# DecisionTreeClassifier Post-Pruned
# Compute the cost-complexity pruning path of a seeded tree on the training data.
DTclf = DecisionTreeClassifier(random_state=5805)
DTclf.fit(XTrain_DT, yTrain_DT)

cpppath = DTclf.cost_complexity_pruning_path(XTrain_DT, yTrain_DT)
alphas = cpppath['ccp_alphas']

# Fit one tree per candidate alpha and record train / test accuracy.
# NOTE(review): the best alpha is picked by test accuracy, so the test set
# takes part in model selection -- confirm this is intended.
postPrunAccuracyTrain = []
postPrunAccuracyTest = []
for candidateAlpha in alphas:
    DTclf = DecisionTreeClassifier(random_state=5805, ccp_alpha=candidateAlpha)
    DTclf.fit(XTrain_DT, yTrain_DT)
    yTrainPredPostPruned = DTclf.predict(XTrain_DT)
    postPrunAccuracyTrain.append(accuracy_score(yTrain_DT, yTrainPredPostPruned))
    yTestPredPostPruned = DTclf.predict(XTest_DT)
    postPrunAccuracyTest.append(accuracy_score(yTest_DT, yTestPredPostPruned))

# First alpha that reaches the highest test accuracy.
bestTestAccuracy = max(postPrunAccuracyTest)
print(f'alpha={alphas[postPrunAccuracyTest.index(bestTestAccuracy)].round(5)}')

In [None]:
# DecisionTreeClassifier Post-Pruned
# Step plot of train and test accuracy for every candidate alpha.
fig, ax = plt.subplots()
ax.plot(alphas, postPrunAccuracyTrain, label="Train", drawstyle="steps-post")
ax.plot(alphas, postPrunAccuracyTest, label="Test", drawstyle="steps-post")
ax.set_title("Alpha for training and testing sets VS Accuracy")
ax.set_xlabel('alpha')
ax.set_ylabel('accuracy')
ax.legend()
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# DecisionTreeClassifier Post-Pruned
# Refit with the alpha that gave the highest test accuracy in the search.
# The exact alpha is used: rounding it to 5 decimals would fit a different
# tree from the one that was evaluated (and turns any alpha below 5e-6 into 0,
# i.e. no pruning at all).
bestAlphaIndex = postPrunAccuracyTest.index(max(postPrunAccuracyTest))
DTclf = DecisionTreeClassifier(random_state=5805, ccp_alpha=alphas[bestAlphaIndex])
DTclf.fit(XTrain_DT, yTrain_DT)
yTrainPredPostPruned = DTclf.predict(XTrain_DT)
yTestPredPostPruned = DTclf.predict(XTest_DT)
# Last column of predict_proba: probability of the last class in classes_.
yTestProbPostPruned= DTclf.predict_proba(XTest_DT)[::, -1]
postrePrunedAccuracy= accuracy_score(yTest_DT, yTestPredPostPruned)
postrePrunedAccuracy_train= accuracy_score(yTrain_DT, yTrainPredPostPruned)
print(f'Post-Pruned Test accuracy {round(postrePrunedAccuracy, 2)}')
print(f'Post-Pruned Train accuracy {round(postrePrunedAccuracy_train, 2)}')

In [None]:
# DecisionTreeClassifier Post-Pruned
# List the parameters of the current (post-pruned) tree.
for key, value in DTclf.get_params().items():
  print(f"{key}: {value}")

# Test-set metrics of the pre-pruned and post-pruned trees.
confusionMatrixPrePruned = confusion_matrix(yTest_DT, yTestPredPrePruned)
confusionMatrixPostPruned = confusion_matrix(yTest_DT, yTestPredPostPruned)
recallPrePruned = recall_score(yTest_DT, yTestPredPrePruned)
recallPostPruned = recall_score(yTest_DT, yTestPredPostPruned)
rocAucPrePruned = roc_auc_score(yTest_DT, yTestProbPrePruned)
rocAucPostPruned = roc_auc_score(yTest_DT, yTestProbPostPruned)

# Built-in round() is used because the metric functions may return either a
# NumPy scalar or a plain Python float, and only the former has a .round method.
table1 = PrettyTable()
table1.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
table1.add_row(["Pre-Pruned",round(prePrunedAccuracy, 2),confusionMatrixPrePruned,round(recallPrePruned, 2),round(rocAucPrePruned, 2)])
table1.add_row(["Post-Pruned",round(postrePrunedAccuracy, 2),confusionMatrixPostPruned,round(recallPostPruned, 2),round(rocAucPostPruned, 2)])
print(table1)

In [None]:
# ROC curves of the pre-pruned and post-pruned trees on the test data, with
# the diagonal of a random classifier for reference.
fprPrePrunedtree, tprPrePrunedtree, _ = roc_curve(yTest_DT, yTestProbPrePruned)
fprPostPrunedtree, tprPostPrunedtree, _ = roc_curve(yTest_DT, yTestProbPostPruned)

plt.plot(fprPrePrunedtree, tprPrePrunedtree, label='Pre-Pruned')
plt.plot(fprPostPrunedtree, tprPostPrunedtree, label='Post-Pruned')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.title('8.ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Features to exclude; the list is empty, so every candidate feature is kept.
removefeature = []
lrCandidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in lrCandidateFeatures if item not in removefeature]

# One-hot encode (first level dropped), then split into features and target.
lrCategoricalCols = ['Gender', 'Subscription_Type', 'Contract_Length']
encodingTrainDFForLR = pd.get_dummies(training_df, columns=lrCategoricalCols, drop_first=True)
XTrain_LR = encodingTrainDFForLR[finalFeature]
yTrain_LR = encodingTrainDFForLR['Churn']

encodingTestDFForLR = pd.get_dummies(testing_df, columns=lrCategoricalCols, drop_first=True)
XTest_LR = encodingTestDFForLR[finalFeature]
yTest_LR = encodingTestDFForLR['Churn']

In [None]:
# Logistic regression
# Fit with default settings, then score on the test data.
logregclf = LogisticRegression()
logregclf.fit(XTrain_LR, yTrain_LR)
logregYTestPred = logregclf.predict(XTest_LR)
logregAccuracy = accuracy_score(yTest_LR, logregYTestPred)
# Last column of predict_proba: probability of the last class in classes_.
yTestProbLogreg = logregclf.predict_proba(XTest_LR)[:, -1]
print(f'Logistic regression Accuracy = {round(logregAccuracy, 5)}')



In [None]:
# Logistic regression
# Test-set metrics of the logistic regression model.
confusionMatrixlogreg = confusion_matrix(yTest_LR, logregYTestPred)
recallLogreg = recall_score(yTest_LR, logregYTestPred)
rocAucLogreg = roc_auc_score(yTest_LR, yTestProbLogreg)

# Built-in round() is used because the metric functions may return either a
# NumPy scalar or a plain Python float, and only the former has a .round method.
# The confusion matrix holds counts and is shown as-is.
table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
table2.add_row(["Decision Tree Post-Pruned",round(postrePrunedAccuracy, 2), confusionMatrixPostPruned,round(recallPostPruned, 2),round(rocAucPostPruned, 2)])
table2.add_row(["logistic regression",round(logregAccuracy, 2),confusionMatrixlogreg,round(recallLogreg, 2),round(rocAucLogreg, 2)])
print(table2)

fprlogregclf, tprlogregclf, _ = roc_curve(yTest_LR, yTestProbLogreg)
# ROC curves
plt.figure(figsize=(8, 6))
plt.plot(fprPostPrunedtree, tprPostPrunedtree,  label='Decision tree-Post-Pruned')
plt.plot(fprlogregclf, tprlogregclf, label='Logistic regression')
plt.plot([0, 1], [0, 1], 'k--', label='Random selection')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid()
plt.show()

In [None]:
# svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Column groups shared by the training and testing feature matrices.
svmCategoricalCols = ['Gender', 'Subscription_Type', 'Contract_Length']
svmNumericCols = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']
svmDummyCols = ['Gender_Male','Subscription_Type_Premium','Subscription_Type_Standard','Contract_Length_Monthly','Contract_Length_Quarterly']

# Training features: standardized numeric columns plus the dummy columns.
encodingTrainDFForSVM = pd.get_dummies(training_df, columns=svmCategoricalCols, drop_first=True)
svmTrainNumeric = encodingTrainDFForSVM[svmNumericCols]
standardizedTrainDfSVM = Standardized(svmTrainNumeric)
standardizedTrainDfSVM = pd.concat([standardizedTrainDfSVM, encodingTrainDFForSVM[svmDummyCols]], axis=1)
XTrain_SVM = standardizedTrainDfSVM
yTrain_SVM = encodingTrainDFForSVM['Churn']

# Testing features: scaled with the TRAINING mean and standard deviation so
# the classifier sees test data on the scale it was fitted on; re-estimating
# the statistics on the test set would shift and rescale it differently.
encodingTestDFForSVM = pd.get_dummies(testing_df, columns=svmCategoricalCols, drop_first=True)
standardizedTestDfSVM = (encodingTestDFForSVM[svmNumericCols] - svmTrainNumeric.mean()) / svmTrainNumeric.std()
standardizedTestDfSVM = pd.concat([standardizedTestDfSVM, encodingTestDFForSVM[svmDummyCols]], axis=1)

XTest_SVM = standardizedTestDfSVM
yTest_SVM = encodingTestDFForSVM['Churn']

In [None]:
# #SVM
# from sklearn.svm import SVC
# svm_model = SVC()
# param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']}
# 
# grid_search = GridSearchCV(svm_model, param_grid, scoring='accuracy')
# grid_search.fit(XTrain_SVM, yTrain_SVM)
# 
# best_params = grid_search.best_params_
# print(best_params)

In [None]:
# svm: fit a linear-kernel SVC on the training split and score the test split.
from sklearn.svm import SVC

# probability=True makes SVC run an internal, shuffled cross-validation to
# calibrate predict_proba; seeding it keeps those probabilities (and the AUC /
# ROC curve derived from them) reproducible across runs.
svmclf = SVC(kernel='linear', C=10, probability=True, random_state=5805)
svmclf.fit(XTrain_SVM, yTrain_SVM)

svmYTestPred = svmclf.predict(XTest_SVM)
svmAccuracy = accuracy_score(yTest_SVM, svmYTestPred)

print(f'SVM Accuracy = {round(svmAccuracy, 5)}')



In [None]:
# Recompute and report the SVM test accuracy from the stored predictions.
svmAccuracy = accuracy_score(yTest_SVM, svmYTestPred)
print(f'SVM Accuracy = {round(svmAccuracy, 5)}')

In [None]:
# SVM: test-set metrics and the running model-comparison table.
yTestProbsvm = svmclf.predict_proba(XTest_SVM)[:, -1]
confusionMatrixsvm = confusion_matrix(yTest_SVM, svmYTestPred)
recallsvm = recall_score(yTest_SVM, svmYTestPred)
rocAucsvm = roc_auc_score(yTest_SVM, yTestProbsvm)

# (label, accuracy, confusion matrix, recall, AUC) for every model so far.
_modelRows = [
    ("Decision Tree Post-Pruned", postrePrunedAccuracy, confusionMatrixPostPruned, recallPostPruned, rocAucPostPruned),
    ("logistic regression", logregAccuracy, confusionMatrixlogreg, recallLogreg, rocAucLogreg),
    ("SVM", svmAccuracy, confusionMatrixsvm, recallsvm, rocAucsvm),
]

table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
for _rowLabel, _acc, _cm, _rec, _auc in _modelRows:
    # Builtin round() works for both NumPy scalars and plain Python floats;
    # the scalar .round() method only exists on NumPy types.
    table2.add_row([_rowLabel, round(_acc, 2), _cm, round(_rec, 2), round(_auc, 2)])
print(table2)

In [None]:
# svm: ROC points for the SVM, then an overlay of every model evaluated so far.
fprsvm, tprsvm, _ = roc_curve(yTest_SVM, yTestProbsvm)

_rocCurves = [
    (fprPostPrunedtree, tprPostPrunedtree, 'Decision tree-Post-Pruned'),
    (fprlogregclf, tprlogregclf, 'Logistic regression'),
    (fprsvm, tprsvm, 'SVM'),
]

plt.figure(figsize=(8, 6))
for _fprCurve, _tprCurve, _curveLabel in _rocCurves:
    plt.plot(_fprCurve, _tprCurve, label=_curveLabel)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random selection')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Naïve Bayes inputs.
# `removefeature` lists encoded columns to exclude; it is empty here, so all
# twelve candidate columns are kept in `finalFeature`.
removefeature = []
_candidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in _candidateFeatures if item not in removefeature]

# One-hot encode both splits (first level of each category dropped).
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
encodingTrainDFForNB = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
encodingTestDFForNB = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)

XTrain_NB, yTrain_NB = encodingTrainDFForNB[finalFeature], encodingTrainDFForNB['Churn']
XTest_NB, yTest_NB = encodingTestDFForNB[finalFeature], encodingTestDFForNB['Churn']

In [None]:
# Naïve Bayes: fit a Gaussian NB on the training split and score the test split.
from sklearn.naive_bayes import GaussianNB

nbclf = GaussianNB().fit(XTrain_NB, yTrain_NB)

nbYTestPred = nbclf.predict(XTest_NB)
# Last predict_proba column = probability of the last class in classes_.
yTestProbnb = nbclf.predict_proba(XTest_NB)[:, -1]

nbAccuracy = accuracy_score(yTest_NB, nbYTestPred)
print(f'Naïve Bayes = {round(nbAccuracy, 5)}')

In [None]:
# Naïve Bayes: test-set metrics and the running model-comparison table.
confusionMatrixnb = confusion_matrix(yTest_NB, nbYTestPred)
recallnb = recall_score(yTest_NB, nbYTestPred)
rocAucnb = roc_auc_score(yTest_NB, yTestProbnb)

# (label, accuracy, confusion matrix, recall, AUC) for every model so far.
_modelRows = [
    ("Decision Tree Post-Pruned", postrePrunedAccuracy, confusionMatrixPostPruned, recallPostPruned, rocAucPostPruned),
    ("logistic regression", logregAccuracy, confusionMatrixlogreg, recallLogreg, rocAucLogreg),
    ("SVM", svmAccuracy, confusionMatrixsvm, recallsvm, rocAucsvm),
    ("Naïve Bayes", nbAccuracy, confusionMatrixnb, recallnb, rocAucnb),
]

table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
for _rowLabel, _acc, _cm, _rec, _auc in _modelRows:
    # Builtin round() works for both NumPy scalars and plain Python floats;
    # the scalar .round() method only exists on NumPy types.
    table2.add_row([_rowLabel, round(_acc, 2), _cm, round(_rec, 2), round(_auc, 2)])
print(table2)

In [None]:
# Naïve Bayes: ROC points, then an overlay of every model evaluated so far.
fprnb, tprnb, _ = roc_curve(yTest_NB, yTestProbnb)

_rocCurves = [
    (fprPostPrunedtree, tprPostPrunedtree, 'Decision tree-Post-Pruned'),
    (fprlogregclf, tprlogregclf, 'Logistic regression'),
    (fprsvm, tprsvm, 'SVM'),
    (fprnb, tprnb, 'Naïve Bayes'),
]

plt.figure(figsize=(8, 6))
for _fprCurve, _tprCurve, _curveLabel in _rocCurves:
    plt.plot(_fprCurve, _tprCurve, label=_curveLabel)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random selection')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [None]:
# KNN inputs: standardized numeric features plus one-hot flags for both splits.
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
_numericCols = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']
_flagCols = ['Gender_Male','Subscription_Type_Premium','Subscription_Type_Standard','Contract_Length_Monthly','Contract_Length_Quarterly']

# NOTE(review): Standardized() is defined elsewhere and is applied to the train
# and test frames independently - confirm the test split is not meant to reuse
# the training statistics.
encodingTrainDFForKNN = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
standardizedTrainDfKNN = pd.concat(
    [Standardized(encodingTrainDFForKNN[_numericCols]), encodingTrainDFForKNN[_flagCols]],
    axis=1,
)
XTrain_KNN = standardizedTrainDfKNN
yTrain_KNN = encodingTrainDFForKNN['Churn']

encodingTestDFForKNN = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)
standardizedTestDfKNN = pd.concat(
    [Standardized(encodingTestDFForKNN[_numericCols]), encodingTestDFForKNN[_flagCols]],
    axis=1,
)
XTest_KNN = standardizedTestDfKNN
yTest_KNN = encodingTestDFForKNN['Churn']

In [None]:
# KNN: test error rate for K = 1..39, plotted to pick K by eye.
# NOTE(review): K is being chosen on the test split here - confirm that is
# intended rather than a validation split / cross-validation on training data.
from sklearn.neighbors import KNeighborsClassifier

k_values = range(1, 40)
error_rates = []

for k in k_values:
    knnclf = KNeighborsClassifier(n_neighbors=k).fit(XTrain_KNN, yTrain_KNN)
    knnYTestPred = knnclf.predict(XTest_KNN)
    _testAccuracy = accuracy_score(yTest_KNN, knnYTestPred)
    error_rates.append(1 - _testAccuracy)

plt.plot(k_values, error_rates, marker='o')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Error Rate')
plt.title('Elbow Method for Optimal K')
plt.show()

In [None]:
# KNN: final model with K=25 (presumably read off the elbow plot - confirm).
knnclf = KNeighborsClassifier(n_neighbors=25).fit(XTrain_KNN, yTrain_KNN)
knnYTestPred = knnclf.predict(XTest_KNN)

knnAccuracy = accuracy_score(yTest_KNN, knnYTestPred)
print(knnAccuracy)

In [None]:
# KNN: test-set metrics and the running model-comparison table.
yTestProbknn = knnclf.predict_proba(XTest_KNN)[:, -1]
confusionMatrixknn = confusion_matrix(yTest_KNN, knnYTestPred)
recallknn = recall_score(yTest_KNN, knnYTestPred)
rocAucknn = roc_auc_score(yTest_KNN, yTestProbknn)

# (label, accuracy, confusion matrix, recall, AUC) for every model so far.
_modelRows = [
    ("Decision Tree Post-Pruned", postrePrunedAccuracy, confusionMatrixPostPruned, recallPostPruned, rocAucPostPruned),
    ("logistic regression", logregAccuracy, confusionMatrixlogreg, recallLogreg, rocAucLogreg),
    ("SVM", svmAccuracy, confusionMatrixsvm, recallsvm, rocAucsvm),
    ("Naïve Bayes", nbAccuracy, confusionMatrixnb, recallnb, rocAucnb),
    ("KNN", knnAccuracy, confusionMatrixknn, recallknn, rocAucknn),
]

table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
for _rowLabel, _acc, _cm, _rec, _auc in _modelRows:
    # Builtin round() works for both NumPy scalars and plain Python floats;
    # the scalar .round() method only exists on NumPy types.
    table2.add_row([_rowLabel, round(_acc, 2), _cm, round(_rec, 2), round(_auc, 2)])
print(table2)

In [None]:
# KNN: ROC points, then an overlay of every model evaluated so far.
fprknn, tprknn, _ = roc_curve(yTest_KNN, yTestProbknn)

_rocCurves = [
    (fprPostPrunedtree, tprPostPrunedtree, 'Decision tree-Post-Pruned'),
    (fprlogregclf, tprlogregclf, 'Logistic regression'),
    (fprsvm, tprsvm, 'SVM'),
    (fprnb, tprnb, 'Naïve Bayes'),
    (fprknn, tprknn, 'KNN'),
]

plt.figure(figsize=(8, 6))
for _fprCurve, _tprCurve, _curveLabel in _rocCurves:
    plt.plot(_fprCurve, _tprCurve, label=_curveLabel)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random selection')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Random Forest inputs: encoded predictors (minus anything in `removefeature`)
# and the 'Churn' target for both splits.
from sklearn.ensemble import RandomForestClassifier

_candidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in _candidateFeatures if item not in removefeature]

# One-hot encode both splits (first level of each category dropped).
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
encodingTrainDFForRF = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
encodingTestDFForRF = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)

XTrain_RF, yTrain_RF = encodingTrainDFForRF[finalFeature], encodingTrainDFForRF['Churn']
XTest_RF, yTest_RF = encodingTestDFForRF[finalFeature], encodingTestDFForRF['Churn']

In [None]:
# Random Forest: fit on the training split and score the test split.
rfclf = RandomForestClassifier(n_estimators=100, random_state=5805)
rfclf.fit(XTrain_RF, yTrain_RF)

rfYTestPred = rfclf.predict(XTest_RF)
rfAccuracy = accuracy_score(yTest_RF, rfYTestPred)
# Probabilities must come from the forest itself; the previous code took them
# from the Naïve Bayes model (nbclf).
yTestProbrf = rfclf.predict_proba(XTest_RF)[:, -1]
print(f'Random Forest = {round(rfAccuracy, 5)}')

In [None]:
# Random Forest: test-set metrics and the running model-comparison table.
yTestProbrf = rfclf.predict_proba(XTest_RF)[:, -1]
confusionMatrixrf = confusion_matrix(yTest_RF, rfYTestPred)
recallrf = recall_score(yTest_RF, rfYTestPred)
rocAucrf = roc_auc_score(yTest_RF, yTestProbrf)

# (label, accuracy, confusion matrix, recall, AUC) for every model so far.
_modelRows = [
    ("Decision Tree Post-Pruned", postrePrunedAccuracy, confusionMatrixPostPruned, recallPostPruned, rocAucPostPruned),
    ("logistic regression", logregAccuracy, confusionMatrixlogreg, recallLogreg, rocAucLogreg),
    ("SVM", svmAccuracy, confusionMatrixsvm, recallsvm, rocAucsvm),
    ("Naïve Bayes", nbAccuracy, confusionMatrixnb, recallnb, rocAucnb),
    ("KNN", knnAccuracy, confusionMatrixknn, recallknn, rocAucknn),
    ("RF", rfAccuracy, confusionMatrixrf, recallrf, rocAucrf),
]

table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
for _rowLabel, _acc, _cm, _rec, _auc in _modelRows:
    # Builtin round() works for both NumPy scalars and plain Python floats;
    # the scalar .round() method only exists on NumPy types.
    table2.add_row([_rowLabel, round(_acc, 2), _cm, round(_rec, 2), round(_auc, 2)])
print(table2)

In [None]:
# Random Forest: ROC points, then an overlay of every model evaluated so far.
fprrf, tprrf, _ = roc_curve(yTest_RF, yTestProbrf)

_rocCurves = [
    (fprPostPrunedtree, tprPostPrunedtree, 'Decision tree-Post-Pruned'),
    (fprlogregclf, tprlogregclf, 'Logistic regression'),
    (fprsvm, tprsvm, 'SVM'),
    (fprnb, tprnb, 'Naïve Bayes'),
    (fprknn, tprknn, 'KNN'),
    (fprrf, tprrf, 'Random Forest'),
]

plt.figure(figsize=(8, 6))
for _fprCurve, _tprCurve, _curveLabel in _rocCurves:
    plt.plot(_fprCurve, _tprCurve, label=_curveLabel)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random selection')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Bagging inputs: encoded predictors (minus anything in `removefeature`) and
# the 'Churn' target for both splits.
from sklearn.ensemble import RandomForestClassifier

_candidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in _candidateFeatures if item not in removefeature]

# One-hot encode both splits (first level of each category dropped).
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
encodingTrainDFForBA = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
encodingTestDFForBA = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)

XTrain_BA, yTrain_BA = encodingTrainDFForBA[finalFeature], encodingTrainDFForBA['Churn']
XTest_BA, yTest_BA = encodingTestDFForBA[finalFeature], encodingTestDFForBA['Churn']

In [None]:
# Bagging: an ensemble of 10 random forests, fit on the training split and
# scored on the test split.
from sklearn.ensemble import BaggingClassifier

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the old
# spelling is rejected by current versions.
baclf = BaggingClassifier(RandomForestClassifier(n_estimators=100),
                          n_estimators=10, random_state=5805)

baclf.fit(XTrain_BA, yTrain_BA)
baYTestPred = baclf.predict(XTest_BA)
baAccuracy = accuracy_score(yTest_BA, baYTestPred)
# Probabilities must come from the bagging model itself; the previous code took
# them from the Naïve Bayes model (nbclf).
yTestProbba = baclf.predict_proba(XTest_BA)[:, -1]
# Label fixed: this cell reports the bagging ensemble, not the random forest.
print(f'Bagging = {round(baAccuracy, 5)}')

In [None]:
# Bagging: test-set metrics and the running model-comparison table.
yTestProbba = baclf.predict_proba(XTest_BA)[:, -1]
confusionMatrixba = confusion_matrix(yTest_BA, baYTestPred)
recallba = recall_score(yTest_BA, baYTestPred)
rocAucba = roc_auc_score(yTest_BA, yTestProbba)

# (label, accuracy, confusion matrix, recall, AUC) for every model so far.
_modelRows = [
    ("Decision Tree Post-Pruned", postrePrunedAccuracy, confusionMatrixPostPruned, recallPostPruned, rocAucPostPruned),
    ("logistic regression", logregAccuracy, confusionMatrixlogreg, recallLogreg, rocAucLogreg),
    ("SVM", svmAccuracy, confusionMatrixsvm, recallsvm, rocAucsvm),
    ("Naïve Bayes", nbAccuracy, confusionMatrixnb, recallnb, rocAucnb),
    ("KNN", knnAccuracy, confusionMatrixknn, recallknn, rocAucknn),
    ("RF", rfAccuracy, confusionMatrixrf, recallrf, rocAucrf),
    ("Bagging", baAccuracy, confusionMatrixba, recallba, rocAucba),
]

table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
for _rowLabel, _acc, _cm, _rec, _auc in _modelRows:
    # Builtin round() works for both NumPy scalars and plain Python floats;
    # the scalar .round() method only exists on NumPy types.
    table2.add_row([_rowLabel, round(_acc, 2), _cm, round(_rec, 2), round(_auc, 2)])
print(table2)

In [None]:
# Bagging: ROC points, then an overlay of every model evaluated so far.
fprba, tprba, _ = roc_curve(yTest_BA, yTestProbba)

_rocCurves = [
    (fprPostPrunedtree, tprPostPrunedtree, 'Decision tree-Post-Pruned'),
    (fprlogregclf, tprlogregclf, 'Logistic regression'),
    (fprsvm, tprsvm, 'SVM'),
    (fprnb, tprnb, 'Naïve Bayes'),
    (fprknn, tprknn, 'KNN'),
    (fprrf, tprrf, 'Random Forest'),
    (fprba, tprba, 'Bagging'),
]

plt.figure(figsize=(8, 6))
for _fprCurve, _tprCurve, _curveLabel in _rocCurves:
    plt.plot(_fprCurve, _tprCurve, label=_curveLabel)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random selection')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Stacking inputs: encoded predictors (minus anything in `removefeature`) and
# the 'Churn' target for both splits.
from sklearn.ensemble import RandomForestClassifier

_candidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in _candidateFeatures if item not in removefeature]

# One-hot encode both splits (first level of each category dropped).
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
encodingTrainDFForST = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
encodingTestDFForST = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)

XTrain_ST, yTrain_ST = encodingTrainDFForST[finalFeature], encodingTrainDFForST['Churn']
XTest_ST, yTest_ST = encodingTestDFForST[finalFeature], encodingTestDFForST['Churn']

In [None]:
# Stacking: a random forest base learner with a logistic-regression
# meta-learner, fit on the training split and scored on the test split.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
base_estimators = [('rf', RandomForestClassifier(n_estimators=100, random_state=5805)),]
meta_estimator = LogisticRegression()

stclf = StackingClassifier(estimators=base_estimators, final_estimator=meta_estimator)
stclf.fit(XTrain_ST, yTrain_ST)

stYTestPred = stclf.predict(XTest_ST)
stAccuracy = accuracy_score(yTest_ST, stYTestPred)
# Label fixed: this cell reports the stacking ensemble, not the random forest.
print(f'Stacking = {round(stAccuracy, 5)}')

In [None]:
# Stacking: test-set probabilities, ROC points and summary metrics.
yTestProbst = stclf.predict_proba(XTest_ST)[:, -1]

fprst, tprst, _ = roc_curve(yTest_ST, yTestProbst)
rocAucst = roc_auc_score(yTest_ST, yTestProbst)

confusionMatrixst = confusion_matrix(yTest_ST, stYTestPred)
recallst = recall_score(yTest_ST, stYTestPred)

In [None]:
# Boosting inputs: encoded predictors (minus anything in `removefeature`) and
# the 'Churn' target for both splits.
_candidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in _candidateFeatures if item not in removefeature]

# One-hot encode both splits (first level of each category dropped).
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
encodingTrainDFForBOO = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
encodingTestDFForBOO = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)

XTrain_BOO, yTrain_BOO = encodingTrainDFForBOO[finalFeature], encodingTrainDFForBOO['Churn']
XTest_BOO, yTest_BOO = encodingTestDFForBOO[finalFeature], encodingTestDFForBOO['Churn']

In [None]:
# Boosting: gradient-boosted trees, fit on the training split and scored on the
# test split.
from sklearn.ensemble import GradientBoostingClassifier

booclf = GradientBoostingClassifier(n_estimators=100, random_state=5805).fit(XTrain_BOO, yTrain_BOO)
booYTestPred = booclf.predict(XTest_BOO)

booAccuracy = accuracy_score(yTest_BOO, booYTestPred)
print(f'Boosting = {round(booAccuracy, 5)}')

In [None]:
# Boosting: test-set probabilities, ROC points and summary metrics.
yTestProbboo = booclf.predict_proba(XTest_BOO)[:, -1]

fprboo, tprboo, _ = roc_curve(yTest_BOO, yTestProbboo)
rocAucboo = roc_auc_score(yTest_BOO, yTestProbboo)

confusionMatrixboo = confusion_matrix(yTest_BOO, booYTestPred)
recallboo = recall_score(yTest_BOO, booYTestPred)

In [None]:
# Neural Network inputs: standardized numeric features plus one-hot flags for
# both splits.
_dummyCols = ['Gender', 'Subscription_Type', 'Contract_Length']
_numericCols = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']
_flagCols = ['Gender_Male','Subscription_Type_Premium','Subscription_Type_Standard','Contract_Length_Monthly','Contract_Length_Quarterly']

# NOTE(review): Standardized() is defined elsewhere and is applied to the train
# and test frames independently - confirm the test split is not meant to reuse
# the training statistics.
encodingTrainDFForNN = pd.get_dummies(training_df, columns=_dummyCols, drop_first=True)
standardizedTrainDfNN = pd.concat(
    [Standardized(encodingTrainDFForNN[_numericCols]), encodingTrainDFForNN[_flagCols]],
    axis=1,
)
XTrain_NN = standardizedTrainDfNN
yTrain_NN = encodingTrainDFForNN['Churn']

encodingTestDFForNN = pd.get_dummies(testing_df, columns=_dummyCols, drop_first=True)
standardizedTestDfNN = pd.concat(
    [Standardized(encodingTestDFForNN[_numericCols]), encodingTestDFForNN[_flagCols]],
    axis=1,
)
XTest_NN = standardizedTestDfNN
yTest_NN = encodingTestDFForNN['Churn']

In [None]:
# Neural Network: MLP with two hidden layers (100, 50), evaluated through
# 5-fold stratified cross-validated class probabilities.
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.neural_network import MLPClassifier
nnclf = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=5805)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5805)
# NOTE(review): the cross-validation runs on the TEST split only, so the model
# never sees XTrain_NN / yTrain_NN, unlike the other classifiers in this
# notebook, which fit on the training split - confirm this is intended.
# nnYTestPred holds one probability column per class, not hard labels.
nnYTestPred = cross_val_predict(nnclf, XTest_NN, yTest_NN, cv=cv,method='predict_proba')

# argmax over the class columns turns the probabilities into predicted labels.
nnAccuracy = accuracy_score(yTest_NN, nnYTestPred.argmax(axis=1))
# Label fixed: this cell reports the neural network, not the boosting model.
print(f'Neural Network = {round(nnAccuracy, 5)}')


In [None]:
# Neural Network: summary metrics and ROC points from the cross-validated
# class probabilities in nnYTestPred.
_nnPredLabels = nnYTestPred.argmax(axis=1)   # column index of the most probable class
_nnPosProb = nnYTestPred[:, 1]               # second probability column

confusionMatrixnn = confusion_matrix(yTest_NN, _nnPredLabels)
recallnn = recall_score(yTest_NN, _nnPredLabels)
rocAucnn = roc_auc_score(yTest_NN, _nnPosProb)

fprnn, tprnn, _ = roc_curve(yTest_NN, _nnPosProb)

In [None]:
# Final model-comparison table across all classifiers.
# The bagging metrics are recomputed here exactly as the original cell did.
yTestProbba = baclf.predict_proba(XTest_BA)[:, -1]
confusionMatrixba = confusion_matrix(yTest_BA, baYTestPred)
recallba = recall_score(yTest_BA, baYTestPred)
rocAucba = roc_auc_score(yTest_BA, yTestProbba)

# (label, accuracy, confusion matrix, recall, AUC). The stacking, boosting and
# neural-network metrics were computed in earlier cells but were missing from
# this summary.
_modelRows = [
    ("Decision Tree Post-Pruned", postrePrunedAccuracy, confusionMatrixPostPruned, recallPostPruned, rocAucPostPruned),
    ("logistic regression", logregAccuracy, confusionMatrixlogreg, recallLogreg, rocAucLogreg),
    ("SVM", svmAccuracy, confusionMatrixsvm, recallsvm, rocAucsvm),
    ("Naïve Bayes", nbAccuracy, confusionMatrixnb, recallnb, rocAucnb),
    ("KNN", knnAccuracy, confusionMatrixknn, recallknn, rocAucknn),
    ("RF", rfAccuracy, confusionMatrixrf, recallrf, rocAucrf),
    ("Bagging", baAccuracy, confusionMatrixba, recallba, rocAucba),
    ("Stacking", stAccuracy, confusionMatrixst, recallst, rocAucst),
    ("Boosting", booAccuracy, confusionMatrixboo, recallboo, rocAucboo),
    ("Neural Network", nnAccuracy, confusionMatrixnn, recallnn, rocAucnn),
]

table2 = PrettyTable()
table2.field_names = ["","Accuracy", "confusion Matrix", "recall", 'AUC']
for _rowLabel, _acc, _cm, _rec, _auc in _modelRows:
    # Builtin round() works for both NumPy scalars and plain Python floats;
    # the scalar .round() method only exists on NumPy types.
    table2.add_row([_rowLabel, round(_acc, 2), _cm, round(_rec, 2), round(_auc, 2)])
print(table2)

In [None]:
# Phase IV: Clustering and Association

In [None]:
# Clustering inputs: split the training frame into its categorical and numeric
# columns. `finalFeature` is rebuilt here but not used in this cell.
removefeature = []
_candidateFeatures = [
    'Age', 'Tenure', 'Usage_Frequency', 'Support_Calls', 'Payment_Delay',
    'Total_Spend', 'Last_Interaction', 'Gender_Male',
    'Subscription_Type_Premium', 'Subscription_Type_Standard',
    'Contract_Length_Monthly', 'Contract_Length_Quarterly',
]
finalFeature = [item for item in _candidateFeatures if item not in removefeature]

_categoricalCols = ['Gender', 'Subscription_Type', 'Contract_Length']
_numericCols = ['Age','Tenure','Usage_Frequency','Support_Calls','Payment_Delay','Total_Spend','Last_Interaction']

XCat_KM = training_df[_categoricalCols]
XNum_KM = training_df[_numericCols]

print(training_df.head(5))

In [None]:
# K-modes clustering (3 clusters) on scaled numeric + raw categorical columns.
# The final pipeline step keeps the name 'kmeans' because the plotting cell
# looks it up via pipeline.named_steps['kmeans'].
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from kmodes.kmodes import KModes
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

all_features = pd.concat([XNum_KM, XCat_KM], axis=1)

# Use a scaler created here instead of relying on a global `scaler` left over
# from an earlier cell, so this cell does not depend on hidden notebook state.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), XNum_KM.columns),
        ('cat', 'passthrough', XCat_KM.columns)
    ])

# random_state seeds the centroid initialisation so cluster labels and sizes
# are reproducible between runs.
# NOTE(review): K-modes compares values by equality, so the standardized
# numeric columns are presumably treated as categories - confirm whether
# K-prototypes was intended for this mixed data.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('kmeans', KModes(n_clusters=3, init='Huang', n_init=5, verbose=1, random_state=5805))
])

pipeline.fit(all_features)


In [None]:
# Cluster sizes: bar chart of how many samples fell into each cluster.
cluster = pd.Series(pipeline.named_steps['kmeans'].labels_)
cluster_size = cluster.value_counts()

plt.bar(cluster_size.index, cluster_size.values)
plt.title('Cluster Sizes')
plt.xlabel('Cluster')
plt.ylabel('Number of Samples')
plt.show()

In [None]:
# Cluster distribution: pie chart of the share of samples per cluster.
# `cluster_sizes` is not used by the plot below (appears to be a leftover
# placeholder - TODO confirm before removing).
cluster_sizes = [50, 30, 20]

# Samples per cluster label, taken from the `cluster` Series.
cluster_counts = cluster.value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    cluster_counts,
    labels=cluster_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Cluster Distribution')
plt.show()

In [None]:
# K-mean: print the written interpretation of the cluster-size results.
# The cluster sizes quoted in the text are hard-coded in the string, not
# computed from `cluster`, so they must be updated by hand if the clustering changes.
print('The sizes of the clusters are imbalanced, with the 0th cluster having the largest number of observations (175,000), followed by the 1st cluster (108,000) and the 2nd cluster (90,000) \nThe 0th cluster is significantly larger than the other clusters, suggesting that it might represent a more dominant or prevalent group in the dataset.\nThe large size of the dominant cluster could pose challenges in terms of interpretability. It might be more challenging to distinguish unique patterns within this cluster due to its size\nThe presence of multiple clusters indicates that there are distinct subgroups in the data. Each cluster may represent a different pattern or behavior among the observations\nThe results of association rule mining would depend on the specific features used and the relationships explored. The imbalanced cluster sizes may influence the rules generated, and its essential to consider the context of the analysis\n ')

In [None]:
# DBSCAN clustering on the same numeric + categorical training columns.
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Feature groups prepared in the clustering-input cell.
numerical_features = XNum_KM
categorical_features = XCat_KM
all_features = pd.concat([XNum_KM, XCat_KM], axis=1)

# Numeric columns: mean-impute, then standardize.
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: impute with the most frequent value, then one-hot encode.
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

# Note: `preprocessor` and `pipeline` are rebound here, replacing the objects
# of the same name created by the K-modes cell.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_preprocessor, numerical_features.columns),
    ('cat', categorical_preprocessor, categorical_features.columns),
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dbscan', DBSCAN(eps=0.5, min_samples=5)),  # Adjust parameters accordingly
])

# Fit and get one cluster label per row.
labels = pipeline.fit_predict(all_features)

# Samples per label, ordered by label value.
cluster_counts = pd.Series(labels).value_counts().sort_index()

plt.bar(cluster_counts.index, cluster_counts.values)
plt.title('Number of Samples in Each Cluster')
plt.xlabel('Cluster')
plt.ylabel('Number of Samples')
plt.show()
