# Import Data

In [None]:
#Import stuff
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score
from sklearn.neighbors import DistanceMetric
import warnings
import gower
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import inspect
import matplotlib.pyplot as plt
from sklearn import metrics
from keras import backend as K
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import plot_tree
from sklearn import tree


def recall_m(y_true, y_pred):
    """Return recall (true-positive rate) as a Keras backend tensor.

    The label/prediction product is clipped to [0, 1] and rounded before being
    summed as the true-positive count; the labels get the same treatment to
    count the positives. ``K.epsilon()`` is added to the denominator so the
    ratio stays finite when there are no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Return precision as a Keras backend tensor.

    True positives are the rounded, clipped label/prediction products; the
    denominator is the count of rounded, clipped predictions (TP + FP) plus
    ``K.epsilon()`` to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())
    
def f1_m(y_true, y_pred):
    """Return the F1 score built from ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

def f1_m2(y_true, y_pred):
    """Return the F1 score of hard class labels via sklearn's ``f1_score``."""
    score = f1_score(y_true, y_pred)
    return score

def TP(y_true, y_pred):
    """Return the true-positive count divided by the number of samples.

    The sample count is obtained as positives + negatives, where negatives
    are ``1 - rounded labels``.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.round(K.clip(y_true, 0, 1))
    total = K.sum(actual_pos) + K.sum(1 - actual_pos)
    return hits / total

def TN(y_true, y_pred):
    """Return the true-negative count divided by the number of samples.

    Negatives are ``1 - rounded labels`` and predicted negatives are
    ``1 - rounded predictions``; their product marks the true negatives.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos
    total = K.sum(actual_pos) + K.sum(actual_neg)
    predicted_neg = 1 - K.round(K.clip(y_pred, 0, 1))
    correct_neg = K.sum(K.round(K.clip(actual_neg * predicted_neg, 0, 1)))
    return correct_neg / total

def FP(y_true, y_pred):
    """Return the false-positive count divided by the number of samples.

    A false positive is a negative label (``1 - rounded labels``) whose
    product with the raw prediction rounds to 1.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos
    total = K.sum(actual_pos) + K.sum(actual_neg)
    false_alarms = K.sum(K.round(K.clip(actual_neg * y_pred, 0, 1)))
    return false_alarms / total

def FN(y_true, y_pred):
    """Return the false-negative count divided by the number of samples.

    A false negative is a label whose product with the predicted-negative
    indicator (``1 - rounded predictions``) rounds to 1.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    total = K.sum(actual_pos) + K.sum(1 - actual_pos)
    predicted_neg = 1 - K.round(K.clip(y_pred, 0, 1))
    missed = K.sum(K.round(K.clip(y_true * predicted_neg, 0, 1)))
    return missed / total

In [None]:
#accuracy after each batch
class BCP(tf.keras.callbacks.Callback):
    """Keras callback that records metrics after every training batch.

    Results are stored in class-level lists, so they are shared by every
    instance and must be cleared by the caller between runs.

    Args:
        val_data: optional ``(x_val, y_val)`` pair. When given, the model is
            evaluated on it after every batch (predictions cut at 0.5) and
            the F1 score is appended to ``batch_f1_val``. Omitting it keeps
            the original behaviour of recording only the training logs.
    """
    batch_accuracy = []  # 'accuracy' entry of the logs after each batch
    batch_f1 = []  # 'f1_m' entry of the logs after each batch
    batch_f1_val = []  # validation F1 after each batch (only with val_data)

    def __init__(self, val_data=None):
        super().__init__()
        self._val_data = val_data

    def on_train_batch_end(self, batch, logs=None):
        # Keras may pass logs=None; treat that as "no metrics available".
        logs = logs or {}
        BCP.batch_accuracy.append(logs.get('accuracy'))
        BCP.batch_f1.append(logs.get('f1_m'))

        if self._val_data is not None:
            x_val, y_val = self._val_data[0], self._val_data[1]
            probs = np.asarray(self.model.predict(x_val, verbose=0)).ravel()
            # Same rule as the rest of the notebook: below 0.5 -> 0, else 1.
            BCP.batch_f1_val.append(f1_score(y_val, (~(probs < .5)).astype(int)))
        


In [None]:
#accuracy after each batch
class BCP2(tf.keras.callbacks.Callback):
    """Keras callback that records training and validation metrics per batch.

    After every training batch the model predicts on the supplied validation
    set, the probabilities are cut at 0.5, and the resulting F1 score is
    stored alongside the batch's 'accuracy' and 'f1_m' log entries. The
    lists are class-level, so they are shared by all instances.

    Args:
        val_data: ``(x_val, y_val)`` pair used for the per-batch evaluation.
    """
    batch_accuracy = []  # 'accuracy' entry of the logs after each batch
    batch_f1 = []  # 'f1_m' entry of the logs after each batch
    batch_f1_val = []  # validation F1 after each batch

    def __init__(self, val_data):
        # Zero-argument super(): the original named BCP here, which raises
        # TypeError because BCP2 is not a subclass of BCP.
        super().__init__()
        self.validation_data = val_data

    def on_train_batch_end(self, batch, logs=None):
        logs = logs or {}
        x_val = self.validation_data[0]
        y_val_true = self.validation_data[1]
        y_val_pred = self.model.predict(x_val, verbose=0)

        y_val_class = [0 if val < .5 else 1 for val in y_val_pred]
        batchF1 = f1_m2(y_val_true, y_val_class)

        # Record on this class's own lists (BCP has no batch_f1_val).
        BCP2.batch_f1_val.append(batchF1)
        BCP2.batch_accuracy.append(logs.get('accuracy'))
        BCP2.batch_f1.append(logs.get('f1_m'))

In [None]:
def evalNN(thresh, pred, ytest):
    """Plot the ROC curve for ``pred`` and print metrics at cut-off ``thresh``.

    Scores below ``thresh`` are labelled 0, everything else 1.
    """
    plotROC(pred, ytest)
    hard_labels = []
    for score in pred:
        hard_labels.append(0 if score < thresh else 1)
    evaluate(ytest, hard_labels, thresh)

def thresh(pred, ytest):
    """Scan cut-offs 0.01..0.99 and return the one with the highest F1.

    For every cut-off, scores below it are labelled 0 and all others 1;
    accuracy, recall and F1 against ``ytest`` are recorded and plotted, with
    a vertical line at the best cut-off.

    Args:
        pred: predicted scores; flattened, so both 1-D arrays and the
            (n, 1) output of a Keras model are accepted.
        ytest: true binary labels.

    Returns:
        The cut-off (in 0..1) with the highest F1, or 0.0 if no cut-off
        achieved an F1 above zero.
    """
    scores = np.asarray(pred).ravel()

    bestf1 = 0
    bestthresh = 0
    accList = []
    tpList = []
    f1list = []
    threshlist = []
    for i in range(1, 100):
        cutoff = i / 100
        # "not below the cut-off" keeps the original rule, including for NaN.
        classPred = (~(scores < cutoff)).astype(int)
        f1 = f1_score(ytest, classPred)
        accList.append(accuracy_score(ytest, classPred))
        tpList.append(recall_score(ytest, classPred))
        f1list.append(f1)
        threshlist.append(cutoff)
        if f1 > bestf1:
            bestf1 = f1
            bestthresh = cutoff

    # The x axis uses the same 0..1 units as the returned threshold.
    plt.plot(threshlist, accList, 'b', label='Accuracy')
    plt.plot(threshlist, tpList, 'r', label='Recall')
    plt.plot(threshlist, f1list, 'g', label='F1 score')
    plt.axvline(x = bestthresh, color = 'k', label = 'Best threshold')
    plt.title('Threshold Graph')
    plt.xlabel('Threshold')
    plt.ylabel('Metric Value')
    plt.legend()
    plt.show()

    return bestthresh

def plotNN(history):
    """Plot per-epoch training/validation accuracy and F1 from a fit history.

    Reads the 'accuracy', 'val_accuracy', 'f1_m' and 'val_f1_m' entries of
    ``history.history``.
    """
    log = history.history
    # Look up every series first so a missing key fails before any drawing.
    curves = [
        (log['accuracy'], 'b', 'Training Accuracy'),
        (log['val_accuracy'], 'r', 'Validation Accuracy'),
        (log['f1_m'], 'k', 'Training f1'),
        (log['val_f1_m'], 'c', 'Validation f1'),
    ]
    xs = range(1, len(curves[0][0]) + 1)

    for values, colour, label in curves:
        plt.plot(xs, values, colour, label=label)

    plt.title('Training and Validation Metrics')
    plt.xlabel('Epochs')
    plt.ylabel('Metric Value')
    plt.legend()
    plt.show()

def _build_churn_net(n_features):
    """Build and compile the 64-64-1 dropout network for ``n_features`` inputs."""
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=n_features))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', f1_m])
    return model


def _fit_and_report(df, xtest, ytest, shuffle):
    """Train on ``df``, plot batch/epoch curves, and print train/test metrics.

    The cut-off is chosen by ``thresh`` on the training predictions and then
    reused unchanged for the test predictions.
    """
    xtrain = df.drop("Churn_Yes", axis=1)
    ytrain = df["Churn_Yes"]

    # Input width follows the data instead of a hard-coded 30.
    model = _build_churn_net(xtrain.shape[1])

    # The callback stores results in class-level lists; reset them per run.
    BCP.batch_accuracy.clear()
    BCP.batch_f1.clear()

    history = model.fit(xtrain, ytrain, epochs=200, batch_size=32, shuffle=shuffle,
                        validation_data=(xtest, ytest), callbacks=[BCP()], verbose=0)

    plt.plot(range(len(BCP.batch_f1)), BCP.batch_f1, 'r', label='F1')
    plt.title('Batch F1 Graph')
    plt.xlabel('Batch')
    plt.ylabel('F1')
    plt.legend()
    plt.show()

    plotNN(history)

    print("########################   TRAIN   ########################")
    pred = model.predict(xtrain)
    threshold = thresh(pred, ytrain)
    evalNN(threshold, pred, ytrain)

    print("\n\n\n########################   TEST   ########################")
    pred = model.predict(xtest)
    evalNN(threshold, pred, ytest)


def NN(df, xtest, ytest):
    """Train the network on ``df`` in row order (``shuffle=False``) and report.

    Args:
        df: training frame containing the features and a "Churn_Yes" column.
        xtest: test features, also used as Keras validation data.
        ytest: test labels.
    """
    _fit_and_report(df, xtest, ytest, shuffle=False)


def NNSH(df, xtest, ytest):
    """Same as ``NN`` but with the training rows shuffled every epoch.

    Args:
        df: training frame containing the features and a "Churn_Yes" column.
        xtest: test features, also used as Keras validation data.
        ytest: test labels.
    """
    _fit_and_report(df, xtest, ytest, shuffle=True)
    
def plotROC(prob, ytest):
    """Draw the ROC curve of scores ``prob`` against ``ytest``; AUC goes in the legend."""
    curve = metrics.roc_curve(ytest, prob)
    false_pos_rate, true_pos_rate = curve[0], curve[1]
    area = metrics.roc_auc_score(ytest, prob)

    plt.plot(false_pos_rate, true_pos_rate, label="AUC=" + str(area))
    plt.title('ROC and AUC')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc=4)
    plt.show()
    
def findClosest(num):
    """Pop and return the row of the global ``notSBD`` whose 'Prob' is nearest ``num``.

    The chosen row is removed from ``notSBD`` in place, so it cannot be
    returned again by a later call.
    """
    gaps = (notSBD['Prob'] - num).abs()
    nearest = gaps.idxmin()
    match = notSBD.loc[nearest]
    notSBD.drop(nearest, inplace=True)
    return match

def evaluate(acc, pred, bestthresh):
    """Print the confusion matrix, the threshold used, and accuracy/recall/F1.

    Args:
        acc: true labels.
        pred: predicted hard labels.
        bestthresh: threshold to report (printed only, not applied here).
    """
    # Compute everything before printing so a failing metric prints nothing.
    matrix = confusion_matrix(acc, pred)
    summary = [
        ("Accuracy:", accuracy_score(acc, pred)),
        ("Recall:", recall_score(acc, pred)),
        ("F1:", f1_score(acc, pred)),
    ]

    print("Confusion Matrix:")
    print(matrix)
    print("Best Threshold:", bestthresh)
    for label, value in summary:
        print(label, value)


    

def dist(df):
    """Print and return the class counts of ``df`` as ``(churn_yes, churn_no)``."""
    churn_flag = df["Churn_Yes"]
    n_yes, n_no = (int((churn_flag == value).sum()) for value in (1, 0))
    print("Churn Yes:", n_yes)
    print("Churn No:", n_no)
    return (n_yes, n_no)

In [None]:
# Read the Telco churn CSV and drop rows pandas already parsed as NaN.
# NOTE(review): the path is absolute and machine-specific; it has to be
# changed before the notebook can run anywhere else.
churn = pd.read_csv(r"C:\Users\21sla\OneDrive - Dickinson College\Data300\WA_Fn-UseC_-Telco-Customer-Churn.csv")
print("Before:", len(churn))
churn = churn.dropna(how= 'any', axis=0)
print("After:", len(churn))
#churn.head(5)
# Last expression of the cell: shows the column dtypes.
churn.dtypes

In [None]:
# Drop the identifier column; it is not a feature.
clean = churn.drop("customerID", axis=1)

# Blank strings (' ') are turned into NaN so that dropna removes those rows.
clean.replace(' ', np.nan, inplace=True)
print("Before:", len(clean))
clean = clean.dropna(how= 'any', axis=0)
print("After:", len(clean))

# Set data types: TotalCharges becomes numeric; SeniorCitizen is cast to
# object so that get_dummies (next cell) treats it as categorical.
clean["TotalCharges"] = clean["TotalCharges"].astype(float)
clean["SeniorCitizen"] = clean["SeniorCitizen"].astype(object)


In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each one. `clean` is rebound to the encoded frame.
dummies = pd.get_dummies(clean, drop_first= True)
clean = dummies
clean.dtypes

# Logistic Regression Model

In [None]:
# Features (everything except the target) and the binary target column.
x = clean.drop('Churn_Yes', axis=1)
y = clean['Churn_Yes']

In [None]:
# 80/20 train/test split with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
# Preview the training features.
xtrain.head(5)

In [None]:
# Preview the test features.
xtest.head(5)

In [None]:
# Fit the logistic-regression baseline on the training split.
model = LogisticRegression(max_iter=1000)
model.fit(xtrain, ytrain)

In [None]:
# Class-1 probabilities on the test split and the F1-optimal cut-off.
# NOTE(review): the cut-off is tuned on the test labels themselves, so the
# test metrics reported from it are optimistic.
yprob = model.predict_proba(xtest)
threshold = thresh(yprob[:, 1], ytest)
#threshold = 0.5
# `>=` matches the rule thresh() used when scoring each candidate cut-off
# (scores below the cut-off are 0, everything else is 1).
ypred = (yprob[:, 1] >= threshold).astype(int)

In [None]:
# Logistic-regression results on the test split.
evaluate(ytest,ypred, threshold)

# Oversample

In [None]:
# Class-1 probabilities of the logistic model on its own training data.
fittedProb = model.predict_proba(xtrain)[:, 1]

In [None]:
# `>=` matches the rule thresh() used when selecting `threshold`
# (scores below the cut-off are 0, everything else is 1).
fittedClass = (fittedProb >= threshold).astype(int)
# probdf = training features + true label + fitted probability + fitted class.
probdf = xtrain.copy()
probdf["Churn_Yes"] = ytrain
probdf["Prob"] = fittedProb
probdf["Class"] = fittedClass


In [None]:
# Logistic-regression results on the training split, same threshold.
evaluate(ytrain,fittedClass, threshold)

In [None]:
# Training rows the logistic model classified incorrectly.
misses = probdf[probdf["Churn_Yes"] != probdf["Class"]]
len(misses)

In [None]:
###Simple oversample###
# Resample churners with replacement until both classes have the same count.

class1 = probdf[probdf["Churn_Yes"] == 1]
# dist() returns (churn_yes, churn_no).
distribution = dist(probdf)

# Number of extra churners needed = non-churners minus churners.
oversample = class1.sample(n=(distribution[1]-distribution[0]), replace = True)

overDf = pd.concat([probdf, oversample], ignore_index=True)

dist(overDf)

In [None]:
###Oversample misses###
# Append every misclassified training row once more, so each appears twice.

missDf = pd.concat([probdf, misses], ignore_index=True)
len(missDf)


In [None]:
# Preview the training frame with the fitted probability and class.
probdf.head(5)

In [None]:
###Hybrid###
# Balance the classes by resampling only the misclassified churners.
# Relies on `distribution` computed in the simple-oversample cell.

missedClass1 = misses[misses["Churn_Yes"] == 1]
overmiss = missedClass1.sample(n=(distribution[1]-distribution[0]), replace = True)

overMissDf = pd.concat([probdf, overmiss], ignore_index=True)
dist(overMissDf)


# Similar Probability, Different Class

In [None]:
class1 = probdf[probdf["Churn_Yes"] == 1].sort_values(by="Prob")
class0 = probdf[probdf["Churn_Yes"] == 0].sort_values(by="Prob")

# Similar but different (SBD): each churner, in order of increasing fitted
# probability, is followed by the not-yet-used non-churner whose probability
# is closest to it. findClosest() removes that row from notSBD, so notSBD
# ends up holding the non-churners that were never paired.
notSBD = class0.copy()

# Collect index labels and select once at the end. DataFrame.append was
# removed in pandas 2.0, grew the frame quadratically, and rebuilt rows from
# Series, which loses the original column dtypes.
# NOTE(review): assumes probdf's index labels are unique - confirm.
sbd_order = []
for label, prob in class1["Prob"].items():
    sbd_order.append(label)
    sbd_order.append(findClosest(prob).name)

SBD = probdf.loc[sbd_order]
    


In [None]:
# Size check: unpaired non-churners, all non-churners, all rows, paired rows.
print(len(notSBD))
print(len(class0))
print(len(probdf))
print(len(SBD))

In [None]:
# Preview the alternating churner / nearest non-churner ordering.
SBD[["Churn_Yes", "Prob", "Class"]].head(5)

# Hard/Easy To Predict

In [None]:
# Rebuild the per-class frames sorted by fitted probability before they are
# used, so the first/last rows below are guaranteed to be the min/max.
class1 = probdf[probdf["Churn_Yes"] == 1].sort_values(by="Prob")
class0 = probdf[probdf["Churn_Yes"] == 0].sort_values(by="Prob")

# Probability range covered by both classes: the larger of the two minima
# and the smaller of the two maxima. .iloc picks the scalar directly;
# calling float() on a one-element Series is deprecated in recent pandas.
print("Min", max(float(class0["Prob"].iloc[0]), float(class1["Prob"].iloc[0])))
print("Max", min(float(class0["Prob"].iloc[-1]), float(class1["Prob"].iloc[-1])))

In [None]:
def hardToPredict(std, threshold):
    """Split the global ``class0``/``class1`` frames into hard and easy rows.

    A row is "hard to predict" (HTP) when its fitted probability is far on
    the wrong side of the threshold: a non-churner above ``threshold + std``
    or a churner below ``threshold - std``. HTP rows are ordered most-wrong
    first and alternate non-churner, churner for as long as both groups
    last; the rest of the longer group follows.

    Args:
        std: half-width of the band around ``threshold``.
        threshold: classification cut-off.

    Returns:
        ``(HTP, notHTP)`` DataFrames; notHTP is the remaining non-churners
        followed by the remaining churners.
    """
    upperlimit = threshold + std
    lowerlimit = threshold - std

    print("Upperlimit:", upperlimit, "Lowerlimit:", lowerlimit)

    C0HP = class0[class0["Prob"] > upperlimit].sort_values(by="Prob", ascending=False)
    notC0HP = class0[class0["Prob"] <= upperlimit]

    C1LP = class1[class1["Prob"] < lowerlimit].sort_values(by="Prob")
    notC1LP = class1[class1["Prob"] >= lowerlimit]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    notHTP = pd.concat([notC0HP, notC1LP])

    # Interleave positionally instead of appending row by row: stack the two
    # equal-length heads, then pick positions 0, n, 1, n+1, ...
    paired = min(len(C0HP), len(C1LP))
    stacked = pd.concat([C0HP.iloc[:paired], C1LP.iloc[:paired]])
    order = [pos for i in range(paired) for pos in (i, paired + i)]
    interleaved = stacked.iloc[order]

    # Whatever is left of the longer group goes after the alternating part.
    leftover = C1LP.iloc[paired:] if paired == len(C0HP) else C0HP.iloc[paired:]
    HTP = pd.concat([interleaved, leftover])

    print("HTP:", len(HTP))
    print("notHTP:", len(notHTP))

    return (HTP, notHTP)

In [None]:
# Band half-width: half the standard deviation of the fitted probabilities.
stdProb = .5 * probdf["Prob"].std()

# hardToPredict returns (HTP, notHTP).
fullHTP = hardToPredict(stdProb, threshold)

notHTP = fullHTP[1]
HTP = fullHTP[0]

print("HTP:", len(HTP))
print("notHTP:", len(notHTP))

HTP[["Churn_Yes", "Prob", "Class"]].head(5)




In [None]:
# Global copies of the band limits used inside hardToPredict.
upperlimit = threshold + stdProb
lowerlimit = threshold - stdProb

In [None]:
# `save`: rows whose fitted probability lies inside the band around the
# threshold; `notsave`: every other training row.
save = probdf[probdf['Prob'].between(lowerlimit, upperlimit)]

notsave = probdf.copy()
notsave = notsave.drop(save.index)

In [None]:
# Size check: in-band rows, out-of-band rows, all rows.
print(len(save))
print(len(notsave))
len(probdf)

# Weighted Probability

In [None]:
import random
import math

In [None]:
def weightedBatches(higherWeightdf, lowerWeightdf, weightMultiplier, unitsPerBatch, v=False):
    """Order all rows into batches, drawing the higher-weight rows more often.

    Both frames are pooled; rows from ``higherWeightdf`` get sampling weight
    ``weightMultiplier`` and rows from ``lowerWeightdf`` weight 1. Batches of
    ``unitsPerBatch`` rows are drawn without replacement until fewer than a
    full batch remains; the remainder is placed at the end. Every input row
    appears exactly once in the result.

    Args:
        higherWeightdf: rows to favour in early batches.
        lowerWeightdf: all other rows.
        weightMultiplier: sampling weight of the favoured rows.
        unitsPerBatch: number of rows per drawn batch.
        v: when true, print the mean number of favoured rows per batch for
            the first and second half of the batches ("nan" for an empty
            half).

    Returns:
        DataFrame with the input columns and the rows in sampled order.
    """
    row_id = "__row_id__"

    low = lowerWeightdf.copy()
    high = higherWeightdf.copy()
    low["weight"] = 1
    high["weight"] = weightMultiplier

    pool = pd.concat([low, high])
    # Positional id: removing drawn rows by index label would also remove
    # other rows that share the label, and it marks which rows are "high".
    pool[row_id] = list(range(len(pool)))
    n_low = len(low)

    numBatches = len(pool) // unitsPerBatch

    batches = []
    numWeighted = []
    for _ in range(numBatches):
        sample = pool.sample(n=unitsPerBatch, replace=False, weights='weight')
        pool = pool[~pool[row_id].isin(sample[row_id])]
        numWeighted.append(int((sample[row_id] >= n_low).sum()))
        batches.append(sample)

    # One concat at the end instead of growing a frame inside the loop.
    weighted = pd.concat(batches + [pool])

    if v:
        mp = len(numWeighted) // 2
        first, second = numWeighted[:mp], numWeighted[mp:]
        print("Weighted units per batch in first half:",
              sum(first) / len(first) if first else float("nan"))
        print("Weighted units per batch in second half:",
              sum(second) / len(second) if second else float("nan"))

    return weighted.drop(["weight", row_id], axis=1)


In [None]:
# HTP rows get sampling weight 3; batches of 5 rows.
weightedHTP = weightedBatches(HTP, notHTP, 3, 5)


In [None]:
# Preview the first weighted batch.
weightedHTP.head(5)

# Bootstrap

In [None]:
def bootstrap(model, xtest, ytest):
    """Return the mean F1 of ``model`` over 30 bootstrap resamples of the test set.

    Each resample draws ``len(test)`` rows with replacement, picks its own
    best cut-off with ``thresh2`` on that resample, and scores the resulting
    hard labels with ``f1_score``.
    """
    labelled = xtest.copy()
    labelled["Churn_Yes"] = ytest
    n_rows = len(labelled)

    scores = []
    for _ in range(30):
        picks = np.random.choice(n_rows, size=n_rows, replace=True)
        resample = labelled.iloc[picks]
        features = resample.drop('Churn_Yes', axis=1)
        labels = resample['Churn_Yes']

        probs = model.predict(features, verbose=0)
        cutoff = thresh2(probs, labels)
        hard = [0 if p < cutoff else 1 for p in probs]
        scores.append(f1_score(labels, hard))

    return sum(scores) / len(scores)

In [None]:
def thresh2(pred, ytest):
    """Return the cut-off in 0.01..0.99 with the highest F1 (no plotting).

    Scores below a cut-off are labelled 0 and all others 1.

    Args:
        pred: predicted scores; flattened, so both 1-D arrays and the
            (n, 1) output of a Keras model are accepted.
        ytest: true binary labels.

    Returns:
        The best cut-off, or 0 if no cut-off achieved an F1 above zero.
    """
    scores = np.asarray(pred).ravel()

    bestf1 = 0
    bestthresh = 0
    for i in range(1, 100):
        cutoff = i / 100
        # "not below the cut-off" keeps the original rule, including for NaN.
        classPred = (~(scores < cutoff)).astype(int)
        f1 = f1_score(ytest, classPred)
        if f1 > bestf1:
            bestf1 = f1
            bestthresh = cutoff

    return bestthresh

# Neural Network

In [None]:
# Baseline: unmodified training data, shuffled each epoch.
NNSH(probdf.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Simple oversample (classes balanced), shuffled each epoch.
NNSH(overDf.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Oversampled misses, trained in row order (NN uses shuffle=False).
NN(missDf.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Hybrid oversample (misclassified churners resampled), shuffled each epoch.
NNSH(overMissDf.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# SBD pairs first, then the unpaired non-churners; trained in row order.
concatRes = pd.concat([SBD, notSBD]).drop(["Class", "Prob"], axis=1)
NN(concatRes, xtest, ytest)


In [None]:
# Hard-to-predict rows first, then the rest; trained in row order.
concatRes2 = pd.concat([HTP, notHTP]).drop(["Class", "Prob"], axis=1)
NN(concatRes2, xtest, ytest)


In [None]:
# In-band ("save") rows first, then the out-of-band rows; trained in row order.
concatRes3 = pd.concat([save, notsave]).drop(["Class", "Prob"], axis=1)
NN(concatRes3, xtest, ytest)


# Weighted Sample

In [None]:
def WNN(high, low, xtest, ytest):
    """Train for 200 rounds, re-drawing the weighted batch order every round.

    Each round calls ``weightedBatches(high, low, 5, 32)`` to build a fresh
    row order and fits the network for one epoch on it with
    ``shuffle=False``. Per-round metrics and per-batch F1 are plotted, then
    train/test metrics are printed; the training evaluation uses the row
    order of the last round.
    """
    net = Sequential()
    for layer in (Dense(64, activation='relu', input_dim=30),
                  Dropout(0.5),
                  Dense(64, activation='relu'),
                  Dropout(0.5),
                  Dense(1, activation='sigmoid')):
        net.add(layer)

    net.compile(optimizer='adam', loss='binary_crossentropy',
                metrics=['accuracy', f1_m])

    # The callback keeps its results in class-level lists; reset them.
    BCP.batch_accuracy.clear()
    BCP.batch_f1.clear()

    val_data = (xtest, ytest)
    n_rounds = 200
    rounds = range(1, n_rounds + 1)

    # One entry per round; each entry is the one-element list Keras returns.
    curves = {'accuracy': [], 'val_accuracy': [], 'f1_m': [], 'val_f1_m': []}

    for _ in range(n_rounds):
        df = weightedBatches(high, low, 5, 32)
        xtrain = df.drop("Churn_Yes", axis=1)
        ytrain = df["Churn_Yes"]

        history = net.fit(xtrain, ytrain, epochs=1, batch_size=32, shuffle=False,
                          validation_data=val_data, callbacks=[BCP()], verbose=0)
        for key in ('accuracy', 'val_accuracy', 'f1_m', 'val_f1_m'):
            curves[key].append(history.history[key])

    # Accuracy and F1 curves over the rounds.
    plt.plot(rounds, curves['accuracy'], 'b', label='Training Accuracy')
    plt.plot(rounds, curves['val_accuracy'], 'r', label='Validation Accuracy')
    plt.plot(rounds, curves['f1_m'], 'k', label='Training f1')
    plt.plot(rounds, curves['val_f1_m'], 'c', label='Validation f1')
    plt.title('Training and Validation Metrics')
    plt.xlabel('Epochs')
    plt.ylabel('Metric Value')
    plt.legend()
    plt.show()

    # Per-batch training F1 collected by the callback.
    plt.plot(range(len(BCP.batch_f1)), BCP.batch_f1, 'r', label='F1')
    plt.title('Batch F1 Graph')
    plt.xlabel('Batch')
    plt.ylabel('F1')
    plt.legend()
    plt.show()

    print("########################   TRAIN   ########################")
    pred = net.predict(xtrain)
    threshold = thresh(pred, ytrain)
    evalNN(threshold, pred, ytrain)

    print("\n\n\n########################   TEST   ########################")
    pred = net.predict(xtest)
    evalNN(threshold, pred, ytest)

In [None]:
# Weighted sampling with the hard-to-predict rows as the favoured group.
WNN(HTP.drop(["Class", "Prob"], axis=1), notHTP.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Weighted sampling with the in-band ("save") rows as the favoured group.
WNN(save.drop(["Class", "Prob"], axis=1), notsave.drop(["Class", "Prob"], axis=1), xtest, ytest)

# Easy First

In [None]:
# Easy first: unpaired non-churners, then the SBD pairs; exact duplicate
# rows are dropped before training in row order.
concatRes = pd.concat([notSBD, SBD]).drop_duplicates()
NN(concatRes.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Easy first: not-hard rows, then the hard-to-predict rows; exact duplicate
# rows are dropped before training in row order.
concatRes = pd.concat([notHTP, HTP]).drop_duplicates()
NN(concatRes.drop(["Class", "Prob"], axis=1), xtest, ytest)

# Add more Hard to Predict

In [None]:
# Hard-to-predict rows included twice, followed by the rest.
concatRes = pd.concat([HTP, HTP, notHTP])
NN(concatRes.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Hard-to-predict rows included three times, followed by the rest.
concatRes = pd.concat([HTP, HTP, HTP, notHTP])
NN(concatRes.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Hard-to-predict rows included four times, followed by the rest.
concatRes = pd.concat([HTP, HTP, HTP, HTP, notHTP])
NN(concatRes.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Class balance among the misclassified training rows.
dist(misses)

# Cluster

In [None]:
from kmodes.kprototypes import KPrototypes


In [None]:
# Cluster the training features only (label and model outputs removed).
data = probdf.drop(["Churn_Yes","Class", "Prob"], axis=1)
# Column positions treated as numerical.
# NOTE(review): num_cols is never used below; it only documents the split.
num_cols = [0, 1, 2]

# Column positions treated as categorical.
# NOTE(review): assumes exactly 30 feature columns with the three numeric
# ones first - confirm against the get_dummies output.
cat_cols = list(range(3, 30))

# Specify the number of clusters
n_clusters = 3

# Initialize and fit the K-Prototypes model
kproto = KPrototypes(n_clusters=n_clusters, init='Cao', verbose=False)
clusters = kproto.fit_predict(data.values, categorical=cat_cols)

# Add the cluster labels to the feature frame
data['Cluster'] = clusters


In [None]:
# Scatter tenure against monthly charges, coloured by assigned cluster.
plt.scatter(data['tenure'], data['MonthlyCharges'], c=data['Cluster'], cmap='viridis')
plt.xlabel('tenure')
plt.ylabel('MonthlyCharges')
plt.title('K-Prototypes Clustering')
plt.show()

In [None]:
# Scatter both classes and overlay the misclassified rows as red crosses.
plt.scatter(class0['tenure'], class0['MonthlyCharges'], c="g", label="0")
plt.scatter(class1['tenure'], class1['MonthlyCharges'], c="b", label="1")
plt.scatter(misses['tenure'], misses['MonthlyCharges'], c="r", marker = "x", label="Misses")
plt.xlabel('tenure')
plt.ylabel('MonthlyCharges')
plt.title('Misses')
plt.legend()
plt.show()

In [None]:
def perc(df, var):
    """Return the shares of the four (Churn_Yes, var) combinations.

    The result is ``[P(0,0), P(0,1), P(1,0), P(1,1)]`` where the first digit
    is the value of "Churn_Yes" and the second the value of ``var``. Each
    share is the class prior times the fraction of that class with the given
    ``var`` value. Raises ZeroDivisionError when a class has no rows.
    """
    by_class = [df[df["Churn_Yes"] == label] for label in (0, 1)]
    total = len(by_class[0]) + len(by_class[1])

    shares = []
    for group in by_class:
        prior = len(group) / total
        var_is_0 = group[group[var] == 0]
        var_is_1 = group[group[var] == 1]
        shares.append(prior * len(var_is_0) / len(group))
        shares.append(prior * len(var_is_1) / len(group))

    return shares
    

In [None]:
# Preview the misclassified training rows.
misses.head(5)

In [None]:
def plotpie(df, var):
    """Print and draw, as a pie chart, the four shares returned by ``perc``.

    Slice labels are two digits: "Churn_Yes" value, then ``var`` value.
    """
    shares = perc(df, var)
    print(shares)

    plt.pie(
        shares,
        labels=["00", "01", "10", "11"],
        colors=['red', 'green', 'cyan', 'magenta'],
        autopct='%1.1f%%',
        startangle=90,
    )
    # Equal aspect ratio so the pie is drawn as a circle.
    plt.axis('equal')
    plt.title(var)
    plt.show()

In [None]:
# One pie chart per column at positions 3..29.
# NOTE(review): presumably these are the binary dummy columns - confirm.
columns = misses.columns.tolist()

for i in range(3, 30):
    plotpie(probdf, columns[i])


# RWS, Val Batch, Bootstrap

In [None]:
def _build_metric_net(n_features):
    """Build and compile the 64-64-1 dropout network with accuracy/F1/recall/precision."""
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=n_features))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', f1_m, recall_m, precision_m])
    return model


def _fit_with_batch_trace(df, xtest, ytest, shuffle):
    """Train for 5 epochs, plot the per-batch traces, and return them.

    All three class-level lists of the ``BCP`` callback are cleared first and
    the callback is given the validation data, so the returned validation
    trace belongs to this run only.
    NOTE(review): needs a ``BCP`` that accepts validation data and exposes
    ``batch_f1_val``, and an ``evalNN(model, xtest, ytest)`` - confirm that
    those definitions are the ones in scope.
    """
    xtrain = df.drop("Churn_Yes", axis=1)
    ytrain = df["Churn_Yes"]

    # Input width follows the data instead of a hard-coded 30.
    model = _build_metric_net(xtrain.shape[1])

    BCP.batch_accuracy.clear()
    BCP.batch_f1.clear()
    BCP.batch_f1_val.clear()

    val_data = (xtest, ytest)

    print("Starting Model Training")
    history = model.fit(xtrain, ytrain, epochs=5, batch_size=50, shuffle=shuffle,
                        validation_data=val_data, callbacks=[BCP(val_data)], verbose=0)
    print("Model Training Finished")

    plt.plot(range(len(BCP.batch_accuracy)), BCP.batch_accuracy, 'b', label='Accuracy')
    plt.title('Batch Graph')
    plt.xlabel('Batch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    plt.plot(range(len(BCP.batch_f1_val)), BCP.batch_f1_val, 'r', label='Val F1')
    plt.title('Batch Val F1 Graph')
    plt.xlabel('Batch')
    plt.ylabel('F1')
    plt.legend()
    plt.show()

    print("Training ROC:")
    pred = model.predict(xtrain)
    plotROC(pred, ytrain)
    plotNN(history)
    evalNN(model, xtest, ytest)
    print("Bootstrap:", bootstrap(model, xtest, ytest))

    # These are the class-level lists themselves; callers copy them.
    return (BCP.batch_accuracy, BCP.batch_f1, BCP.batch_f1_val)


def NN(df, xtest, ytest):
    """Train in row order (``shuffle=False``) and return the per-batch traces.

    Args:
        df: training frame containing the features and a "Churn_Yes" column.
        xtest: test features, also used as validation data.
        ytest: test labels.

    Returns:
        ``(batch_accuracy, batch_f1, batch_f1_val)`` lists.
    """
    return _fit_with_batch_trace(df, xtest, ytest, shuffle=False)


def NNSh(df, xtest, ytest):
    """Same as ``NN`` but with the training rows shuffled every epoch.

    Args:
        df: training frame containing the features and a "Churn_Yes" column.
        xtest: test features, also used as validation data.
        ytest: test labels.

    Returns:
        ``(batch_accuracy, batch_f1, batch_f1_val)`` lists.
    """
    return _fit_with_batch_trace(df, xtest, ytest, shuffle=True)

def plotBatch(stat, stat2, name):
    """Overlay two per-batch series: ``stat`` (no shuffle) and ``stat2`` (shuffle).

    Args:
        stat: per-batch values of the unshuffled run (blue).
        stat2: per-batch values of the shuffled run (red).
        name: metric name, used in the title and as the y-axis label.
    """
    plt.plot(range(len(stat)), stat, 'b', label='No Shuffle')
    plt.plot(range(len(stat2)), stat2, 'r', label='Shuffle')

    # A space separates the metric name from the fixed part of the title.
    plt.title(name + ' Batch Graph')
    plt.xlabel('Batch')
    plt.ylabel(name)
    plt.legend()
    plt.show()

In [None]:
# HTP rows get sampling weight 3; batches of 50 to match the fit batch size.
weightedHTP = weightedBatches(HTP, notHTP, 3, 50)

In [None]:
# In-band ("save") rows get sampling weight 3; batches of 50.
weightedsave = weightedBatches(save, notsave, 3, 50)

In [None]:
# Reference run: unmodified training data, shuffled each epoch.
normsh = NNSh(probdf.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Copy the traces: the returned lists are shared and cleared by the next run.
# NOTE(review): this rebinds the globals x and y that earlier held the
# feature matrix and labels.
x = normsh[0].copy()
y = normsh[1].copy()
z = normsh[2].copy()

In [None]:
# Weighted-HTP ordering, trained in row order.
weightedHTPData = NN(weightedHTP.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Copy the traces before the next run clears the shared lists.
x1 = weightedHTPData[0].copy()
y1 = weightedHTPData[1].copy()
z1 = weightedHTPData[2].copy()

In [None]:
# Weighted-save ordering, trained in row order.
weightedSaveData = NN(weightedsave.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Copy the traces before the next run clears the shared lists.
x2 = weightedSaveData[0].copy()
y2 = weightedSaveData[1].copy()
z2 = weightedSaveData[2].copy()

In [None]:
# SBD pairs first, then the unpaired non-churners; trained in row order.
concatRes = pd.concat([SBD, notSBD])
SBDData = NN(concatRes.drop(["Class", "Prob"], axis=1), xtest, ytest)

In [None]:
# Copy the traces before the next run clears the shared lists.
x3 = SBDData[0].copy()
y3 = SBDData[1].copy()
z3 = SBDData[2].copy()

In [None]:
# Per-batch validation F1: weighted HTP (blue) against the shuffled baseline (red).
plotBatch(z1, z, "Weighted HTP F1 Validation")

In [None]:
# Per-batch validation F1: weighted save (blue) against the shuffled baseline (red).
plotBatch(z2, z, "Weighted Save F1 Validation")

In [None]:
# Per-batch validation F1: SBD ordering (blue) against the shuffled baseline (red).
plotBatch(z3, z, "SBD F1 Validation")

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import plot_tree
from sklearn import tree

In [None]:
def varImp(t, df):
    """Return the feature importances of fitted model ``t`` as a DataFrame.

    The columns of ``df`` are paired in order with ``t.feature_importances_``;
    the result is indexed by feature name with one 'Importance' column.
    """
    by_feature = dict(zip(df.columns.tolist(), t.feature_importances_))
    return pd.DataFrame.from_dict(by_feature, orient='index', columns=['Importance'])

    

In [None]:
def dtree(x, y, xtest, ytest):
    """Fit a decision tree, a random forest and gradient boosting, and report each.

    For every model the five most important features are printed, followed by
    the metrics on the training data and on the test data. Uses the
    two-argument ``evaluate(acc, pred)``.

    Args:
        x: training features.
        y: training labels.
        xtest: test features.
        ytest: test labels.
    """
    models = [
        ("--------------------Regular Tree--------------------",
         DecisionTreeClassifier()),
        ("--------------------Random Forest--------------------",
         RandomForestClassifier(n_estimators=100, random_state=42)),
        ("--------------------Gradient Boost--------------------",
         GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                    random_state=42, max_depth=2)),
    ]

    # Fit everything first, in the same order as before, then report.
    for _, clf in models:
        clf.fit(x, y)

    for banner, clf in models:
        print(banner)
        importance = varImp(clf, x)
        print(importance.sort_values("Importance", ascending=False).head(5))
        print("Training:")
        evaluate(y, clf.predict(x))
        print("Testing:")
        evaluate(ytest, clf.predict(xtest))
    
    
    
    
    

In [None]:
# Tree models trained on the full training set.
dtree(probdf.drop(["Class", "Prob", "Churn_Yes"], axis=1), probdf["Churn_Yes"], xtest, ytest)


In [None]:
# Tree models trained on the hard-to-predict rows, tested on HTPTest.
# NOTE(review): HTPTest is not defined anywhere before this cell in the
# visible notebook - confirm where it comes from.
dtree(HTP.drop(["Class", "Prob", "Churn_Yes"], axis= 1), HTP["Churn_Yes"], HTPTest.drop(["Class", "Prob", "Churn_Yes"], axis = 1), HTPTest["Churn_Yes"])

In [None]:
# Tree models trained on the SBD pairs only, tested on the full test set.
dtree(SBD.drop(["Class", "Prob", "Churn_Yes"], axis=1), SBD["Churn_Yes"], xtest, ytest)

In [None]:
# Tree models trained on the hard-to-predict rows, tested on the full test set.
dtree(HTP.drop(["Class", "Prob", "Churn_Yes"], axis=1), HTP["Churn_Yes"], xtest, ytest)

In [None]:
# Tree models trained on the in-band ("save") rows, tested on the full test set.
dtree(save.drop(["Class", "Prob", "Churn_Yes"], axis=1), save["Churn_Yes"], xtest, ytest)

In [None]:
# Tree models trained on the not-hard rows, tested on notHTPTest.
# NOTE(review): notHTPTest is not defined anywhere before this cell in the
# visible notebook - confirm where it comes from.
dtree(notHTP.drop(["Class", "Prob", "Churn_Yes"], axis=1), notHTP["Churn_Yes"], notHTPTest.drop(["Class", "Prob", "Churn_Yes"], axis = 1), notHTPTest["Churn_Yes"])

# Specialized models

In [None]:
def hardToPredictTest(std, threshold):
    """Split the global ``class0Test``/``class1Test`` frames into hard and easy rows.

    Same rule as ``hardToPredict``: a non-churner with probability above
    ``threshold + std`` or a churner below ``threshold - std`` is "hard to
    predict" (HTP). HTP rows are ordered most-wrong first and alternate
    non-churner, churner for as long as both groups last; the rest of the
    longer group follows.

    Args:
        std: half-width of the band around ``threshold``.
        threshold: classification cut-off.

    Returns:
        ``(HTP, notHTP)`` DataFrames; notHTP is the remaining non-churners
        followed by the remaining churners.
    """
    class0 = class0Test.copy()
    class1 = class1Test.copy()
    upperlimit = threshold + std
    lowerlimit = threshold - std

    print("Upperlimit:", upperlimit, "Lowerlimit:", lowerlimit)

    C0HP = class0[class0["Prob"] > upperlimit].sort_values(by="Prob", ascending=False)
    notC0HP = class0[class0["Prob"] <= upperlimit]

    C1LP = class1[class1["Prob"] < lowerlimit].sort_values(by="Prob")
    notC1LP = class1[class1["Prob"] >= lowerlimit]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    notHTP = pd.concat([notC0HP, notC1LP])

    # Interleave positionally instead of appending row by row: stack the two
    # equal-length heads, then pick positions 0, n, 1, n+1, ...
    paired = min(len(C0HP), len(C1LP))
    stacked = pd.concat([C0HP.iloc[:paired], C1LP.iloc[:paired]])
    order = [pos for i in range(paired) for pos in (i, paired + i)]
    interleaved = stacked.iloc[order]

    # Whatever is left of the longer group goes after the alternating part.
    leftover = C1LP.iloc[paired:] if paired == len(C0HP) else C0HP.iloc[paired:]
    HTP = pd.concat([interleaved, leftover])

    print("HTP:", len(HTP))
    print("notHTP:", len(notHTP))

    return (HTP, notHTP)

def NN(df, xtest, ytest, t2=False, bo=False):
    """Train a two-hidden-layer network on ``df`` and evaluate it on the test split.

    Parameters
    ----------
    df : DataFrame with the feature columns plus the ``Churn_Yes`` target.
    xtest, ytest : features and labels handed to ``evalNN``.
    t2 : DataFrame with ``Class``, ``Prob`` and ``Churn_Yes`` columns; only
        read when ``bo`` is truthy.
    bo : when truthy, also scatter-plot the network's probabilities for ``t2``.

    Returns
    -------
    Whatever ``evalNN(model, xtest, ytest)`` returns.
    """
    xtrain = df.drop("Churn_Yes", axis=1)
    ytrain = df["Churn_Yes"]

    # Define the model architecture; the input width follows the training
    # frame instead of a hard-coded column count.
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=xtrain.shape[1]))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', recall_m, precision_m, f1_m])

    print("Starting Model Training")
    model.fit(xtrain, ytrain, epochs=200, batch_size=32, shuffle=True, verbose=0)
    print("Model Training Finished")

    if (bo):
        yprob = model.predict(t2.drop(["Class", "Prob", "Churn_Yes"], axis=1))
        print("NN", yprob[0], "acutal", t2["Churn_Yes"].head(1))
        # Scatter the predicted probability of every t2 row against its index
        # label, coloured by the true class
        plt.scatter(yprob, t2.index, c=t2["Churn_Yes"], cmap='viridis')
        plt.xlabel('Predicted Probability')
        plt.ylabel('Sample Unit')
        plt.title('Predicted Probabilities and Class Labels')
        plt.show()

    print("Training ROC:")
    pred = model.predict(xtrain)
    plotROC(pred, ytrain)

    return evalNN(model, xtest, ytest)

def evaluate(acc, pred):
    """Print the confusion matrix and headline metrics; return the matrix.

    ``acc`` holds the true labels and ``pred`` the predicted labels.  The
    four cells are read with row = true class and column = predicted class.
    """
    matrix = confusion_matrix(acc, pred)
    print("Confusion Matrix:")
    print(matrix)

    # Read all four cells before printing any of them
    cells = (
        ("\nTrue Positive:", matrix[1, 1]),
        ("True Negative:", matrix[0, 0]),
        ("False Positive:", matrix[0, 1]),
        ("False Negative:", matrix[1, 0]),
    )
    for label, count in cells:
        print(label, count)

    # Each score is computed immediately before it is printed
    scorers = (
        ("\nAccuracy:", accuracy_score),
        ("True-Positve Rate:", recall_score),
        ("F1 score:", f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(acc, pred))

    return matrix

def evaluateCM(cm):
    """Print accuracy, recall and F1 derived from a 2x2 confusion matrix.

    ``cm`` is indexed as ``cm[true, predicted]``: ``cm[1, 1]`` is read as the
    true positives, ``cm[0, 0]`` as the true negatives, ``cm[0, 1]`` as the
    false positives and ``cm[1, 0]`` as the false negatives.

    A ratio whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 instead of nan.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio counts as 0.0
        return numerator / denominator if denominator else 0.0

    print("\nTotal Confusion Matrix:")
    print(cm)
    tp = cm[1, 1]
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)

    # Calculate precision and recall
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)

    # Calculate F1 score
    f1 = _ratio(2 * (precision * recall), precision + recall)

    print("\nAccuracy:", accuracy)
    print("True-Positve Rate:", recall)
    print("F1 score:", f1)

def evalNN(model, xtest, ytest):
    """Score ``model`` on the test split and return ``evaluate``'s result.

    Plots the test ROC, asks ``thresh`` for a cutoff, and labels a prediction
    0 when it is below the cutoff and 1 otherwise.
    """
    scores = model.predict(xtest)
    print("Test ROC:")
    plotROC(scores, ytest)

    cutoff = thresh(scores, ytest)
    labels = []
    for score in scores:
        if score < cutoff:
            labels.append(0)
        else:
            labels.append(1)

    return evaluate(ytest, labels)

def NN2(df, df2, test, test2):
    """Train one network per (train, test) pair and report the pooled result.

    ``df`` is evaluated on ``test`` and ``df2`` on ``test2``.  The ``Class``
    and ``Prob`` helper columns are removed before training, and the
    ``Churn_Yes`` target is additionally removed from the test features.
    The two confusion matrices returned by ``NN`` are summed and passed to
    ``evaluateCM``, whose result is returned.
    """
    helper_cols = ["Class", "Prob"]

    matrices = []
    for train_part, test_part in ((df, test), (df2, test2)):
        features = test_part.drop(helper_cols + ["Churn_Yes"], axis=1)
        labels = test_part["Churn_Yes"]
        matrices.append(NN(train_part.drop(helper_cols, axis=1), features, labels))

    return evaluateCM(matrices[0] + matrices[1])

def NNTree(df, df2, test, test2):
    """Gradient-boosted trees for the first split, a neural network for the second.

    ``df``/``test`` are the train and test rows for the tree model and
    ``df2``/``test2`` those for the network.  The tree's class-1 probability
    is scatter-plotted for ``test`` and ``test2`` combined, and the combined
    frame is forwarded to ``NN`` so it draws the same plot for the network.
    The two confusion matrices are summed and passed to ``evaluateCM``, whose
    result is returned.
    """
    helper_cols = ["Class", "Prob"]
    non_features = ["Class", "Prob", "Churn_Yes"]

    df = df.drop(helper_cols, axis=1)
    xt = test.drop(non_features, axis=1)
    yt = test["Churn_Yes"]

    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42, max_depth=2)
    gbc.fit(df.drop(["Churn_Yes"], axis=1), df["Churn_Yes"])
    ypred = gbc.predict(xt)
    NN1CM = evaluate(yt, ypred)

    df2 = df2.drop(helper_cols, axis=1)
    xt2 = test2.drop(non_features, axis=1)
    yt2 = test2["Churn_Yes"]

    predict = pd.concat([test, test2])
    yprob = gbc.predict_proba(predict.drop(["Churn_Yes", "Class", "Prob"], axis=1))
    print("tree", yprob[:, 1][0], "acutal", predict["Churn_Yes"].head(1))
    # Scatter the tree's class-1 probability against the row index label,
    # coloured by the true class
    plt.scatter(yprob[:, 1], predict.index, c=predict["Churn_Yes"], cmap='viridis')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Sample Unit')
    plt.title('Predicted Probabilities and Class Labels')
    plt.show()

    NN2CM = NN(df2, xt2, yt2, predict, True)

    return evaluateCM(NN1CM + NN2CM)

    


In [None]:

# Fit a logistic regression on the training split
model = LogisticRegression(max_iter=1000)
model.fit(xtrain, ytrain)


# Score the test split; column 1 of yprob is used as the churn probability
yprob = model.predict_proba(xtest)
# Cutoff chosen by thresh from the test probabilities and test labels
thresholdTest = thresh(yprob[:, 1], ytest)
#threshold = 0.5
# Rows strictly above the cutoff are labelled 1
ypred = (yprob[:, 1] > thresholdTest).astype(int)
# Log results
evaluate(ytest,ypred)

In [None]:
# Assemble one frame per test row: features, true label (Churn_Yes),
# logistic-regression probability (Prob) and thresholded prediction (Class)
test = xtest.copy()
test["Churn_Yes"] = ytest
test["Prob"] = yprob[:, 1]
test["Class"] = ypred

In [None]:
def findClosestTest(num):
    """Remove and return the ``notSBDTest`` row whose ``Prob`` is nearest ``num``.

    Works on the notebook global ``notSBDTest`` and drops the chosen row from
    it in place, so the same row is not handed out again on a later call.
    """
    distance = (notSBDTest['Prob'] - num).abs()
    nearest = distance.idxmin()
    match = notSBDTest.loc[nearest]
    notSBDTest.drop(nearest, inplace=True)
    return match

In [None]:
# Split the scored test rows by true class, each ordered by ascending Prob
class1Test = test[test["Churn_Yes"] == 1].sort_values(by="Prob")
class0Test = test[test["Churn_Yes"] == 0].sort_values(by="Prob")

#Similar but different
# findClosestTest removes each matched class-0 row from notSBDTest in place,
# so what remains afterwards is the unmatched class-0 rows.
notSBDTest = class0Test.copy()

# Collect the rows and build the frame once: row-by-row DataFrame.append was
# removed in pandas 2.0 and re-copies the whole frame on every call.
sbdRows = []
for i in range(len(class1Test)):
    row = class1Test.iloc[i]
    sbdRows.append(row)
    # Pair the class-1 row with the remaining class-0 row closest in Prob
    row2 = findClosestTest(row["Prob"])
    sbdRows.append(row2)
SBDTest = pd.DataFrame(sbdRows)

print(len(SBDTest))
print(len(notSBDTest))

In [None]:
# Band half-width: half the standard deviation of the test probabilities
stdProbTest = .5 * test["Prob"].std()

# hardToPredictTest returns the pair (HTP, notHTP)
fullHTPTest = hardToPredictTest(stdProbTest, thresholdTest)

notHTPTest = fullHTPTest[1]
HTPTest = fullHTPTest[0]

print("HTPTest:", len(HTPTest))
print("notHTPTest:", len(notHTPTest))

# Peek at the label, probability and predicted class of the first HTP rows
HTPTest[["Churn_Yes", "Prob", "Class"]].head(5)


In [None]:
# Same band around the cutoff that hardToPredictTest uses
upperlimitTest = thresholdTest + stdProbTest
lowerlimitTest = thresholdTest - stdProbTest

# saveTest: test rows whose Prob lies between the two limits
saveTest = test[test['Prob'].between(lowerlimitTest, upperlimitTest)]

# notsaveTest: every other test row (removed by index label)
notsaveTest = test.copy()
notsaveTest = notsaveTest.drop(saveTest.index)

print(len(saveTest))
print(len(notsaveTest))
len(test)

In [None]:
#NN2(probdf, probdf, test, test)

In [None]:
#NN2(save, notsave, saveTest, notsaveTest)

In [None]:
#NN2(HTP, notHTP, test, test)

In [None]:
# Tree model on the HTP split, network on the notHTP split
NNTree(HTP, notHTP, HTPTest, notHTPTest)

In [None]:
# Label the training rows: HTP = 1 for hard-to-predict rows, 0 for the rest
HTPDF = HTP.copy()
HTPDF["HTP"] = 1
notHTPDF = notHTP.copy()
notHTPDF["HTP"] = 0
HTP2 = pd.concat([HTPDF, notHTPDF])

# Same labelling for the test rows (HTPDF / notHTPDF are reused as scratch names)
HTPDF = HTPTest.copy()
HTPDF["HTP"] = 1
notHTPDF = notHTPTest.copy()
notHTPDF["HTP"] = 0
HTP3 = pd.concat([HTPDF, notHTPDF])
HTP3

# Features exclude the churn target, the helper columns and the HTP flag itself
xtrainHTP = HTP2.drop(["Churn_Yes", "Prob", "Class", "HTP"], axis = 1)
ytrainHTP = HTP2["HTP"]

xtestHTP = HTP3.drop(["Churn_Yes", "Prob", "Class", "HTP"], axis = 1)
ytestHTP = HTP3["HTP"]

# Classifier that predicts the HTP flag from the features
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42, max_depth = 8)
gbc.fit(xtrainHTP,ytrainHTP)
ypred = gbc.predict(xtestHTP)

HTP3["Model"] = ypred

# Split the test rows by the predicted flag rather than the true one
HTPTestNN = HTP3[HTP3["Model"] == 1].drop(["Model", "HTP"], axis = 1)

notHTPTestNN = HTP3[HTP3["Model"] == 0].drop(["Model", "HTP"], axis = 1)

NNTree(HTP, notHTP, HTPTestNN, notHTPTestNN)



# Weight

In [None]:
def NNTree(df, df2, test, test2):
    """Blend a gradient-boosted tree with a neural network using a tuned weight.

    The tree is fitted on ``df`` and the network on ``df2``.  Both models
    score the combined training rows, ``bestWeight`` picks the blend weights
    and cutoff on those scores, and the same weights and cutoff are then
    applied to the combined ``test`` + ``test2`` rows.  Results for the
    training rows and for the test rows are reported through ``evalNN``.

    All four frames must carry the ``Class``, ``Prob`` and ``Churn_Yes``
    columns; everything else is treated as a feature.  Returns None.
    """
    non_features = ["Class", "Prob", "Churn_Yes"]

    tot = pd.concat([df, df2])
    xTrainTot = tot.drop(non_features, axis=1)
    yTrainTot = tot["Churn_Yes"]

    xtrain = df.drop(non_features, axis=1)
    ytrain = df["Churn_Yes"]

    xtrain2 = df2.drop(non_features, axis=1)
    ytrain2 = df2["Churn_Yes"]

    totTest = pd.concat([test, test2])
    xtest = totTest.drop(non_features, axis=1)
    ytest = totTest["Churn_Yes"]

    #######################################################################################################

    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42, max_depth=2)
    gbc.fit(xtrain, ytrain)

    # Column 1 of predict_proba is the positive-class probability for 0/1
    # labels, which is what the network's sigmoid output estimates; column 0
    # would blend P(class 0) with P(class 1).
    tPred = gbc.predict_proba(xTrainTot)[:, 1]

    #######################################################################################################
    model = Sequential()
    # Input width follows the training frame instead of a hard-coded count
    model.add(Dense(64, activation='relu', input_dim=xtrain2.shape[1]))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', recall_m, precision_m, f1_m])

    model.fit(xtrain2, ytrain2, epochs=200, batch_size=32, shuffle=True, verbose=0)

    nPred = model.predict(xTrainTot)[:, 0]

    # bestWeight returns [blended predictions, cutoff, tree weight, NN weight]
    blended, cutoff, treeWeight, nnWeight = bestWeight(tPred, nPred, yTrainTot)

    evalNN(cutoff, blended, yTrainTot)

    testPred = (treeWeight * gbc.predict_proba(xtest)[:, 1]) + (nnWeight * model.predict(xtest)[:, 0])

    evalNN(cutoff, testPred, ytest)
    
def evalNN(thresh, pred, ytest):
    """Plot the ROC for ``pred``, binarise it at ``thresh`` and print the metrics.

    A score below ``thresh`` becomes class 0, anything else class 1.
    """
    plotROC(pred, ytest)
    labels = []
    for score in pred:
        labels.append(0 if score < thresh else 1)
    evaluate(ytest, labels, thresh)
    
def evaluate(acc, pred, bestthresh):
    """Print the confusion matrix, the cutoff in use and the headline scores.

    ``acc`` holds the true labels, ``pred`` the predicted labels and
    ``bestthresh`` the cutoff that produced ``pred`` (only echoed).
    """
    # Compute everything first, then print the report in one go
    matrix = confusion_matrix(acc, pred)
    scores = {
        "Accuracy:": accuracy_score(acc, pred),
        "Recall:": recall_score(acc, pred),
        "F1:": f1_score(acc, pred),
    }

    print("Confusion Matrix:")
    print(matrix)
    print("Best Threshold:", bestthresh)
    for label, value in scores.items():
        print(label, value)
    
    
def bestWeight(pred1, pred2, acc):
    """Grid-search the blend weight of two score vectors that maximises F1.

    For every weight ``w`` in 0.00, 0.01, ..., 1.00 the blend
    ``pred1 * w + pred2 * (1 - w)`` is scored with ``thresh2`` against the
    true labels ``acc``; the first blend with the strictly highest F1 wins.

    Returns
    -------
    list ``[blend, threshold, weight1, weight2]`` for the winning blend.  If
    no blend scores an F1 above 0, the defaults are returned: weights 0 and 1,
    threshold 0 and the blend those weights produce (rather than a scalar 0,
    which callers could not iterate).
    """
    w1 = 0
    w2 = 1
    f1 = 0
    t = 0
    # Keep the fallback prediction consistent with the fallback weights
    p = (pred1 * w1) + (pred2 * w2)

    for i in range(101):
        weight1 = i / 100
        weight2 = 1 - weight1

        pred = (pred1 * weight1) + (pred2 * weight2)
        # thresh2 returns (best F1, cutoff achieving it)
        candidate_f1, candidate_t = thresh2(pred, acc)

        if candidate_f1 > f1:
            p = pred
            f1 = candidate_f1
            w1 = weight1
            w2 = weight2
            t = candidate_t

    print("Best Weight1:", w1)
    print("Best Weight2:", w2)
    print("Best F1:", f1)
    print("Best Threshold:", t)

    return [p, t, w1, w2]

def thresh2(pred, ytest):
    """Return ``(best F1, cutoff)`` over the cutoffs 0.01, 0.02, ..., 0.99.

    A score below the cutoff is labelled 0, anything else 1.  The first
    cutoff with the strictly highest F1 is kept; ``(0, 0)`` is returned when
    no cutoff scores above 0.
    """
    top_score, top_cutoff = 0, 0

    for step in range(1, 100):
        cutoff = step / 100
        labels = [0 if score < cutoff else 1 for score in pred]
        current = f1_score(ytest, labels)
        if current > top_score:
            top_score, top_cutoff = current, cutoff

    return (top_score, top_cutoff)

In [None]:
# Run the weighted tree + network blend on the HTP / notHTP splits
NNTree(HTP, notHTP, HTPTest, notHTPTest)

# HTP NN

In [None]:
# Label the training rows: HTP = 1 for hard-to-predict rows, 0 for the rest
HTPDF = HTP.copy()
HTPDF["HTP"] = 1
notHTPDF = notHTP.copy()
notHTPDF["HTP"] = 0
HTP2 = pd.concat([HTPDF, notHTPDF])
HTP2

In [None]:
# Same labelling for the test rows (HTPDF / notHTPDF are reused as scratch names)
HTPDF = HTPTest.copy()
HTPDF["HTP"] = 1
notHTPDF = notHTPTest.copy()
notHTPDF["HTP"] = 0
HTP3 = pd.concat([HTPDF, notHTPDF])
HTP3

In [None]:
def dtree(x, y, xtest, ytest):
    """Fit three tree-based classifiers and print a report for each.

    A plain decision tree, a random forest and a gradient-boosting model are
    fitted on ``x``/``y``.  For each one the five largest entries of
    ``varImp`` (sorted on its ``Importance`` column) are printed, followed by
    ``evaluate`` on the training data and on ``xtest``/``ytest``.
    """
    models = (
        ("--------------------Regular Tree--------------------",
         DecisionTreeClassifier()),
        ("--------------------Random Forest--------------------",
         RandomForestClassifier(n_estimators=100, random_state=42)),
        ("--------------------Gradient Boost--------------------",
         GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42, max_depth=2)),
    )

    # Fit every model before reporting, in the same order as they are listed
    for _, clf in models:
        clf.fit(x, y)

    for banner, clf in models:
        print(banner)
        importance = varImp(clf, x)
        print(importance.sort_values("Importance", ascending=False).head(5))
        yfit = clf.predict(x)
        ypred = clf.predict(xtest)
        print("Training:")
        evaluate(y, yfit)
        print("Testing:")
        evaluate(ytest, ypred)
    
def evaluate(acc, pred, bestthresh=None):
    """Print the confusion matrix and headline scores; return the matrix.

    Parameters
    ----------
    acc : true labels.
    pred : predicted labels.
    bestthresh : optional cutoff that produced ``pred``; echoed when given.
        Accepting it keeps three-argument calls elsewhere in the notebook
        working after this redefinition.

    Returns
    -------
    The confusion matrix, so callers that pool matrices can use the result.
    """
    cm = confusion_matrix(acc, pred)
    bestacc = accuracy_score(acc, pred)
    besttp = recall_score(acc, pred)
    bestf1 = f1_score(acc, pred)

    print("Confusion Matrix:")
    print(cm)
    if bestthresh is not None:
        print("Best Threshold:", bestthresh)
    print("Accuracy:", bestacc)
    print("Recall:", besttp)
    print("F1:", bestf1)

    return cm

In [None]:
# Features and target for predicting the HTP flag: the churn target, the
# helper columns and the flag itself are removed from the features
xtrainHTP = HTP2.drop(["Churn_Yes", "Prob", "Class", "HTP"], axis = 1)
ytrainHTP = HTP2["HTP"]



In [None]:
# Tree-model baseline on the plain train/test split
dtree(xtrain, ytrain, xtest, ytest)

# Recompile NN

In [None]:

def NN(df, xtest, ytest):
    """Build a fresh network, train it for one epoch and score the test split.

    ``df`` holds the feature columns plus the ``Churn_Yes`` target.  A new
    model is created and compiled on every call, fitted for a single epoch
    without shuffling (with ``xtest``/``ytest`` as validation data), and its
    test predictions are passed to ``thresh``.

    Returns whatever ``thresh(pred, ytest)`` returns.
    """
    xtrain = df.drop("Churn_Yes", axis=1)
    ytrain = df["Churn_Yes"]

    # Define the model architecture; the input width follows the training
    # frame instead of a hard-coded column count.
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=xtrain.shape[1]))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', recall_m, precision_m, f1_m])

    model.fit(xtrain, ytrain, epochs=1, batch_size=32, shuffle=False,
              validation_data=(xtest, ytest), verbose=0)

    pred = model.predict(xtest)

    return thresh(pred, ytest)

def thresh(pred, ytest):
    """Return the best F1 reachable over the cutoffs 0.01, 0.02, ..., 0.99.

    A score below the cutoff is labelled 0, anything else 1.  Only the F1
    value is returned, not the cutoff that achieved it; 0 is returned when no
    cutoff scores above 0.
    """
    top_score = 0

    for step in range(1, 100):
        cutoff = step / 100
        labels = [0 if score < cutoff else 1 for score in pred]
        current = f1_score(ytest, labels)
        if current > top_score:
            top_score = current

    return top_score

def plotline(x, y, xlab, ylab):
    """Draw ``y`` against ``x`` as a black line and show the figure.

    ``xlab`` and ``ylab`` label the axes; ``ylab`` also names the legend
    entry and is used to build the title.
    """
    heading = ylab + " Line Graph"

    plt.plot(x, y, 'k', label=ylab)
    plt.title(heading)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.legend()
    plt.show()

In [None]:
# One score per NN call of the experiment below
f1recompile = []

In [None]:
# 200 independent runs: this NN builds and compiles a new model on every call
for i in range(200):
    f1recompile.append(NN(probdf.drop(["Prob", "Class"], axis=1), xtest, ytest))


In [None]:
# Plot the score of each of the 200 runs against its run number
plotline(list(range(200)), f1recompile, "Epoch", "F1")

In [None]:
# Define the model architecture
# One network shared at notebook level: two 64-unit ReLU layers, each
# followed by 50% dropout, and a single sigmoid output unit
model = Sequential([
    Dense(64, activation='relu', input_dim=30),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy with Adam; track accuracy plus the custom metrics
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', recall_m, precision_m, f1_m],
)

In [None]:
def NN(df, xtest, ytest):
    """Train the notebook-level ``model`` for one more epoch and score it.

    Unlike a version that builds its own network, this one fits the global
    ``model``, so successive calls keep training the same weights.  Returns
    whatever ``thresh`` returns for the test predictions.
    """
    target = df["Churn_Yes"]
    features = df.drop("Churn_Yes", axis=1)

    model.fit(
        features,
        target,
        epochs=1,
        batch_size=32,
        shuffle=False,
        validation_data=(xtest, ytest),
        verbose=0,
    )

    return thresh(model.predict(xtest), ytest)

In [None]:
# One score per NN call of the shared-model experiment below
f1same = []

In [None]:
# 200 consecutive one-epoch fits of the same notebook-level model
for i in range(200):
    f1same.append(NN(probdf.drop(["Prob", "Class"], axis=1), xtest, ytest))

In [None]:
# Plot the score after each of the 200 one-epoch fits
plotline(list(range(200)), f1same, "Epoch", "F1")

# New Dataset

In [None]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
train = pd.read_csv(r"C:\Users\21sla\OneDrive - Dickinson College\Data300\churn-bigml-80.csv")

In [None]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
test = pd.read_csv(r"C:\Users\21sla\OneDrive - Dickinson College\Data300\churn-bigml-20.csv")

In [None]:
# Data sources:
#https://www.kaggle.com/datasets/mnassrib/telecom-churn-datasets?select=churn-bigml-80.csv
#https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers

# Stack the two files into one frame; the notebook re-splits it further down.
# The original index labels of both files are kept as they are.
churn = pd.concat([train, test])

len(churn)


In [None]:
# Inspect the column types of the combined frame
churn.dtypes

In [None]:
#Drop State
clean = churn.drop("State", axis=1)
clean["Churn"] = clean["Churn"].astype(object)

In [None]:
def hardToPredictTest(class0, class1, std, threshold):
    """Split scored rows into hard-to-predict (HTP) rows and the rest.

    A class-0 row is HTP when its ``Prob`` is above ``threshold + std``; a
    class-1 row is HTP when its ``Prob`` is below ``threshold - std``.

    Parameters
    ----------
    class0, class1 : DataFrames holding the rows of each true class; both
        need a ``Prob`` column.
    std : half-width of the band around ``threshold``.
    threshold : centre of the band.

    Returns
    -------
    (HTP, notHTP) : HTP alternates class-0 rows (highest ``Prob`` first) with
        class-1 rows (lowest ``Prob`` first), followed by whatever is left of
        the longer group.  notHTP holds the remaining class-0 rows followed by
        the remaining class-1 rows.  Rows keep their original dtypes.
    """
    upperlimit = threshold + std
    lowerlimit = threshold - std

    print("Upperlimit:", upperlimit, "Lowerlimit:", lowerlimit)

    C0HP = class0[class0["Prob"] > upperlimit].sort_values(by="Prob", ascending=False)
    notC0Hp = class0[class0["Prob"] <= upperlimit]

    C1LP = class1[class1["Prob"] < lowerlimit].sort_values(by="Prob")
    notC1LP = class1[class1["Prob"] >= lowerlimit]

    # Build each frame with one concat instead of row-by-row DataFrame.append
    # (removed in pandas 2.0 and quadratic in the number of rows).
    notHTP = pd.concat([notC0Hp, notC1LP])

    # Positions inside concat([C0HP, C1LP]): class-0 rows sit at 0..n0-1 and
    # class-1 rows at n0..n0+n1-1.  Interleave the paired prefix, then append
    # the unpaired tail (at most one of the two tail ranges is non-empty).
    n0, n1 = len(C0HP), len(C1LP)
    paired = min(n0, n1)
    order = [pos for i in range(paired) for pos in (i, n0 + i)]
    order.extend(range(paired, n0))
    order.extend(range(n0 + paired, n0 + n1))
    HTP = pd.concat([C0HP, C1LP]).iloc[order]

    print("HTP:", len(HTP))
    print("notHTP:", len(notHTP))

    return (HTP, notHTP)

def NNTree(df, df2, test, test2):
    """Gradient-boosted trees for the first split, a neural network for the second.

    ``df``/``test`` are the train and test rows for the tree model and
    ``df2``/``test2`` those for the network.  All four frames must carry the
    ``Prob`` and ``Churn_Yes`` columns.  The tree's class-1 probability for
    ``test2`` is scatter-plotted, then the confusion matrices of both models
    are summed and passed to ``evaluateCM``, whose result is returned.
    """
    df = df.drop(["Prob"], axis=1)
    xt = test.drop(["Prob", "Churn_Yes"], axis=1)
    yt = test["Churn_Yes"]

    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42, max_depth=2)
    gbc.fit(df.drop(["Churn_Yes"], axis=1), df["Churn_Yes"])
    ypred = gbc.predict(xt)
    # Print the report, but take the matrix from confusion_matrix directly:
    # the most recent evaluate definition in the notebook returns nothing.
    evaluate(yt, ypred)
    NN1CM = confusion_matrix(yt, ypred)

    df2 = df2.drop(["Prob"], axis=1)
    xt2 = test2.drop(["Prob", "Churn_Yes"], axis=1)
    yt2 = test2["Churn_Yes"]

    yprob = gbc.predict_proba(xt2)
    # Scatter the tree's class-1 probability against the row index label,
    # coloured by the true class
    plt.scatter(yprob[:, 1], test2.index, c=test2["Churn_Yes"], cmap='viridis')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Sample Unit')
    plt.title('Predicted Probabilities and Class Labels')
    plt.show()

    NN2CM = NN(df2, xt2, yt2)

    return evaluateCM(NN1CM + NN2CM)

def NN(df, xtest, ytest):
    """Train a two-hidden-layer network on ``df`` and evaluate it on the test split.

    ``df`` holds the feature columns plus the ``Churn_Yes`` target.  After
    200 epochs the training ROC and the test ROC are plotted, the test
    predictions are binarised at the cutoff found by ``thresh2`` and the
    metrics are printed through ``evaluate``.

    Returns
    -------
    The confusion matrix of the binarised test predictions.
    """
    xtrain = df.drop("Churn_Yes", axis=1)
    ytrain = df["Churn_Yes"]

    # Define the model architecture; the input width follows the training
    # frame instead of a hard-coded column count.
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=xtrain.shape[1]))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', recall_m, precision_m, f1_m])

    print("Starting Model Training")
    model.fit(xtrain, ytrain, epochs=200, batch_size=32, shuffle=True, verbose=0)
    print("Model Training Finished")

    print("Training ROC:")
    pred = model.predict(xtrain)
    plotROC(pred, ytrain)

    # Evaluate inline: evalNN and thresh were redefined earlier in the
    # notebook with different signatures/return values, so the former
    # evalNN(model, xtest, ytest) call no longer matches either of them.
    print("Test ROC:")
    testPred = model.predict(xtest)
    plotROC(testPred, ytest)
    # thresh2 returns (best F1, cutoff achieving it)
    cutoff = thresh2(testPred, ytest)[1]
    classPred = [0 if val < cutoff else 1 for val in testPred]
    evaluate(ytest, classPred)
    return confusion_matrix(ytest, classPred)

In [None]:
#Make dummies
dummies = pd.get_dummies(clean, drop_first= True)
clean = dummies
clean.dtypes

In [None]:
# Churn_True is the target; every other column is a feature
x = clean.drop('Churn_True', axis=1)
y = clean['Churn_True']

In [None]:
# 80/20 train/test split with a fixed seed
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
#Training 
model = LogisticRegression(max_iter=10000)
model.fit(xtrain, ytrain)

In [None]:
#Getting probabilities
yprob = model.predict_proba(xtest)
threshold = thresh(yprob[:, 1], ytest)
#threshold = 0.5
ypred = (yprob[:, 1] > threshold).astype(int)


In [None]:
#Log results
evaluate(ytest,ypred, threshold)

In [None]:
# Rebuild one frame per split with the target stored under "Churn_Yes",
# the column name the helper functions in this notebook read
train = xtrain.copy()
train["Churn_Yes"] = ytrain

test = xtest.copy()
test["Churn_Yes"] = ytest

print(len(train))
print(len(test))

# Column 1 of predict_proba is used as the churn probability
trainprob = model.predict_proba(xtrain)[:, 1]
testprob = model.predict_proba(xtest)[:, 1]

train["Prob"] = trainprob
test["Prob"] = testprob


In [None]:
# Peek at the first rows of the training frame
train.head(5)

In [None]:
# Split each frame by true class
trainClass0 = train[train["Churn_Yes"] == 0]
trainClass1 = train[train["Churn_Yes"] == 1]

testClass0 = test[test["Churn_Yes"] == 0]
testClass1 = test[test["Churn_Yes"] == 1]

# Band half-width: half the standard deviation of the training probabilities
std = .5* train["Prob"].std()


# Each call returns the pair (HTP, notHTP); the same band is used for both splits
trainHTP = hardToPredictTest(trainClass0, trainClass1, std, threshold)
testHTP = hardToPredictTest(testClass0, testClass1, std, threshold)


In [None]:
# Tree model on the HTP split, network on the notHTP split (index 0 = HTP, 1 = notHTP)
NNTree(trainHTP[0], trainHTP[1], testHTP[0], testHTP[1])

In [None]:
# Single network on the whole training frame (Prob removed from the features)
NN(train.drop("Prob", axis=1), xtest, ytest)