# Importing Dataset

In [57]:
import pandas as pd;
import numpy as np;
from collections import Counter;

In [58]:
df = pd.read_csv('Customer-Churn-Records.csv')

In [59]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0,0,5,GOLD,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0,0,1,DIAMOND,300
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0,0,5,PLATINUM,771
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1,1,3,SILVER,564
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1,1,2,GOLD,339


# Handling Unwanted Features

In [60]:
df.drop(['CustomerId','RowNumber','Surname','Complain'],axis=1,inplace=True)

In [62]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Satisfaction Score,Card Type,Point Earned
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,3,DIAMOND,456
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1,3,DIAMOND,377
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0,5,GOLD,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0,1,DIAMOND,300
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0,5,PLATINUM,771
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1,3,SILVER,564
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1,2,GOLD,339


# Handling Missing Values

In [63]:
print(df.isnull().sum())

CreditScore           0
Geography             0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Satisfaction Score    0
Card Type             0
Point Earned          0
dtype: int64


In [64]:
# Fill missing values with the mode of each column  
for column in df.columns:  
    mode_value = df[column].mode().iloc[0]  # Get the first mode value  
    df[column].fillna(mode_value, inplace=True)  

In [65]:
print(df.isnull().sum())

CreditScore           0
Geography             0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Satisfaction Score    0
Card Type             0
Point Earned          0
dtype: int64


# Handling Categorical Values

In [66]:
import pandas as pd  
from sklearn.preprocessing import LabelEncoder  


# Identify categorical columns  
categorical_columns = df.select_dtypes(include=['object', 'category']).columns  

# Initialize LabelEncoder  
label_encoder = LabelEncoder()  

# Apply LabelEncoder to categorical columns  
for column in categorical_columns:  
    df[column] = label_encoder.fit_transform(df[column])  



In [67]:
print(df['Exited'])
Counter(df['Exited'])

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64


Counter({0: 7962, 1: 2038})

# Normalization

In [68]:
import pandas as pd  
from sklearn.preprocessing import StandardScaler 

y = df['Exited']
X = df.drop('Exited',axis=1) 
# Here we assume any numeric column needs scaling  
numerical_columns = X.select_dtypes(include=['int64', 'float64','int32']).columns  

# Step 3: Apply Z-score Normalization  
scaler = StandardScaler()  
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])  



In [69]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Card Type,Point Earned
0,-0.326221,-0.901886,-1.095988,0.293517,-1.041760,-1.225848,-0.911583,0.646092,0.970243,0.021886,-0.721130,-1.339533,-0.630839
1,-0.440036,1.515067,-1.095988,0.198164,-1.387538,0.117350,-0.911583,-1.547768,0.970243,0.216534,-0.009816,-1.339533,-0.666251
2,-1.536794,-0.901886,-1.095988,0.293517,1.032908,1.333053,2.527057,0.646092,-1.030670,0.240687,-0.009816,-1.339533,-1.015942
3,0.501521,-0.901886,-1.095988,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.030670,-0.108918,1.412812,-0.445319,-1.135457
4,2.063884,1.515067,-1.095988,0.388871,-1.041760,0.785728,-0.911583,0.646092,0.970243,-0.365276,1.412812,-0.445319,-0.803472
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,-0.901886,0.912419,0.007457,-0.004426,-1.225848,0.807737,0.646092,-1.030670,-0.066419,-1.432445,-1.339533,-1.356781
9996,-1.391939,-0.901886,0.912419,-0.373958,1.724464,-0.306379,-0.911583,0.646092,0.970243,0.027988,1.412812,0.448895,0.728088
9997,0.604988,-0.901886,-1.095988,-0.278604,0.687130,-1.225848,-0.911583,-1.547768,0.970243,-1.008643,-0.009816,1.343109,-0.188192
9998,1.256835,0.306591,0.912419,0.293517,-0.695982,-0.022608,0.807737,0.646092,-1.030670,-0.125231,-0.721130,-0.445319,-1.184148


# Data Balancing

In [93]:
from imblearn.combine import SMOTEENN  
smote_enn = SMOTEENN(random_state=42)  
X, y = smote_enn.fit_resample(X, y)
Counter(y)

Counter({1: 7286, 0: 4976})

# SVM Classifier

In [97]:
import pandas as pd  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder, StandardScaler  
from sklearn.svm import SVC  
from sklearn.metrics import accuracy_score, classification_report  



# Step 3: Split the dataset into training and test sets  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  
 

# Step 5: Create and train the SVM model  
model = SVC(kernel='rbf', class_weight='balanced', random_state=42)  # You can change the kernel to 'rbf', 'poly', etc.  
model.fit(X_train, y_train)  

# Step 6: Make predictions  
y_pred = model.predict(X_test)  

# Step 7: Evaluate the model  
accuracy = accuracy_score(y_test, y_pred)  
report = classification_report(y_test, y_pred)  

print("\nAccuracy:", accuracy)  
print("\nClassification Report:\n", report)


Accuracy: 0.9200978393803506

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.90      1013
           1       0.94      0.92      0.93      1440

    accuracy                           0.92      2453
   macro avg       0.92      0.92      0.92      2453
weighted avg       0.92      0.92      0.92      2453



# Random Forest Classifier

In [100]:
import pandas as pd  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder, StandardScaler  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score, classification_report  



 

# Step 3: Split the dataset into training and test sets  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)  

# Step 4: Scale features if needed  
scaler = StandardScaler()  
X_train = scaler.fit_transform(X_train)  
X_test = scaler.transform(X_test)  

# Step 5: Create and train the Random Forest model  
model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators  
model.fit(X_train, y_train)  

# Step 6: Make predictions  
y_pred = model.predict(X_test)  

# Step 7: Evaluate the model  
accuracy = accuracy_score(y_test, y_pred)  
report = classification_report(y_test, y_pred)  

print("\nAccuracy:", accuracy)  
print("\nClassification Report:\n", report)  

# Step 8: Feature importance  
importances = model.feature_importances_  
feature_names = X.columns  

# Combine feature names and their importances into a DataFrame  
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})  

# Sort the DataFrame by importance  
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)  

# Display the feature importances  
print("\nFeature Importances:")  
print(feature_importance_df)


Accuracy: 0.9442783365044849

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      1493
           1       0.94      0.96      0.95      2186

    accuracy                           0.94      3679
   macro avg       0.94      0.94      0.94      3679
weighted avg       0.94      0.94      0.94      3679


Feature Importances:
               Feature  Importance
3                  Age    0.261296
6        NumOfProducts    0.212260
5              Balance    0.105671
1            Geography    0.061448
12        Point Earned    0.052594
8       IsActiveMember    0.049413
9      EstimatedSalary    0.049265
0          CreditScore    0.047886
4               Tenure    0.045795
10  Satisfaction Score    0.040649
11           Card Type    0.034464
2               Gender    0.028832
7            HasCrCard    0.010428


# Naive Bayes Imported

In [106]:
import pandas as pd  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder, StandardScaler  
from sklearn.naive_bayes import GaussianNB  
from sklearn.metrics import accuracy_score, classification_report  
from imblearn.combine import SMOTEENN  

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)  
  
# Step 4: Create and train the Naive Bayes model  
model = GaussianNB()  
model.fit(X_train, y_train)  

# Step 5: Make predictions  
y_pred = model.predict(X_test)  

# Step 6: Evaluate the model  
accuracy = accuracy_score(y_test, y_pred)  
report = classification_report(y_test, y_pred)  

print("\nAccuracy:", accuracy)  
print("\nClassification Report:\n", report)  


Accuracy: 0.8320423970648186

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.74      0.78      1013
           1       0.83      0.90      0.86      1440

    accuracy                           0.83      2453
   macro avg       0.83      0.82      0.82      2453
weighted avg       0.83      0.83      0.83      2453



# NaiveBayes From Scratch

In [107]:
import numpy as np

class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = np.mean(X_c, axis=0)
            self.var[c] = np.var(X_c, axis=0)
            self.priors[c] = len(X_c) / len(X)

    def gaussian_pdf(self, x, mean, var):
        eps = 1e-9  # Avoid division by zero
        coef = 1.0 / np.sqrt(2 * np.pi * var + eps)
        exponent = np.exp(-(x - mean)**2 / (2 * var + eps))
        return coef * exponent

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = []
            for c in self.classes:
                prior = np.log(self.priors[c])
                conditional = np.sum(np.log(self.gaussian_pdf(x, self.mean[c], self.var[c])))
                posterior = prior + conditional
                posteriors.append(posterior)
            predictions.append(self.classes[np.argmax(posteriors)])
        return np.array(predictions)

from imblearn.combine import SMOTEENN  
smote_enn = SMOTEENN(random_state=42)  
XX, yy = smote_enn.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(XX,yy, test_size=0.2, random_state=42)  
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train = X_train.to_numpy()  
X_test = X_test.to_numpy()  

# Initialize and train the custom model
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)

# Predict
y_pred = gnb.predict(X_test)

# Evaluate
print("\nAccuracy:", accuracy)  
print(classification_report(y_test, y_pred))



Accuracy: 0.8320423970648186
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1408
           1       0.84      0.85      0.84      1433

    accuracy                           0.84      2841
   macro avg       0.84      0.84      0.84      2841
weighted avg       0.84      0.84      0.84      2841



# Logistic Regression Imported

In [108]:
import pandas as pd  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder, StandardScaler  
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import accuracy_score, classification_report  


# Step 3: Split the dataset into training and test sets  
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2, random_state=42, stratify=yy)  

# Step 4: Scale features  
scaler = StandardScaler()  
X_train = scaler.fit_transform(X_train)  
X_test = scaler.transform(X_test)  

# Step 5: Create and train the Logistic Regression model  
model = LogisticRegression(random_state=42)  
model.fit(X_train, y_train)  

# Step 6: Make predictions  
y_pred = model.predict(X_test)  

# Step 7: Evaluate the model  
accuracy = accuracy_score(y_test, y_pred)  
report = classification_report(y_test, y_pred)  

print("\nAccuracy:", accuracy)  
print("\nClassification Report:\n", report)


Accuracy: 0.7866948257655755

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.79      0.78      1392
           1       0.80      0.78      0.79      1449

    accuracy                           0.79      2841
   macro avg       0.79      0.79      0.79      2841
weighted avg       0.79      0.79      0.79      2841



# Logistic Regression From Scratch

In [110]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2, random_state=42)

# Add bias term to the features (intercept)
X_train_biased = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_biased = np.c_[np.ones(X_test.shape[0]), X_test]

# Initialize parameters
weights = np.zeros(X_train_biased.shape[1])
learning_rate = 0.01
num_epochs = 1000

# Train the model using gradient descent
for epoch in range(num_epochs):
    # Compute predictions
    predictions = X_train_biased.dot(weights)
    
    # Compute the gradient
    errors = predictions - y_train
    gradient = X_train_biased.T.dot(errors) / len(y_train)
    
    # Update weights
    weights -= learning_rate * gradient

# Make predictions on the test set
test_predictions = X_test_biased.dot(weights)

# Convert predictions to binary (0 or 1) using a threshold of 0.5
y_pred = (test_predictions >= 0.5).astype(int)

# Print the classification report
print("\nAccuracy:", accuracy)  
print(classification_report(y_test, y_pred))


Accuracy: 0.7866948257655755
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      1408
           1       0.79      0.76      0.78      1433

    accuracy                           0.78      2841
   macro avg       0.78      0.78      0.78      2841
weighted avg       0.78      0.78      0.78      2841

