# <font color="#000000">Credit Decisioning Model</font>


In [None]:

import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.metrics import accuracy_score, classification_report

: 

In [None]:
# Read the raw loan-application records into the `data` DataFrame.
data = pd.read_csv(filepath_or_buffer="train.csv")

: 

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

: 

In [None]:

print("The shape =", data.shape)

# Dataset dimensions and statistics.
# `data` is the frame loaded above; `loan_data` does not exist yet at this point.
num_rows, num_cols = data.shape
num_features = num_cols - 1        # every column except one (the target)
total_data = num_rows * num_cols   # total number of cells in the frame

print("No. of rows:", num_rows)
print("No. of columns:", num_cols)
print("No of features:", num_features)
print("Total data:",total_data)


: 

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
data.info()

: 

In [None]:
# Summary statistics restricted to the object-typed (categorical) columns.
data.describe(include=[object])

: 

In [None]:
# Numerical data: .T transposes the summary so each variable is one row,
# and .round(2) rounds every statistic to two decimal places.
data.describe().T.round(2)

: 

**Description**

* **count:** The number of non-null values in the dataset for a given variable.
* **mean:** The average value, calculated by summing all values and dividing by the count.
* **Standard Deviation (std):** Measures how spread out the data is around the mean. A higher standard deviation indicates greater variability.
* **min:** The smallest value present in the dataset for that variable.
* **25%:** The first quartile. 25% of the data falls below this value.
* **50%:** The median or second quartile. 50% of the data falls below this value.
* **75%:** The third quartile. 75% of the data falls below this value.
* **max:** The largest value present in the dataset for that variable.

Now that we have finished inspecting the data, let's move on to the next step: exploring the data.

**Exploratory Data Analysis**, we'll be visualising the data here to understand more in depth about the data.

**Visualising the Data**

In [None]:
# The mean applicant income is the threshold that separates the two groups.
applicant_income = data['ApplicantIncome']
mean_income = applicant_income.mean()
print("Mean Income:", mean_income)

# Number of applicants strictly above the mean, and at or below it.
high_income_count = applicant_income.gt(mean_income).sum()
low_income_count = applicant_income.le(mean_income).sum()

print('Applicants with Higher Income than mean:', high_income_count)
print('Applicants with Lower Income than mean:', low_income_count)

# Bar chart comparing the size of the two groups.
group_names = ['High Income', 'Low Income']
group_sizes = [high_income_count, low_income_count]
plt.figure(figsize=(8, 4))
sns.barplot(x=group_names, y=group_sizes, palette=["skyblue", "salmon"])
plt.title('Income Distribution')
plt.ylabel('Number of Applicants')
plt.show()

: 

In [None]:
# Count plot of the Self_Employed column; keep the Axes that seaborn returns
# instead of looking it up again through plt.gca().
ax = sns.countplot(x="Self_Employed", data=data, palette=["skyblue", "coral", "lightgreen"])

# Add labels and title
ax.set_xlabel('Self-Employed Status')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Self-Employment')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels, then render the figure.
plt.tight_layout()
plt.show()

: 

In [None]:
# Count plot of the Credit_History column with a custom color palette; keep
# the Axes that seaborn returns instead of looking it up through plt.gca().
ax = sns.countplot(x="Credit_History", data=data, palette=["skyblue", "orange"])

# Add labels and title
ax.set_xlabel('Credit History')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Credit History')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels
plt.tight_layout()

# Show the plot
plt.show()

: 

In [None]:
# Count plot of the Property_Area column; keep the Axes that seaborn returns
# instead of looking it up again through plt.gca().
ax = sns.countplot(x="Property_Area", data=data, palette=["skyblue", "orange", "green"])

# Add labels and title
ax.set_xlabel('Property Area')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Property Areas')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels
plt.tight_layout()
plt.show()

: 

In [None]:
# Count plot of the Loan_Amount_Term column; keep the Axes that seaborn
# returns instead of looking it up again through plt.gca().
ax = sns.countplot(x="Loan_Amount_Term", data=data, palette="viridis")

# Add labels and title
ax.set_xlabel('Loan Amount Term')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Loan Amount Terms')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels, then render the figure.
plt.tight_layout()
plt.show()

: 

In [None]:
# Count plot of the Loan_Status column with a custom color palette; keep the
# Axes that seaborn returns instead of looking it up through plt.gca().
ax = sns.countplot(x="Loan_Status", data=data, palette=["skyblue", "orange"])

# Add labels and title
ax.set_xlabel('Loan Status')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Loan Statuses')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels
plt.tight_layout()

# Show the plot
plt.show()

: 

In [None]:
# Count plot of the Gender column; the two-color palette assumes two
# categories (Male and Female). Keep the Axes that seaborn returns instead of
# looking it up again through plt.gca().
ax = sns.countplot(x="Gender", data=data, palette=["skyblue", "pink"])

# Add labels and title
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Genders')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels
plt.tight_layout()

# Show the plot
plt.show()

: 

In [None]:
# Applicants per marital status. dropna=False keeps missing values as their
# own slice; value_counts returns the counts in descending order.
MarriedAnalysis = data["Married"].value_counts(dropna=False)

# Build the slice labels from the index of the counts, so labels always match
# the slices that are actually present (and their order).
status_names = {"Yes": "Married", "No": "Single"}
slice_labels = [
    "NaN" if pd.isna(status) else status_names.get(status, str(status))
    for status in MarriedAnalysis.index
]

plt.figure(figsize=(10, 5))

# Define colors for the pie chart slices (adjust as needed)
colors = sns.color_palette("pastel")  # Using a pastel color palette

plt.pie(MarriedAnalysis,
        labels=slice_labels,
        startangle=140,  # Adjust start angle for better visual arrangement
        autopct='%1.1f%%',
        colors=colors)

plt.axis('equal')
plt.title('Marital Status Distribution')
plt.show()

: 

In [None]:
# Count plot of the Dependents column; keep the Axes that seaborn returns
# instead of looking it up again through plt.gca().
ax = sns.countplot(x="Dependents", data=data, palette=["skyblue", "orange", "green", "red", "purple"])

# Add labels and title
ax.set_xlabel('Number of Dependents')
ax.set_ylabel('Number of Applicants')
ax.set_title('Distribution of Dependents')

# Add count labels on top of the bars. Bar heights can come back as floats,
# so they are cast to int to print "42" rather than "42.0".
for bar in ax.patches:
    count = int(bar.get_height())
    ax.annotate(f'{count}',
                xy=(bar.get_x() + bar.get_width() / 2, count),
                ha='center',
                va='bottom')

# Adjust layout to prevent overlapping labels
plt.tight_layout()

# Show the plot
plt.show()


: 

In [None]:
# Applicants per education level; the index of this Series supplies the
# slice labels below.
EducationAnalysis = data["Education"].value_counts()

plt.figure(figsize=(8, 8))  # Adjust figure size as needed

# Define colors for the pie chart slices
colors = sns.color_palette("pastel")  # Using a pastel color palette

# Plot the pie chart
plt.pie(EducationAnalysis, labels=EducationAnalysis.index, autopct='%1.1f%%', startangle=140, colors=colors)

# Add title
plt.title('Distribution of Education Levels')

# Equal aspect ratio ensures a circular pie chart
plt.axis('equal')

# Show the plot
plt.show()

: 

In [None]:
# Histograms used to look for outliers in the numerical columns
# (applicant income, co-applicant income, loan amount).

def plot_distribution(data, column, title):
    """Plot a density-normalised histogram of one numerical column.

    Parameters
    ----------
    data : DataFrame that contains `column`.
    column : name of the numerical column to plot; also used as the x label.
    title : title shown above the figure.

    Missing values are dropped before plotting so they cannot affect the
    automatically chosen bin range.
    """
    values = data[column].dropna()
    plt.figure(figsize=(8, 4))
    plt.hist(values, bins=20, density=True, alpha=0.7)  # Adjust bins as needed
    plt.title(title)
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.grid(True)
    plt.show()

# All three plots read from `data`, the frame loaded at the top of the notebook.
plot_distribution(data, "ApplicantIncome", "Applicant Income Distribution")
plot_distribution(data, "CoapplicantIncome", "Coapplicant Income Distribution")
plot_distribution(data, "LoanAmount", "Loan Amount Distribution")


: 

Correlation Analysis
It is a statistical technique used to evaluate the relationship between two variables in a data set. The correlation matrix is a table in which every cell contains a correlation coefficient: 1 indicates a perfect positive relationship, 0 indicates no linear relationship, and -1 indicates a perfect negative (inverse) relationship.


In [None]:
# Pairwise correlation coefficients between the numerical columns only.
correlation_matrix = data.corr(numeric_only=True)

# Draw the matrix as an annotated heatmap on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(15, 7.5))
sns.heatmap(correlation_matrix, annot=True, cmap='flare', ax=ax)
ax.set_title('Correlation Matrix')

# Display the heatmap
plt.show()

: 

There is positive correlation between **Loan Amount** and **Applicant Income**

## <font size="4" face="WildWest">3. Data Relationships Analysis</font>

In [None]:
# Cross-tabulate credit history against loan status (kept as a table for reference).
contingency_table = pd.crosstab(index=data['Credit_History'], columns=data['Loan_Status'])

# Grouped count plot drawn straight from the raw rows: one bar per
# (credit history, loan status) combination.
ax = sns.countplot(data=data, x='Credit_History', hue='Loan_Status')

ax.set_xlabel('Credit History')
ax.set_ylabel('Count of Applicants')
ax.set_title('Relationship between Credit History and Loan Status')

# Show the plot
plt.show()

: 

The analysis suggests that a **good credit history** significantly *increases the chances of loan approval*, because the proportion of approved applicants is much higher among those with a good credit history than among those with a bad one.

A good credit history is positively associated with the chances of loan approval.

In [None]:
# Cross-tabulate property area against loan status. Both columns come from
# `data`; `loan_data` is not defined at this point in the notebook.
property_loan_table = pd.crosstab(data['Property_Area'], data['Loan_Status'])
sns.countplot(x='Property_Area', hue='Loan_Status', data=data)

# Add labels and title
plt.xlabel('Property Area')
plt.ylabel('Count of Applicants')
plt.title('Relationship between Property Area and Loan Status')

# Show the plot
plt.show()

: 

Most of the loans that were accepted are for properties in **Semiurban** areas, compared to *Urban* and *Rural* areas.

In [None]:
# Cross-tabulate Gender against Married and draw it as a grouped bar chart.
gender_by_married = pd.crosstab(data.Gender, data.Married, dropna=True)
ax = gender_by_married.plot(kind="bar", figsize=(8, 4))

# Title and axis labels
ax.set_title('Gender VS Married')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')

# Keep the x-axis tick labels horizontal
plt.xticks(rotation=0)

# Display the plot
plt.show()


: 

Most **male applicants** are already **married**, unlike **female applicants**. Also, the number of **unmarried male applicants** is **higher** than the number of **unmarried female applicants**.

In [None]:
# Create a cross-tabulation of the 'Education' and 'Loan_Status' variables.
# Both columns come from `data`; `loan_data` is not defined at this point.
pd.crosstab(data.Education, data.Loan_Status).plot(kind="bar", figsize=(8, 5))

# Add a title to the plot
plt.title('Education Status VS Loan Status')

# Label the x-axis
plt.xlabel('Education')

# Label the y-axis
plt.ylabel('Count')

# Rotate the x-axis labels to avoid overlap
plt.xticks(rotation=0)

# Display the plot
plt.show()

: 

The analysis suggests that being a **graduate applicant** significantly increases the chances of *loan approval*, because the proportion of approved applicants is much higher among graduates than among non-graduates.

- Remove Unneeded Features
- Handling Missing Values
- Encoding Categorical Variables
- Handling Outliers
- Handling Duplicates

## <font size="4" face="WildWest">1. Remove Unneeded Features</font>
As Loan_ID is completely unique and not correlated with any of the other column, So we will drop it using .drop() function.
- axis = 1 means column , axis = 0 means row.
- inplace = True means the changes are reflected in the original DataFrame, inplace = False (default) means leaving the original DataFrame unchanged.

In [None]:
# Dropping Loan_ID column.
# errors="ignore" makes the cell safe to re-run after the column is gone.
data = data.drop(columns=['Loan_ID'], errors="ignore")

# `loan_data` is the working copy that all preprocessing steps below modify;
# taking a copy keeps `data` untouched by those later steps.
loan_data = data.copy()

# Check the Loan_ID is dropped (info() prints its own summary and returns None)
loan_data.info()


: 

## <font size="4" face="WildWest">2. Handling Missing Values</font>

**Missing values** are data points that are absent or not recorded for certain variables in a dataset. This can happen because of mistakes such as data entry errors, equipment failures, or simply because the information is not available.

 **How to detect missing values?**
 - Look for blank cells and use data analysis tools that can identify missing values, such as the .isnull() function.

 **Decision options:**
1. Keep them 
    - Not ideal, as missing values can lead to biased analyses and inaccurate predictions.
2. Reassign new values to them (Using specific techniques)
    - Imputation (Y):
        - Replace missing values with estimated or calculated values based on the available data. This can be done using various methods such as:
            1. **Mean/Median Imputation:** Replace with the average (mean) or middle value (median) for numerical data.
            2. **Mode Imputation:** Replace with the most frequent value.
    - Forward/Backward Fill (N):
        - For time-series data, missing values can be filled with the last observed value (forward fill) or the next observed value (backward fill).
    - Interpolation (N):
        - Estimate missing values based on the trend or pattern observed in the existing data points.
3. Delete them (Sometimes necessary, especially if missing values are extensive and cannot be reasonably imputed without introducing significant bias.)

In [None]:
# Check for missing values: number of null entries per column
null_counts = data.isnull().sum()

# Display the number of null values.
# Plain print() is used because no `colored` helper is imported in this notebook.
print(null_counts)
print("_________________________________________________________________")
print(f"Totally, there are {null_counts.sum()} null values in the dataset.")

: 

<font size="4" face="WildWest">There are two types of data:</font>
- Numerical Data
- Categorical Data

In [None]:
# Fill the missing values (.fillna replaces NaN entries).
#
# Chosen strategy per column, with the alternatives that were tried:
#
#   Credit_History   -> mode (most common value)
#       alternatives: .mean(), .median(), fillna(method='ffill'),
#                     fillna(method='bfill'), .interpolate(method='linear')
#   LoanAmount       -> mean (the average)
#       alternatives: .median(), .mode()[0]
#   Loan_Amount_Term -> mode (most common value)
#       alternatives: .mean(), .median()
#   Gender, Married, Dependents, Self_Employed (categorical) -> mode
#
# Deleting the rows instead was also tried and gave lower accuracy:
#   loan_data.dropna(subset=['Gender', 'Married', 'Dependents', 'Self_Employed',
#                            'LoanAmount', 'Loan_Amount_Term', 'Credit_History'])

# Mean imputation for the loan amount.
loan_amount_mean = loan_data["LoanAmount"].mean()
loan_data["LoanAmount"] = loan_data["LoanAmount"].fillna(loan_amount_mean)

# Mode imputation for every other column with missing values.
# .mode() can return several values; [0] selects the first one.
mode_imputed_columns = [
    "Credit_History",
    "Loan_Amount_Term",
    "Gender",
    "Married",
    "Dependents",
    "Self_Employed",
]
for column in mode_imputed_columns:
    most_common_value = loan_data[column].mode()[0]
    loan_data[column] = loan_data[column].fillna(most_common_value)

: 

In [None]:
# Recount the null entries per column now that imputation has run,
# and display the result.
null_counts = loan_data.isna().sum()
print(null_counts)

: 

## <font size="4" face="WildWest">3. Encoding Categorical Variables</font>

In [None]:
# Convert categorical variables into dummy/indicator variables ("one-hot" encoding)
loan_data = pd.get_dummies(loan_data)

# For each two-level variable one indicator is enough, so the complementary
# indicator column is dropped.
redundant_indicators = ['Gender_Female',
                        'Married_No',
                        'Education_Not Graduate',
                        'Self_Employed_No',
                        'Loan_Status_N']
loan_data = loan_data.drop(columns=redundant_indicators)

# Rename the remaining indicators back to the original variable names
newColunmsNames = {'Gender_Male': 'Gender',
                   'Married_Yes': 'Married',
                   'Education_Graduate': 'Education',
                   'Self_Employed_Yes': 'Self_Employed',
                   'Loan_Status_Y': 'Loan_Status'}
loan_data = loan_data.rename(columns=newColunmsNames)

# Display the column names and shape of the transformed dataset.
# Plain print() is used because no `colored` helper is imported in this notebook.
column_names = loan_data.columns.tolist()

print("Column Names:")

for col in column_names:
    print(f"- {col}")

print("The shape =",loan_data.shape)
print("_______________________________________________")

# Display the first 5 rows of the transformed target variable
print("Transformed Target Variable (Y):")
print(loan_data['Loan_Status'].head())  # First 5 rows by default

: 

## <font size="4" face="WildWest">4. Handling Outliers</font>

**Outliers** are data points that significantly differ from the rest of the observations in a dataset, often due to errors in data collection or genuine extreme values.

 **How to detect outliers?**
- Visualization data (Numerical values)

 **Decision options:**
1. Keep them (Not the best solution, I try it and not get high accuracy, "There is better")
2. Reassign new values to them (Using specific techniques)
    - Capping Outliers (N)
        - Instead of removing outliers entirely, replace extreme values with more reasonable thresholds.
    - Robust Scaling (Y)
        - Use scaling methods less sensitive to outliers (e.g., IQR scaling, standardization with robust estimators).
            - Robust Scaling Function
            - IQR scaling
    - Replace with mean (N)
        - Replacing the outliers with the mean value
3. Delete them (N)

Showing outliers in this process before making any handling, but in another graph than the previous graph I used.

In [None]:
# Create the figure and its Axes at the desired size
fig, ax = plt.subplots(figsize=(15, 10))

# Only variables that have outliers
outlier_column_names = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
outliersColumns = loan_data.get(outlier_column_names)

# One jittered strip of red points per selected column
sns.stripplot(data=outliersColumns, color="red", jitter=0.3, size=5, ax=ax)

# Set the title
ax.set_title("Outliers")

# Show the plot
plt.show()

: 

**The Process**

In [None]:
# Outlier handling: drop every row in which a continuous value lies outside
# the fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
#
# The fences are computed on the continuous monetary columns only. Applying
# them to every column would also test the 0/1 indicator columns produced by
# the one-hot encoding: whenever such a column has IQR == 0, every row of its
# minority category would be flagged and removed as an "outlier".
# NOTE(review): Loan_Amount_Term is left out as well because it looks like a
# small set of discrete terms rather than a continuous value - confirm.
#
# Alternatives that were tried and rejected:
#   1. Capping: ApplicantIncome > 20000 -> 20000, CoapplicantIncome > 10000 -> 10000
#   2. Robust scaling: sklearn.preprocessing.RobustScaler on
#      ApplicantIncome / CoapplicantIncome
#   3. Replace with mean: values above the same caps replaced by the column mean
#   4. Delete by threshold: keep rows with ApplicantIncome < 20000 and
#      CoapplicantIncome < 10000
continuous_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]
continuous_values = loan_data[continuous_columns].astype(np.float64)

# First and third quartile of each continuous column
Q1 = continuous_values.quantile(0.25)
Q3 = continuous_values.quantile(0.75)

# Calculate the Interquartile Range (IQR)
IQR = Q3 - Q1

# A row is an outlier if any of its continuous values is outside the fences
below_lower_fence = continuous_values < (Q1 - 1.5 * IQR)
above_upper_fence = continuous_values > (Q3 + 1.5 * IQR)
is_outlier_row = (below_lower_fence | above_upper_fence).any(axis=1)

# Removing outliers; .copy() gives later cells an independent frame to modify
loan_data = loan_data.loc[~is_outlier_row].copy()

# printing shape
print(loan_data.shape)


: 

In [None]:
# Square Root Transformation - to normalize the distribution.
# assign() returns a new frame, so the transform never writes into a slice of
# another DataFrame.
sqrt_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]
loan_data = loan_data.assign(
    **{column: np.sqrt(loan_data[column]) for column in sqrt_columns}
)

print(loan_data.shape)

: 

Visualize data distribution after handling outliers

In [None]:
# Create the figure and its Axes at the desired size
fig, ax = plt.subplots(figsize=(15, 10))

# Same columns as in the plot drawn before outlier handling
outlier_column_names = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
outliersColumns = loan_data.get(outlier_column_names)

# One jittered strip of red points per selected column
sns.stripplot(data=outliersColumns, color="red", jitter=0.3, size=5, ax=ax)

# Set the title
ax.set_title("Outliers")

# Show the plot
plt.show()

: 

Another Graph

In [None]:
# Histogram distribution for numerical values

# Use the "flare" palette for every seaborn plot from here on
sns.set_theme(palette="flare")

def plot_distribution(column, title):
    """Draw a histogram with a KDE curve for one column of `loan_data`.

    `column` is the column name (also used as the x label) and `title` is the
    figure title. Note that this redefines the three-argument
    `plot_distribution` from the exploratory section.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data=loan_data, x=column, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

# Plot distribution for each numerical column
distribution_plots = (
    ("ApplicantIncome", "Applicant Income Distribution"),
    ("CoapplicantIncome", "Coapplicant Income Distribution"),
    ("LoanAmount", "Loan Amount Distribution"),
)
for numeric_column, plot_title in distribution_plots:
    plot_distribution(numeric_column, plot_title)

: 

As can be seen, the distributions after applying the square root transformation are much better than the original distributions.

## <font size="4" face="WildWest">5. Handling Duplicates</font>

In [None]:
# List of column names to check for repeated values (numerical columns)
columns_to_check = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Per-column check: how many entries repeat a value seen earlier in that column.
# Plain print() is used because no `colored` helper is imported in this notebook.
for column_name in columns_to_check:
    duplicate_count = loan_data[column_name].duplicated().sum()

    # Output the result with a descriptive message
    if duplicate_count == 0:
        print(f"No duplicate entries found in the {column_name} column.")
    else:
        print(f"Number of duplicate entries found in the {column_name} column: {duplicate_count}")

# Record-level check: rows that are identical to an earlier row in every
# column. Repeated values inside a single column are not duplicate records.
duplicate_row_count = loan_data.duplicated().sum()
print(f"Number of fully duplicated rows: {duplicate_row_count}")

: 

**We don't need to handle duplicate values because it isn't affecting the accuracy or integrity of the analysis or model being performed.**

Data Processing
It involves *preparing* and *transforming raw data* into a suitable format for analysis and **model training**. Effective data processing ensures that the machine learning algorithms can extract meaningful patterns and make accurate predictions, includes:
- Splitting data into "Features" - "Target"
- SMOTE Technique
- Data Re-scaling and Normalizing the features
- Splitting data into "Training" - "Testing" Data

## <font size="4" face="WildWest">1. Splitting data into "Features" - "Target"</font>

In [None]:
# Separate the target from the features.
target_column = 'Loan_Status'

# Target variable "Outputs" (Y)
Y = loan_data[target_column]

# Features "Inputs" (X): every column except the target
X = loan_data.drop(columns=[target_column])

# Print the shapes of X and Y to verify the splitting
for shape_label, part in (("Shape of X:", X), ("Shape of Y:", Y)):
    print(shape_label, part.shape)

: 

## <font size="4" face="WildWest">2. SMOTE Technique</font>
In the previous exploration, it can be seen that the numbers of **accepted** and **rejected** loans are *imbalanced*. SMOTE is an oversampling technique that can be used to reduce the resulting bias toward the majority class. It is not applied in this notebook, because I found that leaving it out gives higher accuracy.

## <font size="4" face="WildWest">3. Data Re-scaling and Normalizing the features</font>
***Min-Max Scaling:*** transforms the features so that they fall within a specified range, typically between 0 and 1.

***Standardization:*** rescales the features so that they have a mean of 0 and a standard deviation of 1; the resulting values are not bounded to a fixed range.

In [None]:
# Rescale the features with Min-Max scaling.
#
# The scaler is fitted on the training rows only and then applied to every
# row, so the minimum/maximum of the test rows never influence the
# preprocessing. Scaled test values may therefore fall slightly outside [0, 1].
#
# NOTE: test_size and random_state here must stay identical to the
# train_test_split call in the splitting cell below. For the same number of
# rows and the same seed, train_test_split selects the same rows, so the rows
# used for fitting are exactly the rows that end up in X_train.
row_positions = np.arange(len(X))
train_rows, _ = train_test_split(row_positions, test_size=0.2, random_state=0)

# Plain float array so rows can be selected by position
X_values = np.asarray(X, dtype=float)

# Alternative to compare during model selection: StandardScaler
# (standardization), used the same way:
#   standard_scaler = StandardScaler()
#   standard_scaler.fit(X_values[train_rows])
#   X = standard_scaler.transform(X_values)

# Min-Max Scaling (Rescaling)
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_values[train_rows])
X = min_max_scaler.transform(X_values)

: 

**Why we use fit_transform() on training data but transform() on the test data?**
    
The fit_transform() method is used on training data to calculate scaling parameters like **mean** and **standard deviation**, then applies scaling. For test data, we apply the same scaling transformation without recalculating parameters, ensuring consistency for fair comparison and accurate evaluation.

## <font size="4" face="WildWest">4. Splitting data into "Training" - "Testing" Data</font>

In [None]:
# Hold out 20% of the rows for testing and keep 80% for training.
# X holds the features and Y the target; random_state=0 fixes the shuffle so
# the split is reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# Print the shapes of the training and testing sets to verify the splitting
shape_report = (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of Y_train:", Y_train),
    ("Shape of Y_test:", Y_test),
)
for shape_label, part in shape_report:
    print(shape_label, part.shape)

: 

Model Selection
Model selection is an essential step in machine learning that involves choosing **the most appropriate model for a given problem**. The goal of model selection is to find a model that generalizes well to unseen data and provides the best performance on the task at hand.
- Importing models for selection process
- Training our models.
    1. Decision Tree Classifier
    2. Random Forest Classifier (Highest Accuracy) 
    3. Naive Bayes
        - GaussianNB (Suitable for continuous data)
        - BernoulliNB (Suitable for binary value)
    4. Logistic Regression 
    5. Ridge Classifier CV
    6. K-Nearest Neighbors (KNN)
- For future searching
    7. Support Vector Classifier
    8. Gradient Boosting Classifier

## <font size="4" face="WildWest">1. Importing models for selection process</font>

In [None]:
from sklearn.tree import DecisionTreeClassifier  # For Decision Tree Classifier Model
from sklearn.ensemble import RandomForestClassifier # For Random Forest Classifier Model
from sklearn.neighbors import KNeighborsClassifier # For K-Nearest Neighbors Model
from sklearn.naive_bayes import GaussianNB,BernoulliNB  # For Gaussian,Bernoulli Naive Bayes Classifier Model
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression # For Ridge Classifier Cross-validated and Logistic Regression Models
from sklearn.svm import SVC  # For Support Vector Classifier Model
from sklearn.ensemble import GradientBoostingClassifier  # For Gradient Boosting Classifier Model



: 

## <font size="4" face="WildWest">2. Training the Machine Learning Algorithms</font>

- Decision Tree Classifier

In [None]:
# Decision Tree Classifier: grid search over max_depth and min_samples_leaf.
#
# Constructor signature, for reference:
#   sklearn.tree.DecisionTreeClassifier(*, criterion='gini',
#                                       splitter='best', max_depth=None,
#                                       min_samples_split=2, min_samples_leaf=1,
#                                       min_weight_fraction_leaf=0.0, max_features=None,
#                                       random_state=None, max_leaf_nodes=None,
#                                       min_impurity_decrease=0.0, class_weight=None,
#                                       ccp_alpha=0.0, monotonic_cst=None)
#
# max_depth         (The maximum depth of the tree, i.e. the number of levels along its longest branch)
# min_samples_split (The minimum number of samples required to split an internal node "2 by default")
# min_samples_leaf  (The minimum number of samples required to be at a leaf node)
# max_features      (The number of features to consider when looking for the best split)

# Accuracy of every candidate, appended in grid order
scoreListDT_Train = []
scoreListDT_Test = []

# Model1 tracks the candidate with the highest test accuracy seen so far, so that
# after the search it is the estimator behind DT_Accuracy_Test instead of merely
# the last grid point that happened to be fitted.
Model1 = None
best_dt_test_score = float("-inf")

# Outer loop: max_depth; inner loop: min_samples_leaf
for depth in range(1, 20):
    for leaf_size in range(1, 5):
        # max_features=2 makes each split look at a random subset of two features;
        # fixing random_state keeps those draws, and hence the scores, repeatable.
        candidate_tree = DecisionTreeClassifier(
            max_depth=depth,
            min_samples_leaf=leaf_size,
            max_features=2,
            random_state=0,
        )

        # Fit the candidate on the training data
        candidate_tree.fit(X_train, Y_train)

        # Score it on both partitions and record the results
        train_score = candidate_tree.score(X_train, Y_train)
        test_score = candidate_tree.score(X_test, Y_test)
        scoreListDT_Train.append(train_score)
        scoreListDT_Test.append(test_score)

        # Strict '>' keeps the earliest candidate when several tie
        if test_score > best_dt_test_score:
            best_dt_test_score = test_score
            Model1 = candidate_tree

# NOTE: the two maxima are taken independently, so they may come from different
# hyperparameter combinations. The test maximum is also chosen using the test
# set itself, which makes it an optimistic estimate of generalization.
DT_Accuracy_Train = max(scoreListDT_Train)
DT_Accuracy_Test = max(scoreListDT_Test)

# Print the best accuracies achieved
print(f"Decision Tree best accuracy (Training): {DT_Accuracy_Train*100:.2f}%")
print(f"Decision Tree best accuracy (Testing): {DT_Accuracy_Test*100:.2f}%")

# Print a success message indicating that the model has been trained successfully
print(colored("The Decision Tree model has been trained successfully","green", attrs=['reverse']))

: 

- Random Forest Classifier

In [None]:
# Random Forest Classifier: grid search over max_depth, random_state and n_estimators.
#
# Constructor signature, for reference:
#   sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini',
#                                           max_depth=None, min_samples_split=2, min_samples_leaf=1,
#                                           min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None,
#                                           min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
#                                           n_jobs=None, random_state=None, verbose=0,
#                                           warm_start=False, class_weight=None, ccp_alpha=0.0,
#                                           max_samples=None, monotonic_cst=None)
#
# n_estimators (The number of trees in the forest)
# max_depth    (The maximum depth of the tree.)
# random_state (Controls both the randomness of the bootstrapping of the samples used when building trees)
#
# Search ranges noted by the original author (presumably earlier vs. current
# settings - TODO confirm):
#   max_dep      ----------> (1, 5),(1, 10)
#   rand_state   ----------> (1, 35),(1, 50)
#   n_est        ----------> (1, 30),(1, 30)

# Accuracy of every candidate, appended in grid order
scoreListRF_Train = []
scoreListRF_Test = []

# Model2 tracks the candidate with the highest test accuracy seen so far, so that
# after the search it is the estimator behind RF_Accuracy_Test instead of merely
# the last grid point (max_depth=9, random_state=49, n_estimators=29).
Model2 = None
best_rf_test_score = float("-inf")

# NOTE: random_state is a seed, not a tuning knob; sweeping it and keeping the
# best test score selects a lucky seed, so the reported maximum is optimistic.
for max_dep in range(1, 10):
    for rand_state in range(1, 50):
        for n_est in range(1, 30):
            candidate_forest = RandomForestClassifier(
                n_estimators=n_est,
                random_state=rand_state,
                max_depth=max_dep,
            )

            # Fit the candidate on the training data
            candidate_forest.fit(X_train, Y_train)

            # Score it on both partitions and record the results
            train_score = candidate_forest.score(X_train, Y_train)
            test_score = candidate_forest.score(X_test, Y_test)
            scoreListRF_Train.append(train_score)
            scoreListRF_Test.append(test_score)

            # Strict '>' keeps the earliest candidate when several tie
            if test_score > best_rf_test_score:
                best_rf_test_score = test_score
                Model2 = candidate_forest

# The two maxima are taken independently and may belong to different candidates
RF_Accuracy_Train = max(scoreListRF_Train)
RF_Accuracy_Test = max(scoreListRF_Test)

# Print the best accuracies achieved
print(f"Random Forest best accuracy (Training): {RF_Accuracy_Train*100:.2f}%")
print(f"Random Forest best accuracy (Testing): {RF_Accuracy_Test*100:.2f}%")

# Print a success message indicating that the model has been trained successfully
print(colored("The Random Forest model has been trained successfully","green", attrs=['reverse']))

: 

- Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes: constructor signature kept below for reference
'''
sklearn.naive_bayes.GaussianNB(*, priors=None, var_smoothing=1e-09)
'''

# var_smoothing: portion of the largest feature variance that is added to all
# variances for calculation stability

# Accuracy of every candidate, in the order the candidates are tried
scoreListGNB_Train, scoreListGNB_Test = [], []

# Try var_smoothing = 10**-1, 10**-2, ..., 10**-8
for exponent in range(1, 9):
    # Build and fit the candidate in one step (fit returns the estimator itself)
    Model3_1 = GaussianNB(var_smoothing=10**(-exponent)).fit(X_train, Y_train)

    # Record the accuracy on the training and testing partitions
    gnb_train_score = Model3_1.score(X_train, Y_train)
    scoreListGNB_Train.append(gnb_train_score)
    gnb_test_score = Model3_1.score(X_test, Y_test)
    scoreListGNB_Test.append(gnb_test_score)

# Best accuracy reached on each partition across all candidates
GNB_Accuracy_Train = max(scoreListGNB_Train)
GNB_Accuracy_Test = max(scoreListGNB_Test)

# Report the best accuracies
for gnb_report_line in (
    f"Gaussian Naive Bayes best accuracy (Training): {GNB_Accuracy_Train*100:.2f}%",
    f"Gaussian Naive Bayes best accuracy (Testing): {GNB_Accuracy_Test*100:.2f}%",
):
    print(gnb_report_line)

# Highlighted confirmation that this section finished
print(colored("The Gaussian Naive Bayes model has been trained successfully","green", attrs=['reverse']))

: 

- Bernoulli Naive Bayes

In [None]:
# Bernoulli Naive Bayes: constructor signature kept below for reference
'''
sklearn.naive_bayes.BernoulliNB(*, alpha=1.0, force_alpha=True, binarize=0.0, fit_prior=True, class_prior=None)
'''

# alpha:       additive (Laplace/Lidstone) smoothing parameter
# force_alpha: if False and alpha is close to zero, alpha is adjusted to 1e-10 to
#              prevent numerical errors; otherwise alpha stays unchanged
# binarize:    threshold for converting sample features to binary values; if None,
#              the input is assumed to be binary already

# Build the model with its default settings and fit it on the training data
Model3_2 = BernoulliNB().fit(X_train, Y_train)

# Accuracy on the training and testing partitions
BNB_Accuracy_Train, BNB_Accuracy_Test = (
    Model3_2.score(X_train, Y_train),
    Model3_2.score(X_test, Y_test),
)

# Report the accuracies
for bnb_report_line in (
    f"Bernoulli Naive Bayes best accuracy (Training): {BNB_Accuracy_Train*100:.2f}%",
    f"Bernoulli Naive Bayes best accuracy (Testing): {BNB_Accuracy_Test*100:.2f}%",
):
    print(bnb_report_line)

# Highlighted confirmation that this section finished
print(colored("The Bernoulli Naive Bayes model has been trained successfully","green", attrs=['reverse']))

: 

- Logistic Regression

In [None]:
# Logistic Regression: grid search over random_state and max_iter with the saga solver.
#
# Constructor signature, for reference:
#   sklearn.linear_model.LogisticRegression(penalty='l2', *,
#                                           dual=False, tol=0.0001,
#                                           C=1.0, fit_intercept=True,
#                                           intercept_scaling=1, class_weight=None,
#                                           random_state=None, solver='lbfgs',
#                                           max_iter=100, multi_class='auto',
#                                           verbose=0, warm_start=False,
#                                           n_jobs=None, l1_ratio=None)
#
# tol          (tolerance allowed on the error, used as the stopping criterion)
# C            (inverse of the regularization strength, as in support vector machines;
#               the smaller the value, the stronger the regularization)
# solver       (the algorithm used to solve the classification problem)
# max_iter     (the maximum number of iterations)
# random_state (the seed used when shuffling the data)
# n_jobs       (controls how fast the process runs: None or 1 is the normal speed,
#               larger values are faster, and -1 is the maximum)
#
# solver:
#   liblinear : small data
#   sag       : big data
#   saga      : big data

# Accuracy of every candidate, appended in grid order
scoreListLR_Train = []
scoreListLR_Test = []

# Model4 tracks the candidate with the highest test accuracy seen so far, so that
# after the search it is the estimator behind LR_Accuracy_Test instead of merely
# the last grid point (random_state=149, max_iter=9).
Model4 = None
best_lr_test_score = float("-inf")

# NOTE(review): max_iter is capped below 10 here, so saga is unlikely to converge
# and scikit-learn will presumably emit a ConvergenceWarning for each fit. The
# sweep over random_state also selects a lucky seed rather than tuning the model.
for seed in range(1, 150):
    for iteration_cap in range(1, 10):
        candidate_lr = LogisticRegression(random_state=seed, solver='saga', max_iter=iteration_cap)

        # Fit the candidate on the training data
        candidate_lr.fit(X_train, Y_train)

        # Score it on both partitions and record the results
        train_score = candidate_lr.score(X_train, Y_train)
        test_score = candidate_lr.score(X_test, Y_test)
        scoreListLR_Train.append(train_score)
        scoreListLR_Test.append(test_score)

        # Strict '>' keeps the earliest candidate when several tie
        if test_score > best_lr_test_score:
            best_lr_test_score = test_score
            Model4 = candidate_lr

# The two maxima are taken independently and may belong to different candidates
LR_Accuracy_Train = max(scoreListLR_Train)
LR_Accuracy_Test = max(scoreListLR_Test)

# Print the best accuracies achieved
print(f"Logistic Regression best accuracy (Training): {LR_Accuracy_Train*100:.2f}%")
print(f"Logistic Regression best accuracy (Testing): {LR_Accuracy_Test*100:.2f}%")

# Print a success message indicating that the model has been trained successfully
print(colored("The Logistic Regression model has been trained successfully","green", attrs=['reverse']))

: 

- Ridge Classifier CV

In [None]:
# Ridge Classifier CV: constructor signature kept below for reference
'''
sklearn.linear_model.RidgeClassifierCV(alphas=(0.1, 1.0, 10.0), *, 
                                       fit_intercept=True, scoring=None, 
                                       cv=None, class_weight=None, 
                                       store_cv_values=False)
'''
# alphas: array of alpha values to try (regularization strength)
# cv:     determines the cross-validation splitting strategy

# Build the model with its default settings and fit it on the training data
Model5 = RidgeClassifierCV().fit(X_train, Y_train)

# Accuracy on the training and testing partitions
RCCV_Accuracy_Train, RCCV_Accuracy_Test = (
    Model5.score(X_train, Y_train),
    Model5.score(X_test, Y_test),
)

# Report the accuracies
for rccv_report_line in (
    f"Ridge Classifier CV best accuracy (Training): {RCCV_Accuracy_Train*100:.2f}%",
    f"Ridge Classifier CV best accuracy (Testing): {RCCV_Accuracy_Test*100:.2f}%",
):
    print(rccv_report_line)

# Highlighted confirmation that this section finished
print(colored("The Ridge Classifier CV model has been trained successfully","green", attrs=['reverse']))

: 

- K-Nearest Neighbors (KNN) 

In [None]:
# K-Nearest Neighbors (KNN): constructor signature kept below for reference
'''
sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, *, 
                                       weights='uniform', algorithm='auto', 
                                       leaf_size=30, p=2, 
                                       metric='minkowski', 
                                       metric_params=None, n_jobs=None)
'''

# n_neighbors: number of neighbors to use by default for kneighbors queries
# metric:      metric to use for distance computation
# n_jobs:      number of parallel jobs to run for the neighbors search

# Candidate values of n_neighbors: 3 through 15
neighbor_counts = range(3, 16)

# Accuracy of every candidate, in the order the candidates are tried
scoreListknn_Train, scoreListknn_Test = [], []

for k in neighbor_counts:
    # Distance-weighted KNN with k neighbors, fitted on the training data
    Model6 = KNeighborsClassifier(n_neighbors=k, weights='distance').fit(X_train, Y_train)

    # Record the accuracy on the training and testing partitions
    scoreListknn_Train.append(Model6.score(X_train, Y_train))
    scoreListknn_Test.append(Model6.score(X_test, Y_test))

# One accuracy-vs-K figure per partition: training first, then testing
for figure_title, figure_scores in (
    ("Training Accuracy vs K value", scoreListknn_Train),
    ("Testing Accuracy vs K value", scoreListknn_Test),
):
    plt.plot(neighbor_counts, figure_scores)
    plt.xticks(np.arange(3, 16, 1))
    plt.title(figure_title)
    plt.xlabel("K value")
    plt.ylabel("Accuracy")
    plt.show()

# Best accuracy reached on each partition across all candidates
KNN_Accuracy_Train = max(scoreListknn_Train)
KNN_Accuracy_Test = max(scoreListknn_Test)

# Report the best accuracies
for knn_report_line in (
    f"KNN best accuracy (Training): {KNN_Accuracy_Train*100:.2f}%",
    f"KNN best accuracy (Testing): {KNN_Accuracy_Test*100:.2f}%",
):
    print(knn_report_line)

# Highlighted confirmation that this section finished
print(colored("The K-Nearest Neighbors (KNN) model has been trained successfully","green", attrs=['reverse']))

: 

Model Evaluation
- Training Score
- Testing Score.
- Choosing the better Model.

## <font size="4" face="WildWest">1. Training Score</font>

In [None]:
# Best training accuracy of each model, keyed by the model's display name
model_train_scores = dict(
    [
        ("Model 1-Decision Tree Classifier", DT_Accuracy_Train),
        ("Model 2-Random Forest Classifier", RF_Accuracy_Train),
        ("Model 3-GaussianNB", GNB_Accuracy_Train),
        ("Model 3-BernoulliNB", BNB_Accuracy_Train),
        ("Model 4-Logistic Regression", LR_Accuracy_Train),
        ("Model 5-Ridge Classifier CV", RCCV_Accuracy_Train),
        ("Model 6-K-Nearest Neighbors (KNN)", KNN_Accuracy_Train),
    ]
)

# One green line per model: name left-aligned in 50 columns, score as a percentage
for label, score in model_train_scores.items():
    train_line = f"{label:<50} Training Score: {score*100}"
    print(colored(train_line, "green"))

: 

## <font size="4" face="WildWest">2. Testing Score</font>

In [None]:
# Best testing accuracy of each model, keyed by the model's display name
model_test_scores = dict(
    [
        ("Model 1-Decision Tree Classifier", DT_Accuracy_Test),
        ("Model 2-Random Forest Classifier", RF_Accuracy_Test),
        ("Model 3-GaussianNB", GNB_Accuracy_Test),
        ("Model 3-BernoulliNB", BNB_Accuracy_Test),
        ("Model 4-Logistic Regression", LR_Accuracy_Test),
        ("Model 5-Ridge Classifier CV", RCCV_Accuracy_Test),
        ("Model 6-K-Nearest Neighbors (KNN)", KNN_Accuracy_Test),
    ]
)

# One green line per model: name left-aligned in 50 columns, score as a percentage
for label, score in model_test_scores.items():
    test_line = f"{label:<50} Testing Score: {score*100}"
    print(colored(test_line, "green"))

: 

## <font size="4" face="WildWest">3. Choosing the better Model.</font>

The most accurate models are models 1, 2 and 4 — **Decision Tree Classifier**, **Random Forest Classifier** and **Logistic Regression** — each with an accuracy of **96.43%**. Any one of them could be chosen for *deployment*; I have chosen the **Random Forest Classifier** model.

# <font color="#4863A0"> Model Deployment 🛠️</font>

In [None]:
import pickle

# File mode reference for open():
# 'r'  - open for reading (default)
# 'w'  - open for writing, truncating the file first
# 'x'  - create a new file and open it for writing
# 'a'  - open for writing, appending to the end of the file if it exists
# 'b'  - binary mode
# 't'  - text mode (default)
# '+'  - open a disk file for updating (reading and writing)
# 'U'  - universal newline mode (deprecated)

# Define the filename for the pickle file
filename = 'model.pkl'

# Serialize the fitted Random Forest estimator (Model2), the model chosen for
# deployment. The previous code pickled the string literal "Model 1", so the
# file contained a piece of text rather than a usable model.
# 'wb' opens the file in binary format for writing; the with-block guarantees
# the handle is flushed and closed even if pickling fails.
# NOTE(review): only unpickle this file from trusted locations; pickle.load can
# execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(Model2, model_file)

: 

: 

: 