In [None]:
%load_ext autoreload
%autoreload 2

Loading Modules

In [None]:
import pandas as pd
import numpy as np
from mlproject.components import load_data, DataExplore, DataProcess, DataVisualize, save_to_csv
from mlproject.model import logistic_regression, decision_tree, \
knn, random_forest, xgboost , split_data, auto_encoder
from typing import Union, List
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

Loading dataset

In [None]:
data = load_data(file_path="Dataset/creditcard_2023.csv")

check dataset info

In [None]:
data.shape

In [None]:
data.columns

Finding Numerical Columns

In [None]:
numerical = data.select_dtypes(include="number")

Count Duplicated rows

In [None]:
data.duplicated(keep="last").sum()

To drop duplicate rows if you find any

In [None]:
#data.drop_duplicates()

Check Null Values

In [None]:
data.isnull().sum().sum()

A faster way to check whether the dataset contains any null value

In [None]:
# Presence check only: True if at least one cell in the frame is null.
# (The previous trailing .sum() merely converted this single boolean to 0/1;
# use data.isnull().sum().sum() when the actual count is needed.)
data.isnull().values.any()

To Fill Null values

In [None]:
# To fill row with Mean or Median
def fill_null(data: pd.DataFrame, method: str = "mean") -> pd.DataFrame:
    """Fill missing values of ``data`` in place and return the same frame.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to fill. It is modified in place, so callers that ignore the
        return value still observe the filled values.
    method : {"mean", "median", "zero"}, default "mean"
        * ``"zero"``   - replace every null (in any column) with 0.
        * ``"mean"``   - replace nulls in each numeric column with that
          column's mean.
        * ``"median"`` - replace nulls in each numeric column with that
          column's median.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously any
        unknown value silently fell back to the median.
    """
    if method == "zero":
        print("using zero")
        data.fillna(value=0, inplace=True)
        return data

    if method not in ("mean", "median"):
        raise ValueError(
            f"Unknown method {method!r}; expected 'mean', 'median' or 'zero'"
        )

    print(f"using {method}")
    # Mean/median are only defined for numeric columns; other columns are
    # left untouched instead of raising inside the aggregation.
    for col in data.select_dtypes(include="number").columns:
        fill_value = data[col].mean() if method == "mean" else data[col].median()
        data[col] = data[col].fillna(value=fill_value)
    return data

#fill_null(data=data, method="zero")

Describe the dataset

In [None]:
# The id column is only a row identifier, so it is removed before analysis.
# Reassigning (instead of inplace=True) and ignoring a missing column keeps
# this cell safe to re-run: a second run no longer raises KeyError.
data = data.drop(columns=["id"], errors="ignore")

In [None]:
data.describe()

Check Outliers & Visualize it

Method to find Outliers:
* Z- Score
* Quantile filter
* IQR - fences at Q1 − k·IQR and Q3 + k·IQR (distance from the quartiles, not from the median); used in the example below with k = 3

In [None]:
def check_outliers(
    data: pd.DataFrame,
    exclude: Union[List[str], tuple] = ("Amount", "Class"),
    iqr_factor: float = 3,
    z_threshold: float = 3,
) -> tuple:
    """Report per-column outliers and return a copy with IQR outliers set to NaN.

    For every numeric column not listed in ``exclude`` two outlier rules are
    evaluated:

    * IQR fences: values below ``Q1 - iqr_factor * IQR`` or above
      ``Q3 + iqr_factor * IQR``. These values are replaced with NaN in the
      returned frame.
    * Z-score: values whose absolute z-score (population standard deviation)
      exceeds ``z_threshold``. These are only counted, never replaced.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is not modified; the function works on a copy.
    exclude : list or tuple of str, default ("Amount", "Class")
        Columns to leave out of the check. Names that are not present in
        ``data`` are ignored.
    iqr_factor : float, default 3
        Multiplier applied to the IQR to build the fences.
    z_threshold : float, default 3
        Absolute z-score above which a value counts as a z-score outlier.

    Returns
    -------
    (outliers, data) : tuple
        ``outliers`` is a list with one row per checked column, laid out as
        ``[column, n_rows, n_iqr_outliers, "num:", pct_iqr_outliers, "%",
        n_zscore_outliers, lower_bound, upper_bound]`` (percentages and bounds
        rounded to 3 decimals). ``data`` is the copy with IQR outliers set
        to NaN.
    """
    # Work on a copy so that a function named "check" does not silently
    # overwrite the caller's frame.
    data = data.copy()

    if isinstance(exclude, str):
        exclude = [exclude]
    feature_cols = (
        data.select_dtypes(include="number")
        .drop(columns=list(exclude), errors="ignore")
        .columns
    )

    outliers = []
    for col in feature_cols:
        series = data[col]
        n_rows = len(series)

        # IQR fences
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_factor * iqr
        upper_bound = q3 + iqr_factor * iqr

        # Z-score count. ddof=0 is the population standard deviation; a zero
        # or undefined deviation means no value can exceed the threshold.
        std = series.std(ddof=0)
        if std > 0:
            z_score = (series - series.mean()) / std
            num_zscore = int((z_score.abs() > z_threshold).sum())
        else:
            num_zscore = 0

        outlier_mask = (series < lower_bound) | (series > upper_bound)
        num_outliers = int(outlier_mask.sum())
        percent_outliers = num_outliers / n_rows * 100 if n_rows else 0.0

        # Only the IQR outliers are blanked out; all statistics above were
        # computed before this assignment.
        data.loc[outlier_mask, col] = np.nan

        outliers.append([
            col,
            n_rows,
            num_outliers,
            "num:",
            round(percent_outliers, 3),
            "%",
            num_zscore,
            round(lower_bound, 3),
            round(upper_bound, 3),
        ])
    return outliers, data

# Keep the per-column report under a real name: assigning to `_` would shadow
# IPython's last-output variable and throws the report away.
outlier_report, data = check_outliers(data=data)

#plt.figure(figsize=(10,8)) # widthx Height
#sns.boxplot(data=data.drop(columns= ["Amount", "Class"])) # ignore Nan values 
#plt.show()

Dropping all rows that contain NaN

In [None]:
data.dropna(inplace=True)

In [None]:
# Boxplot of every feature column (Amount and Class excluded) after the
# outlier rows were removed.
plt.figure(figsize=(10,8)) # (width, height) of the figure
sns.boxplot(data=data.drop(columns= ["Amount", "Class"])) # rows with NaN were already dropped in the previous cell
plt.show()

Feature scaling (standardization to mean 0 and standard deviation 1)
* Note: Before scaling, let's check data distribution using Histogram

In [None]:
# Feature columns to plot: everything except the raw amount and the label
numerical_data = data.drop(columns=["Amount", "Class"])
feature_cols = list(numerical_data.columns)

# Grid layout: fixed number of columns, as many rows as needed (ceil division);
# at least one row so that plt.subplots never receives a 0-row grid.
num_cols = 3
num_rows = max(1, -(-len(feature_cols) // num_cols))

# Set up the figure with subplots and flatten the Axes array for iteration
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One histogram (with KDE overlay) per feature column
for ax, col in zip(axes, feature_cols):
    sns.histplot(data[col], bins=10, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the unused trailing axes. Slicing by the number of plotted columns
# avoids depending on a loop variable that only exists if the loop ran.
for ax in axes[len(feature_cols):]:
    ax.axis('off')

# Adjust layout to avoid overlap
plt.tight_layout()
plt.show()

Applying StandardScaler (standardization)

In [None]:
# Standardize every feature column; Amount and Class are left unscaled here.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so statistics of the test
# rows influence the scaled training features. Consider fitting on the
# training split only.
standard_scaler = StandardScaler()
feature_cols = data.drop(columns=["Amount", "Class"]).columns

# Defensive mean-imputation of any remaining nulls (column-wise means),
# matching the per-column fill of the original loop.
features = data[feature_cols].fillna(data[feature_cols].mean())

# StandardScaler standardizes each column independently, so a single fit over
# the whole feature block yields the same values as fitting column by column,
# and leaves `standard_scaler` fitted on all features instead of the last one.
scaled_data = standard_scaler.fit_transform(features)
data[feature_cols] = scaled_data

In [None]:
# Replace Amount with log(1 + Amount).
# NOTE(review): this overwrites the column, so running the cell a second time
# applies the transform twice - re-run the notebook from the top instead.
data['Amount'] = np.log1p(data['Amount'])

In [None]:
# Distribution of every column except the label, drawn on a grid that is
# sized from the number of columns (6 per row) rather than a fixed 5x6 layout,
# so the cell keeps working if columns are added.
plot_cols = data.drop(columns=["Class"]).columns
grid_cols = 6
grid_rows = max(1, -(-len(plot_cols) // grid_cols))  # ceil division

plt.figure(figsize=(20, 16))
for i, col in enumerate(plot_cols):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.histplot(data[col], kde=True, bins=30)
    plt.title(f"Distribution of {col}")

plt.tight_layout()
plt.show()

checking class values

In [None]:
# Number of rows per label value in the cleaned data; used to judge class
# balance before the stratified split further down.
class_counts = data['Class'].value_counts()

print("Class distribution:")
print(class_counts)

Check corr matrix

In [None]:
# Heatmap of the pairwise correlation matrix of all remaining columns
# (features, Amount and Class).
plt.figure(figsize=(10,10))
sns.heatmap(data=data.corr())
plt.show()

In [None]:
data.isnull().sum().sum()

Split and train the model

In [None]:
# Features are every column except the label; the label is the Class column.
X = data.drop(columns = ["Class"])
y = data["Class"]
# 80/20 split with a fixed seed for reproducibility; stratify=y keeps the
# class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify=y)

In [None]:
# Call models
results = {
    "Logistic Regression": logistic_regression(X_train, y_train, X_test, y_test),
    "Decision Tree": decision_tree(X_train, y_train, X_test, y_test),
    "KNN": knn(X_train, y_train, X_test, y_test),
    "Random Forest": random_forest(X_train, y_train, X_test, y_test),
    "XGBoost": xgboost(X_train, y_train, X_test, y_test),
}

results

In [None]:
from xgboost import XGBClassifier

# Standalone XGBoost fit on the training split; its test-set predictions
# (y_pred) feed the metric and confusion-matrix cells below.
# The deprecated `use_label_encoder` argument is no longer passed: recent
# XGBoost releases ignore it and emit a warning about an unused parameter.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the standalone XGBoost predictions (y_pred) against the held-out
# labels (y_test).
Accuracy = accuracy_score(y_test, y_pred)
Precision =  precision_score(y_test, y_pred)
Recall = recall_score(y_test, y_pred)
F1Score = f1_score(y_test, y_pred)

# Printed in the order: accuracy, precision, recall, F1
print(Accuracy, Precision, Recall, F1Score)


In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the standalone XGBoost predictions on the test split
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix using Seaborn heatmap
# (integer cell annotations, no colour bar; rows are true labels and columns
# are predicted labels, as named by the axis labels below)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, 
            xticklabels=["Class 0", "Class 1"], 
            yticklabels=["Class 0", "Class 1"])

# Add labels and title
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()


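How to read the confusion matrix (example values):

|            | Predicted 0 | Predicted 1 |
|------------|-------------|-------------|
| **True 0** | TN: 100     | FP: 10      |
| **True 1** | FN: 20      | TP: 90      |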

* Random Forest and XGBoost outperform the other models in all metrics, particularly in precision and recall. They are both strong candidates if you are looking for the most reliable models.
* KNN and Decision Tree also perform very well, with KNN having perfect recall, but slightly lower precision than Random Forest and XGBoost.
* Logistic Regression, while still very good, has the lowest performance of all models, particularly in recall.

* Based on these results, Random Forest and XGBoost are the top-performing models in this scenario.

Findings:
* All are Numerical Columns with shape (568630, 31)
* All Numerical columns, No Categorical
* All are int64 and Float64, No Object type Numerical values
* No duplicate rows & Null values
* The `id` column is just a row number and is not useful for model training; 30 columns remain after dropping it.
* The heatmap does not provide much information; I did not find strong correlations among the columns.
* Counted outliers per column with both the 3-sigma (z-score) rule and 3×IQR fences, and visualized them with a Seaborn boxplot.
* Replaced the values outside the 3×IQR fences with NaN and then dropped every row that contained a NaN.
* Based on the histograms, the features look approximately normally distributed.
* Applied feature scaling with StandardScaler because the distributions look approximately normal.