In [521]:
#Importing required Libraries/modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px
import sklearn
#Importing train model and metrices for evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score
#Importing Classifiers that are used
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
#Setting the theme for the charts/visuals
sns.set_theme(context='talk', style='darkgrid', palette = 'bright')

In [522]:
# Importing the csv(From Kaggle: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)
df = pd.read_csv("creditcard.csv") 
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [523]:
#Analysing Stats 
print(df['Amount'].describe().round(2)) 

count    284807.00
mean         88.35
std         250.12
min           0.00
25%           5.60
50%          22.00
75%          77.16
max       25691.16
Name: Amount, dtype: float64


In [524]:
# Mapping Class 0 =>  Genuine, 1 => Fraud
df["Transaction Type"] = df["Class"].map({0: "Genuine", 1: "Fraud"})

scatterPlot = px.scatter(
    df, x="Amount", y=df.index, color=df["Transaction Type"],
    title="Distribution of Transaction Amounts",
    color_discrete_sequence=["#00FFFF", "#FF00FF"]  
)

scatterPlot.update_layout(
    xaxis_title="Transactions Amount (In $)", 
    yaxis_title="Per Transaction",
    template="plotly_dark",
    font=dict(family="Courier New, monospace", size=14, color="white"), 
    title_font=dict(size=20, color="cyan"),
    paper_bgcolor="#111111",  
    plot_bgcolor="#222222"
)

scatterPlot.show()


In [525]:
    #Class Distribution (0 TO 1) as of PieChart, Also for this data Fraud x Genuine
    pieChart = px.pie(df.Class,values = df.Class.value_counts(),
                names=['Genuine', 'Fraud'],
                title='Fraudulent 🆚 Genuine Transactions',
                color_discrete_sequence=["#00FFFF", "#FF00FF"],
                
            )
    pieChart.update_layout(
                template="plotly_dark",  
                title_font=dict(size=20, color="cyan"),  
                font=dict(family="Courier New, monospace", size=14, color="white"),  
                paper_bgcolor="#111111",  
                plot_bgcolor="#222222", 
            )

    pieChart.show()

    #Exact numbers
    df.Class.value_counts()


Class
0    284315
1       492
Name: count, dtype: int64

In [526]:
# More analysis on Fraud transactions
fraud_df = df.query("Class == 1")

scatterPlot = px.scatter(
    fraud_df, x="Amount", y=fraud_df.index,
    title="👎 Distribution of Fraud Transactions",
    color_discrete_sequence=["#00FFFF", "#FF00FF"]  
)

scatterPlot.update_layout(
    xaxis_title="Transactions Amount (In $)", 
    yaxis_title="Per Transaction",
    template="plotly_dark",
    font=dict(family="Courier New, monospace", size=14, color="white"), 
    title_font=dict(size=20, color="cyan"),
    paper_bgcolor="#111111",  
    plot_bgcolor="#222222"
)

scatterPlot.show()


In [527]:
#Dropping time variable
df = df.drop(columns = ['Time'], axis = 1)
#I divided the dataset into the independent variables (X) and the target variable (y)

X = df.drop(columns=['Class'], axis=1)
Y = df.Class


In [528]:
#Visualizing x
X

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Transaction Type
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,Genuine
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,Genuine
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,Genuine
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,Genuine
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,Genuine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,Genuine
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,Genuine
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,Genuine
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,Genuine


In [529]:
#Visualizing Y
Y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [530]:
#Splitting the Data 80:20(Train:Test)
train_x, test_x, train_y, test_y = train_test_split(X,Y, test_size=0.2,  random_state = 123, stratify=Y) 
print("X Train Size" , train_x.shape)
print("X Test Size" ,test_x.shape)
print("Y Train Size" , train_y.shape)
print("Y Test Size" , test_y.shape)

X Train Size (227845, 30)
X Test Size (56962, 30)
Y Train Size (227845,)
Y Test Size (56962,)


In [531]:
#Stopping Data Leakage on training
scaler = StandardScaler()
train_x['Amount'] = scaler.fit_transform(train_x.Amount.values.reshape(-1,1))
train_x

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Transaction Type
136387,-0.973802,0.559524,2.538997,2.398071,-0.976096,2.149748,-0.631164,0.972725,-0.149099,-0.364067,...,-0.000142,0.249886,-0.071418,-0.648040,0.074952,1.401265,-0.041695,0.020135,0.051027,Genuine
214665,2.208105,-0.951627,-2.788394,-1.942618,1.983184,3.131311,-0.848990,0.696902,-0.633813,0.810395,...,0.023947,-0.005671,0.232390,0.711876,0.025019,-0.258152,-0.020661,-0.069854,-0.305379,Genuine
274158,2.081642,-0.015034,-1.309023,0.334923,-0.004871,-1.249462,0.207628,-0.383531,0.636140,0.010102,...,0.250243,0.898228,-0.018674,-0.070288,0.275983,-0.094883,-0.015831,-0.058939,-0.347699,Genuine
101822,-0.484408,0.642081,1.380320,-1.181570,-0.998115,-1.138431,-0.080515,0.184586,-1.753744,-0.080898,...,0.114779,0.130982,0.157172,0.722204,-0.628034,-0.615897,0.039669,0.108707,-0.291911,Genuine
59039,-0.368585,1.125132,1.304311,0.068310,0.101569,-0.994461,0.724027,-0.098055,-0.465908,-0.523562,...,-0.254401,-0.651778,-0.010525,0.347213,-0.142180,0.070202,0.247047,0.100165,-0.333792,Genuine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57347,-0.582681,0.749033,1.481512,1.020133,0.868137,1.958464,0.080095,0.765570,-0.862215,-0.042048,...,0.215740,0.689123,-0.260397,-1.381088,0.008527,0.095466,0.153314,0.051170,-0.342200,Genuine
221840,-1.354936,3.475997,-1.436621,4.216784,1.027361,0.183235,0.783496,-0.039165,-0.199555,3.877435,...,-0.718536,-1.031749,0.149333,-0.827282,-0.340763,-0.000825,0.383560,-0.164467,-0.340964,Genuine
235976,1.759257,-1.265568,-1.185628,-0.483502,-1.098432,-0.801659,-0.773407,-0.062532,-0.176281,0.181638,...,0.396957,0.802033,-0.043247,-0.020043,-0.265369,-0.130350,-0.008780,0.008516,0.401456,Genuine
129087,1.187790,-0.370982,0.910894,0.794605,-1.244136,-0.788377,-0.322724,-0.192188,-0.731266,0.598652,...,-0.547628,-0.986693,0.128609,0.879197,0.350698,-0.564368,0.062486,0.046380,-0.152560,Genuine


In [532]:
#Stopping Data Leakage on testing??
scaler= StandardScaler()
test_x['Amount'] = scaler.fit_transform(test_x.Amount.values.reshape(-1,1))
test_x



Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Transaction Type
24729,1.297282,-1.176191,0.450595,-1.453381,-0.921556,0.816109,-1.238829,0.333164,-1.865569,1.250254,...,-0.070905,0.318808,0.198480,-0.662771,0.021905,-0.098047,0.099753,0.010333,-0.278511,Genuine
124222,1.041323,-0.252484,0.924527,0.379391,-0.548263,0.568597,-0.666701,0.391843,0.247487,-0.027557,...,0.068064,0.169519,0.107589,-0.276678,-0.052496,0.396344,0.012009,0.012486,-0.226439,Genuine
214211,-0.570442,1.137299,-0.601386,-0.458304,0.593223,-0.507870,0.980228,0.359346,-0.514617,-0.554865,...,0.163359,0.487744,-0.092854,0.766815,-0.065340,-0.268976,0.258155,0.176732,-0.197546,Genuine
64868,1.232836,-0.187215,0.246974,0.601609,-0.320217,0.155374,-0.349593,0.181849,0.646119,0.000201,...,-0.106708,-0.171977,-0.188017,-0.477295,0.613008,0.477029,-0.031343,-0.010443,-0.319237,Genuine
116182,-0.751012,0.281865,2.937944,1.124178,-0.549099,-0.313871,0.108175,0.018034,0.316367,-0.267872,...,0.174580,0.619169,-0.230472,0.688505,0.239638,-0.169139,-0.065851,-0.125541,-0.210959,Genuine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260463,-1.132954,-0.152772,2.264922,-2.289108,-0.852749,1.855496,-1.445714,1.171690,0.187240,-0.688716,...,0.262217,1.055061,-0.596807,-0.254867,0.879284,0.091643,0.219280,0.045171,-0.340106,Genuine
154446,0.054681,1.200326,-1.353828,-0.319962,1.176937,-0.049571,0.336115,0.386296,0.974510,-1.401985,...,0.156120,0.579496,-0.211421,-0.513745,-0.262378,-0.175571,-0.046089,0.018901,-0.308539,Genuine
160659,1.945615,0.233408,-2.708821,1.382179,1.199875,-0.561540,0.691269,-0.259857,0.290790,-0.367882,...,-0.052585,-0.136515,-0.127208,-0.118055,0.520093,-0.483740,-0.020506,-0.019040,-0.050122,Genuine
134551,1.277188,-0.694899,0.194248,-1.702634,-0.903516,-0.523666,-0.487222,-0.040520,-0.252517,-0.037524,...,-0.319925,-0.369215,0.024968,-0.012170,0.415641,-0.727509,0.087082,0.021185,-0.243661,Genuine


In [533]:
Y.value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [534]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns in train_x and test_x
le = LabelEncoder()

for col in train_x.select_dtypes(include=['object']).columns:
    train_x[col] = le.fit_transform(train_x[col])

for col in test_x.select_dtypes(include=['object']).columns:
    test_x[col] = le.transform(test_x[col])  # Use same encoder as train_x

# Encode train_y and test_y directly (since they are Series)
train_y = le.fit_transform(train_y)
test_y = le.transform(test_y)

# Verify encoding
print("Unique values in train_y:", set(train_y))
print("Unique values in test_y:", set(test_y))


Unique values in train_y: {np.int64(0), np.int64(1)}
Unique values in test_y: {np.int64(0), np.int64(1)}


In [535]:

#Balancing the data because the fraud is too less
from imblearn.over_sampling import SMOTE
train_x, train_y = SMOTE(random_state = 42).fit_resample(train_x,train_y) 


In [536]:
#Finally Random Forest classifier

random_forest = RandomForestClassifier(n_estimators = 100, random_state = 123)
random_forest.fit(train_x,train_y)
y_predictions_rf = random_forest.predict(test_x)

# Applying Decision Tree Classifier 

decision_tree = DecisionTreeClassifier(random_state = 123)
decision_tree.fit(train_x,train_y)
 
y_predictions_dt = decision_tree.predict(test_x)

# Applying Ada Boost Classifier 
ada_boost = AdaBoostClassifier(n_estimators = 100, random_state = 123)
ada_boost.fit(train_x,train_y)
 
y_predictions_ab = ada_boost.predict(test_x)

# Applying Gradient Boosting Classifier 
gradient_boosting = GradientBoostingClassifier(n_estimators = 100, random_state = 123)
gradient_boosting.fit(train_x,train_y)
 
y_predictions_gb = gradient_boosting.predict(test_x)



In [537]:
#checking the Matrices for Random Forest

metrics = [['Accuracy',(accuracy_score(test_y, y_predictions_rf))],
           ['Precision',precision_score(test_y, y_predictions_rf)],
           ['Recall', recall_score(test_y, y_predictions_rf)],
           ['F1_score',f1_score(test_y, y_predictions_rf)]]

metrics_df = pd.DataFrame(metrics, columns = ['Metrics', 'Results'])
metrics_df

Unnamed: 0,Metrics,Results
0,Accuracy,1.0
1,Precision,1.0
2,Recall,1.0
3,F1_score,1.0


In [538]:

#checking the Matrices for Dicision Tree Classifier

metrics = [['Accuracy',(accuracy_score(test_y, y_predictions_dt))],
           ['Precision',precision_score(test_y, y_predictions_dt)],
           ['Recall', recall_score(test_y, y_predictions_dt)],
           ['F1_score',f1_score(test_y, y_predictions_dt)]]

metrics_df = pd.DataFrame(metrics, columns = ['Metrics', 'Results'])
metrics_df

Unnamed: 0,Metrics,Results
0,Accuracy,1.0
1,Precision,1.0
2,Recall,1.0
3,F1_score,1.0


In [539]:

#checking the Matrices for Ad Boost Classifier


metrics = [['Accuracy',(accuracy_score(test_y, y_predictions_ab))],
           ['Precision',precision_score(test_y, y_predictions_ab)],
           ['Recall', recall_score(test_y, y_predictions_ab)],
           ['F1_score',f1_score(test_y, y_predictions_ab)]]

metrics_df = pd.DataFrame(metrics, columns = ['Metrics', 'Results'])
metrics_df

Unnamed: 0,Metrics,Results
0,Accuracy,1.0
1,Precision,1.0
2,Recall,1.0
3,F1_score,1.0


In [540]:

#checking the Matrices for Gradient Boosting Classifier


metrics = [['Accuracy',(accuracy_score(test_y, y_predictions_gb))],
           ['Precision',precision_score(test_y, y_predictions_gb)],
           ['Recall', recall_score(test_y, y_predictions_gb)],
           ['F1_score',f1_score(test_y, y_predictions_gb)]]

metrics_df = pd.DataFrame(metrics, columns = ['Metrics', 'Results'])
metrics_df

Unnamed: 0,Metrics,Results
0,Accuracy,1.0
1,Precision,1.0
2,Recall,1.0
3,F1_score,1.0
