In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df = pd.read_csv('Findata.csv')
df = df.dropna()
print(df.head())
print(df.shape) 
print(df.describe().round(2))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
(1048575, 11)
             step       amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
count  104

In [9]:
# Convert categorical data to numeric values using target encoding
encoder = ce.TargetEncoder(cols=['type', 'nameOrig', 'nameDest'])
df = encoder.fit_transform(df, df['isFraud'])

# Normalize numerical columns
scaler = StandardScaler()
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [10]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets for supervised learning
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape) #(838860, 10)
print(X_test.shape) #(209715, 10)

(838860, 10)
(209715, 10)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Supervised Learning: Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [20]:
# Evaluate the supervised learning model
print("Supervised Learning - Random Forest Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Supervised Learning - Random Forest Classifier
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    209715

    accuracy                           1.00    209715
   macro avg       1.00      1.00      1.00    209715
weighted avg       1.00      1.00      1.00    209715



In [21]:
# Unsupervised Learning: K-Means Clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(df.drop(columns=[df.columns[-1]], errors='ignore'))

# Display the first few rows of the dataset with cluster labels
print(df.head())

       step      type    amount   nameOrig  oldbalanceOrg  newbalanceOrig  \
0 -1.662022 -0.612335 -0.561738  -0.033018      -0.236855       -0.243832   
1 -1.662022 -0.612335 -0.591840  -0.033018      -0.286956       -0.290673   
2 -1.662022  3.042916 -0.598194  30.285174      -0.294045       -0.297117   
3 -1.662022  0.257418 -0.598194  30.285174      -0.294045       -0.297117   
4 -1.662022 -0.612335 -0.554837  -0.033018      -0.280123       -0.287183   

   nameDest  oldbalanceDest  newbalanceDest    isFraud  isFlaggedFraud  \
0 -0.022779       -0.425883       -0.461062  -0.033019             0.0   
1 -0.022779       -0.425883       -0.461062  -0.033019             0.0   
2  6.576403       -0.425883       -0.461062  30.285174             0.0   
3  6.546556       -0.416661       -0.461062  30.285174             0.0   
4 -0.022779       -0.425883       -0.461062  -0.033019             0.0   

   Cluster  
0        0  
1        0  
2        1  
3        1  
4        0  


In [22]:
df.to_csv('data_with_clusters.csv', index=False)