In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


In [2]:
# Location of the training split of the fraud-detection dataset (Kaggle input mount).
TRAIN_CSV_PATH = '/kaggle/input/fraud-detection/fraudTrain.csv'

# Load the raw transactions into a DataFrame; every later cell works on `df`.
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Parse the transaction timestamp once, then derive calendar features from it.
transaction_times = pd.to_datetime(df['trans_date_trans_time'])
df['trans_date_trans_time'] = transaction_times
df['year'] = transaction_times.dt.year
df['month'] = transaction_times.dt.month

# Show the column dtypes to confirm the conversion and the two new columns.
print(df.dtypes)


Unnamed: 0                        int64
trans_date_trans_time    datetime64[ns]
cc_num                            int64
merchant                         object
category                         object
amt                             float64
first                            object
last                             object
gender                           object
street                           object
city                             object
state                            object
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                              object
dob                              object
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
year                              int64
month                             int64


In [4]:
# Number of rows carrying each value of the `is_fraud` label.
class_counts = df['is_fraud'].value_counts()

# Express those counts as a fraction of all rows to expose the class imbalance.
class_distribution = class_counts.div(len(df))

print(class_distribution)

0    0.994211
1    0.005789
Name: is_fraud, dtype: float64


In [5]:

# Integer-encode the categorical location columns.
# Each column gets its OWN fitted encoder: reusing a single LabelEncoder and
# refitting it on 'city' overwrites the 'state' mapping, so the state codes
# could no longer be translated back with inverse_transform().
label_encoders = {}
for column in ('state', 'city'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Backward compatibility: the original cell left `label_encoder` fitted on 'city'.
label_encoder = label_encoders['city']

# Last expression -> rendered as the cell output; 'city' and 'state' are now integer columns.
df.dtypes

Unnamed: 0                        int64
trans_date_trans_time    datetime64[ns]
cc_num                            int64
merchant                         object
category                         object
amt                             float64
first                            object
last                             object
gender                           object
street                           object
city                              int64
state                             int64
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                              object
dob                              object
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
year                              int64
month                             int64


In [6]:
# Numeric columns fed to the anomaly detector.
anomaly_inputs = [
    'city_pop', 'amt', 'lat', 'long', 'zip', 'year', 'month',
    'merch_lat', 'merch_long', 'state', 'city',
]

# Feature matrix for the detector and ground-truth labels used only for evaluation.
X = df[anomaly_inputs]
y = df['is_fraud']

# NOTE(review): contamination=0.3 makes the forest flag roughly 30% of rows,
# whereas the class distribution printed above shows ~0.58% fraud — confirm
# this value is intentional before relying on the reported precision.
model_IF = IsolationForest(contamination=0.3, random_state=42)

# Alternative configuration that was tried and left disabled:
# model_IF = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.02,
#                       max_features=.8, bootstrap=True, n_jobs=-1, random_state=42,
#                       verbose=0)


In [7]:
# Fit the Isolation Forest (constructed in the previous cell) on the feature matrix.
model_IF.fit(X)

# Anomaly score per row: the lower the score, the more abnormal the row;
# negative scores are the ones the model regards as outliers.
outlier_scores = model_IF.decision_function(X)

# Scores below this cut-off are labelled as anomalies (predicted fraud).
threshold = 0.0

# Vectorised comparison instead of a Python-level loop over every score:
# yields 1 for anomalies and 0 for normal rows as an integer NumPy array.
predictions = (outlier_scores < threshold).astype(int)

# Per-class precision, recall and F1 against the true fraud labels.
print(classification_report(y, predictions))



              precision    recall  f1-score   support

           0       1.00      0.70      0.82   1289169
           1       0.01      0.75      0.03      7506

    accuracy                           0.70   1296675
   macro avg       0.51      0.73      0.43   1296675
weighted avg       0.99      0.70      0.82   1296675



In [8]:
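# Disabled alternative, kept for reference: IsolationForest.predict() returns -1
# for anomalies and 1 for normal rows, so remapping -1 -> 1 and 1 -> 0 produces
# 0/1 labels comparable with `is_fraud`.
# model_IF.fit(X)

# # Predict the anomalies
# if_prediction = model_IF.predict(X)
# # Change the anomalies' values to make it consistent with the true values
# if_prediction = [1 if i==-1 else 0 for i in if_prediction]
# # Check the model performance
# print(classification_report(y, if_prediction))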