In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import datetime as dt
from datetime import datetime
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv')

In [3]:
# Extract the desired numerical features from the datetime column
# df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
# df['year'] = df['trans_date_trans_time'].dt.year
# df['month'] = df['trans_date_trans_time'].dt.month

# # print(df.columns)
# df.head(1)
# print(df.dtypes)


In [4]:
# Count the occurrences of each class
class_counts = df['is_fraud'].value_counts()

# Calculate the class distribution
class_distribution = class_counts / len(df)

print(class_distribution)

0    0.994211
1    0.005789
Name: is_fraud, dtype: float64


In [5]:

# label_encoder = LabelEncoder()
# df['state'] = label_encoder.fit_transform(df['state'])
# df['city'] = label_encoder.fit_transform(df['city'])
# df['category'] = label_encoder.fit_transform(df['category'])
# df.dtypes

In [6]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Extract hour, day and month into separate columns
df.insert(loc=1, column='hour', value=df['trans_date_trans_time'].dt.hour)
df.insert(loc=2, column='day', value=df['trans_date_trans_time'].dt.day)
df.insert(loc=3, column='month', value=df['trans_date_trans_time'].dt.month)

df['amt_log'] = np.log(df['amt'])
df['city_pop_log'] = np.log(df['city_pop'])
# Converting dob to age and applying log transformation on age
df['dob'] = pd.to_datetime(df['dob'])  # Convert 'dob' column to datetime format
current_date = datetime.now()  # Get the current date

df['age'] = (current_date - df['dob']).dt.days // 365

df['age'] = np.log(df['age'])

# Create an instance of OneHotEncoder
encoder = LabelEncoder()
# Fit and transform the 'category' column
df['category_enc'] = encoder.fit_transform(df[['category']])
df['gender_enc'] = encoder.fit_transform(df[['gender']])
df['state_enc'] = encoder.fit_transform(df[['state']])

df.head()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0.1,Unnamed: 0,hour,day,month,trans_date_trans_time,cc_num,merchant,category,amt,first,...,unix_time,merch_lat,merch_long,is_fraud,amt_log,city_pop_log,age,category_enc,gender_enc,state_enc
0,0,0,1,1,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,...,1325376018,36.011293,-82.048315,0,1.60342,8.159089,3.555348,8,0,27
1,1,0,1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,...,1325376044,49.159047,-118.186462,0,4.674976,5.003946,3.806662,4,0,47
2,2,0,1,1,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,...,1325376051,43.150704,-112.154481,0,5.394127,8.331827,4.110874,0,1,13
3,3,0,1,1,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,...,1325376076,47.034331,-112.561071,0,3.806662,7.569928,4.025352,2,1,26
4,4,0,1,1,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,...,1325376186,38.674999,-78.632459,0,3.736717,4.59512,3.610918,9,1,45


In [7]:
# Create a StandardScaler object
scaler = StandardScaler()

col_to_standardize = ['lat', 'long', 'merch_lat', 'merch_long']
# Fit the scaler to your dataframe
scaler.fit(df[col_to_standardize])

# Convert the transformed array back to a dataframe
df[col_to_standardize] = scaler.transform(df[col_to_standardize])

In [8]:
# Balancing the imbalanced dataset
normal_samples = df[df['is_fraud'] == 0].sample(n=20000, random_state=42)

anomaly_samples = df[df['is_fraud'] == 1].sample(n=5000, random_state=42)

dataset = pd.concat([normal_samples, anomaly_samples], ignore_index=True)

In [9]:
X = dataset.drop(['Unnamed: 0',	'trans_date_trans_time', 'cc_num', 'merchant',
                           'category', 'amt', 'first', 'last', 'gender', 'street', 'city',
                           'state', 'zip', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'is_fraud'], axis=1)

y = dataset['is_fraud']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

X_train.head()

Unnamed: 0,hour,day,month,lat,long,merch_lat,merch_long,amt_log,city_pop_log,age,category_enc,gender_enc,state_enc
19372,11,7,3,-0.024729,1.08621,0.024786,1.034536,4.600057,6.57647,3.637586,4,0,20
11301,18,9,6,0.203845,-0.000841,0.092351,-0.002431,4.30959,7.321189,2.995732,11,1,14
22903,3,14,10,0.053445,0.832086,0.175531,0.885316,6.94373,6.760415,3.912023,9,0,45
7549,11,30,3,-1.601917,-0.066899,-1.553571,-0.046316,5.277145,12.845051,3.828641,4,0,18
9481,16,31,3,0.420678,1.213006,0.468002,1.162193,2.498974,10.448599,3.951244,12,0,34


In [11]:
# anomaly_inputs = ['city_pop', 'amt','zip','year','month','merch_lat','merch_long','state','city','category']
# # Split the dataset into features (X) and labels (y)
# X = df[anomaly_inputs]  # Features
# y = df['is_fraud']  # Labels
# model_IF = IsolationForest(contamination=float(0.3),random_state=42)



In [12]:
model_IF = IsolationForest(contamination=float(0.3),random_state=42)

# Create and fit the Isolation Forest model
model_IF.fit(X_train)

# Generate outlier scores for instances
outlier_scores = model_IF.decision_function(X_test)

# Define a threshold to classify instances as anomalies or normal data
threshold = 0

# Convert outlier scores to binary labels based on the threshold
predictions = [1 if score < threshold else 0 for score in outlier_scores]

# Calculate precision, recall, and F1-score
print(classification_report(y_test, predictions))



              precision    recall  f1-score   support

           0       0.84      0.71      0.77      6032
           1       0.27      0.44      0.34      1468

    accuracy                           0.66      7500
   macro avg       0.56      0.58      0.55      7500
weighted avg       0.73      0.66      0.69      7500



In [13]:

# # Predict the anomalies
# if_prediction = model_IF.predict(X_test)
# # Change the anomalies' values to make it consistent with the true values
# if_prediction = [1 if i==-1 else 0 for i in if_prediction]
# # Check the model performance
# print(classification_report(y_test, if_prediction))