In [1]:
!pip install pyod

Collecting pyod
  Downloading pyod-1.1.2.tar.gz (160 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/160.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m92.2/160.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-1.1.2-py3-none-any.whl size=190289 sha256=a7020bc068c2eb553286ff575d6aed51c385248d4dd9e40eb1737023ea24373e
  Stored in directory: /root/.cache/pip/wheels/81/1b/61/aa85b78c3c0c8871f4231e3f4a03bb23cecb7db829498380ee
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.1.2


In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/Datasets/255-Datasets/Clustering/CreditCard

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Datasets/255-Datasets/Clustering/CreditCard


In [3]:
%ls

cc_info.csv  transactions.csv


In [4]:
# Import necessary libraries
import pandas as pd
from pyod.models.iforest import IForest
from sklearn.preprocessing import StandardScaler

# Load your datasets
cc_info_path = 'cc_info.csv'
transactions_path = 'transactions.csv'

cc_info_df = pd.read_csv(cc_info_path)
transactions_df = pd.read_csv(transactions_path)

In [5]:
cc_info_df.head()

Unnamed: 0,credit_card,city,state,zipcode,credit_card_limit
0,1280981422329509,Dallas,PA,18612,6000
1,9737219864179988,Houston,PA,15342,16000
2,4749889059323202,Auburn,MA,1501,14000
3,9591503562024072,Orlando,WV,26412,18000
4,2095640259001271,New York,NY,10001,20000


In [6]:
transactions_df.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719


In [7]:
print(str(cc_info_df.shape) + " : " + str(transactions_df.shape))

(984, 5) : (294588, 5)


# Preprocessing


In [8]:
# Preprocessing
# Convert 'date' to datetime and extract hour and day of the week
transactions_df['date'] = pd.to_datetime(transactions_df['date'])
transactions_df['hour'] = transactions_df['date'].dt.hour
transactions_df['day_of_week'] = transactions_df['date'].dt.dayofweek
transactions_df.drop(['date'], axis=1, inplace=True)

# Standardize the numerical values
scaler = StandardScaler()
scaled_features = scaler.fit_transform(transactions_df[['transaction_dollar_amount', 'Long', 'Lat', 'hour', 'day_of_week']])
scaled_transactions_df = pd.DataFrame(scaled_features, columns=['amount_scaled', 'long_scaled', 'lat_scaled', 'hour_scaled', 'day_of_week_scaled'])

# Modeling

In [9]:
# Isolation Forest for anomaly detection
iforest = IForest(contamination=0.01)  # adjust contamination as needed
iforest.fit(scaled_transactions_df)

# Predict anomalies
predictions = iforest.predict(scaled_transactions_df)
transactions_df['anomaly'] = predictions

# Output anomalies
anomalies = transactions_df[transactions_df['anomaly'] == 1]
print(anomalies.head())
print(f"Total anomalies detected: {anomalies.shape[0]}")



          credit_card  transaction_dollar_amount        Long        Lat  hour  \
128  1003715054175576                      87.77  102.032338 -32.343578     0   
141  1003715054175576                      53.31   -7.373733 -11.785999    20   
152  1003715054175576                     888.04  -80.231908  40.297183     1   
226  1003715054175576                     859.31  -80.202474  40.221667    15   
306  1013870087888817                      40.52  110.386103  52.238635     0   

     day_of_week  anomaly  
128            6        1  
141            4        1  
152            6        1  
226            0        1  
306            1        1  
Total anomalies detected: 2946


The anomalies are detected based on several factors like the transaction amount, time of the transaction, and the location of the transaction. The high transaction amounts at unusual hours or the transactions occurring at geographically unusual coordinates are some of the patterns that might have led the model to identify these as anomalies.

Given the large number of anomalies detected, it may be beneficial to review the contamination parameter (set at 1% in this case) or to investigate these transactions further to determine if they are indeed fraudulent or just outliers due to other reasons. This kind of analysis is crucial in real-world scenarios to minimize false positives while effectively detecting fraudulent activities.