In [1]:
import pandas as pd

In [2]:
# Load the first 20,000 rows of the transactions CSV.
# The path uses forward slashes: they resolve on Windows and POSIX alike, and
# they avoid the invalid "\c" escape that a backslash-separated, non-raw
# string literal would contain.
df = pd.read_csv("Data/credit_card_transactions-ibm_v2.csv", nrows=20000)

In [3]:
# Build a per-card key of the form "<User>_<Card>" from the two id columns.
df["card_id"] = df["User"].astype(str).str.cat(df["Card"].astype(str), sep="_")

In [4]:
# One row per distinct card_id, indexed by that id, holding a single constant
# column of ones (presumably a placeholder node feature — TODO confirm).
# The distinct ids are computed once and reused for both the row count and
# the index instead of scanning the column twice.
unique_card_ids = df["card_id"].unique()
client_node_data = pd.DataFrame([1] * len(unique_card_ids)).set_index(unique_card_ids)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [None]:
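# The full dataset has about 24 million rows (24,386,900).
# Distinct values per column:
#   Merchant Name  : 100,343 unique merchants
#   MCC            : 109 distinct MCC values
#   Merchant City  : 13,429 distinct merchant cities
#   Merchant State : 223 distinct state values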

In [4]:
# (n_rows, n_columns) of `df`.
# NOTE(review): the saved output for this cell, (24386900, 15), does not match
# the nrows=20000 load earlier in the notebook — it looks like it came from a
# full-file run; re-run on a fresh kernel to refresh it.
df.shape

(24386900, 15)

In [12]:
# Count missing values in each column (sum of the boolean NA mask down the rows).
df.isna().sum(axis=0)

User                     0
Card                     0
Year                     0
Month                    0
Day                      0
Time                     0
Amount                   0
Use Chip                 0
Merchant Name            0
Merchant City            0
Merchant State     2720821
Zip                2878135
MCC                      0
Errors?           23998469
Is Fraud?                0
dtype: int64

In [13]:
# Print "<column> <number of distinct values>" for every column of `df`.
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

User 2000
Card 9
Year 30
Month 12
Day 31
Time 1440
Amount 98953
Use Chip 3
Merchant Name 100343
Merchant City 13429
Merchant State 223
Zip 27321
MCC 109
Errors? 23
Is Fraud? 2


In [7]:
from src.ingest_data import IngestData
from src.data_cleaning import DataPreprocessStrategy
from src.making_graph import HeteroGraphStrategy
from src.model_dev import HeteroGnnClassificationModel
from src.visualize import NodeRepresentation

In [8]:
# Declarative description of the graph, passed to
# HeteroGraphStrategy.get_graph() together with the preprocessed frame.
# The meaning of each key is defined by that project class; the per-key notes
# below are assumptions to confirm against it.
graph_def_dict = {
    # Name of the label column.
    "target": "Is Fraud?",
    # presumably the feature columns for "transaction" nodes — TODO confirm.
    # NOTE(review): "Hour" and "Minute" are not columns of the raw CSV, so they
    # are presumably derived during preprocessing — verify.
    "transaction": ["Use Chip", "Merchant City", "Errors?", "Amount", "Hour", "Minute"],
    # presumably one entry per additional node type, mapped to its feature
    # list — TODO confirm what the 'one' entry stands for.
    "graph": {
        "card_id": ["one"],
        "Merchant Name": ["one"],
    },
}

# Relative path to the transactions CSV. Forward slashes are used so the path
# resolves on POSIX systems as well as on Windows.
data_path = "Data/credit_card_transactions-ibm_v2.csv"

In [9]:
# End-to-end pipeline: ingest -> preprocess -> build graph -> train -> reduce embeddings.
# Every step delegates to a project class from `src`; what each one does
# internally is not visible from this notebook.

# Read the dataset located at `data_path`.
ingest_data = IngestData(data_path)
df = ingest_data.get_data()

# Preprocess the frame; `df` is rebound to the processed result, so the raw
# frame is no longer reachable under this name after this line.
process_strategy = DataPreprocessStrategy()
df = process_strategy.handle_data(df)

# Build the graph object from the processed frame and the graph definition.
graph_strategy = HeteroGraphStrategy()
hgraph = graph_strategy.get_graph(df,graph_def_dict)

# Train on the graph; `train()` returns the model object that is displayed in a later cell.
trained_model = HeteroGnnClassificationModel(hgraph).train()

# Reduce the learned representation of the "transaction" nodes for plotting.
# NOTE(review): presumably returns two coordinates per transaction node (it is
# later wrapped in a DataFrame with columns ['x', 'y']) — confirm.
reduc = NodeRepresentation().embedding_dim_reduction(hgraph,trained_model,"transaction")

       Year  Month  Day  Amount  Use Chip  Merchant Name  Merchant City   MCC  \
0      2002      9    1  134.09         2            376            128  5300   
1      2002      9    1   38.48         2            258            165  5411   
2      2002      9    2  120.34         2            258            165  5411   
3      2002      9    2  128.95         2            372            165  5651   
4      2002      9    3  104.71         2            456            128  5912   
...     ...    ...  ...     ...       ...            ...            ...   ...   
19995  2011      2   20  112.73         2            400             60  5655   
19996  2011      2   22   62.54         2             57            143  5814   
19997  2011      2   27   56.41         2            124            143  5812   
19998  2011      2   28  111.61         2            175            143  7538   
19999  2011      3    1   12.72         2             57            143  5814   

       Errors?  Is Fraud?  

100%|██████████| 30/30 [00:05<00:00,  5.20it/s]
100%|██████████| 30/30 [00:06<00:00,  4.64it/s]


In [6]:
# Display the repr of the model returned by HeteroGnnClassificationModel(hgraph).train().
trained_model

HeteroGNN(
  (convs): ModuleList(
    (0-1): 2 x HeteroConv(num_relations=4)
  )
  (lin): Linear(256, 2, bias=True)
)

In [13]:
import plotly.express as px
import pandas as pd

In [16]:
# Wrap the reduced embedding in a frame with named axes for plotting.
q = pd.DataFrame(data=reduc, columns=["x", "y"])

In [18]:
# Attach the `y` values stored on the graph's 'transaction' entry as a
# `cluster` column; the scatter plot uses this column for coloring.
# NOTE(review): presumably these are the per-transaction target labels, with
# one entry per row of `q` in the same order — confirm.
q['cluster'] = hgraph['transaction'].y

In [None]:
# Scatter plot of the reduced embedding: one point per row of `q`,
# colored by the `cluster` column.
fig = px.scatter(
    data_frame=q,
    x="x",
    y="y",
    color="cluster",
    title="Embeddings",
)
fig.show()