# AI-Based Knowledge Graph Builder for Enterprise Intelligence
## Milestone 1 – Data Ingestion, Preprocessing, Transformation & Enrichment


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


In [2]:
# 1. DATA INGESTION (SOURCE → TARGET)
# Load the raw transactions from the CSV in the working directory into `df`,
# report the table's dimensions and column names, then preview the first rows.
df = pd.read_csv("creditcard.csv")
print("Shape:", (len(df.index), len(df.columns)))
print("Columns:", list(df.columns))
df.head(n=5)


Shape: (10000, 10)
Columns: ['transaction_id', 'amount', 'transaction_hour', 'merchant_category', 'foreign_transaction', 'location_mismatch', 'device_trust_score', 'velocity_last_24h', 'cardholder_age', 'is_fraud']


Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud
0,1,84.47,22,Electronics,0,0,66,3,40,0
1,2,541.82,3,Travel,1,0,87,1,64,0
2,3,237.01,17,Grocery,0,0,49,1,61,0
3,4,164.33,4,Grocery,0,1,72,3,34,0
4,5,30.53,15,Food,0,0,79,0,44,0


In [3]:
# DATA VALIDATION
# Three quick checks on the freshly loaded frame:
#   - how many missing values each column holds,
#   - how many rows exactly repeat an earlier row,
#   - which dtype pandas assigned to each column.
print("Null values:\n", df.isna().sum())
print("Duplicate rows:", df.duplicated(keep="first").sum())
print("\nData Types:\n", df.dtypes)


Null values:
 transaction_id         0
amount                 0
transaction_hour       0
merchant_category      0
foreign_transaction    0
location_mismatch      0
device_trust_score     0
velocity_last_24h      0
cardholder_age         0
is_fraud               0
dtype: int64
Duplicate rows: 0

Data Types:
 transaction_id           int64
amount                 float64
transaction_hour         int64
merchant_category       object
foreign_transaction      int64
location_mismatch        int64
device_trust_score       int64
velocity_last_24h        int64
cardholder_age           int64
is_fraud                 int64
dtype: object


In [4]:
# 2. PREPROCESSING (CLEANING)
# Written as a single chain: every step returns a new frame and `df` is
# rebound once at the end.
df = (
    # column names: lower-cased, then surrounding whitespace trimmed
    df.set_axis(df.columns.str.lower().str.strip(), axis=1)
    # drop rows that exactly repeat an earlier row (first occurrence is kept)
    .drop_duplicates()
    # empty strings are turned into NaN (rows are kept, not removed)
    .replace('', np.nan)
)
print("After cleaning shape:", df.shape)


After cleaning shape: (10000, 10)


In [5]:
# 3. DATA TRANSFORMATION
# risk_flag is the element-wise sum of foreign_transaction and
# location_mismatch, so a row where both columns are 1 gets the value 2.
df["risk_flag"] = df["foreign_transaction"].add(df["location_mismatch"])

df.head(n=5)


Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud,risk_flag
0,1,84.47,22,Electronics,0,0,66,3,40,0,0
1,2,541.82,3,Travel,1,0,87,1,64,0,1
2,3,237.01,17,Grocery,0,0,49,1,61,0,0
3,4,164.33,4,Grocery,0,1,72,3,34,0,1
4,5,30.53,15,Food,0,0,79,0,44,0,0


In [6]:
# ==============================
# 4. DATA NORMALIZATION
# ==============================

scaler = MinMaxScaler()

# Each (source column, output column) pair is min-max scaled on its own.
# NOTE: fit_transform refits `scaler` on every pass, so once the loop ends
# the object only holds the parameters fitted on the last source column.
for raw_col, norm_col in (
    ("amount", "amount_norm"),
    ("device_trust_score", "trust_norm"),
    ("velocity_last_24h", "velocity_norm"),
):
    # double brackets keep the selection 2-D, as fit_transform is given here
    df[norm_col] = scaler.fit_transform(df[[raw_col]])


In [7]:
# ==============================
# 5. DATA ENRICHMENT (KG ENTITIES)
# ==============================

# Adds one label column per entity type, in this order:
#   transaction_node - "TX_" followed by the transaction_id
#   merchant_node    - a copy of merchant_category
#   cardholder_node  - "USER_" followed by the row's index label
#                      (taken from the frame index, not from a cardholder column)
df = df.assign(
    transaction_node=lambda frame: "TX_" + frame["transaction_id"].astype(str),
    merchant_node=lambda frame: frame["merchant_category"],
    cardholder_node=lambda frame: "USER_" + frame.index.astype(str),
)


In [8]:
# ==============================
# 6. CLASSIFICATION LABEL
# ==============================

# Text version of is_fraud: 0 -> "Normal", 1 -> "Fraud".
# Any value missing from the mapping comes out as NaN.
df = df.assign(fraud_label=df["is_fraud"].map({0: "Normal", 1: "Fraud"}))



In [9]:
# ==============================
# SAVE PROCESSED DATASET
# ==============================

# Write the enriched frame to the working directory; index=False leaves the
# row index out of the file. The first rows are then shown as a final check.
df.to_csv(path_or_buf="processed_creditcard.csv", index=False)

df.head(n=5)


Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud,risk_flag,amount_norm,trust_norm,velocity_norm,transaction_node,merchant_node,cardholder_node,fraud_label
0,1,84.47,22,Electronics,0,0,66,3,40,0,0,0.057422,0.554054,0.333333,TX_1,Electronics,USER_0,Normal
1,2,541.82,3,Travel,1,0,87,1,64,0,1,0.368324,0.837838,0.111111,TX_2,Travel,USER_1,Normal
2,3,237.01,17,Grocery,0,0,49,1,61,0,0,0.161117,0.324324,0.111111,TX_3,Grocery,USER_2,Normal
3,4,164.33,4,Grocery,0,1,72,3,34,0,1,0.11171,0.635135,0.333333,TX_4,Grocery,USER_3,Normal
4,5,30.53,15,Food,0,0,79,0,44,0,0,0.020754,0.72973,0.0,TX_5,Food,USER_4,Normal
