In [157]:
# Import libraries required for the program execution

# for system processing
import os
import warnings
warnings.filterwarnings("ignore")

# for data processing
import numpy as np
import pandas as pd

# for plotting charts & graphs
import matplotlib.pyplot as plt

# for statistical processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.manifold import TSNE

# for Graph processing using pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

In [158]:
os.chdir('C:/Users/athorat/OneDrive - Nice Systems Ltd/00_Amit Thorat Data/ISB - AMPBA/01_Project/Term4_DL/Assignment/Part 4')
transData = pd.read_csv('train_transaction.csv')

In [159]:
transData

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [160]:
transData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB


In [161]:
catCols = transData.select_dtypes(include=['object']).columns.tolist()
numCols = transData.select_dtypes(exclude=['object']).columns.tolist()

In [162]:
print(catCols)

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']


In [163]:
print(numCols)

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V

In [164]:
# Identify columns with null values exceeding the 50% threshold value 
nullPercentage = (transData.isnull().sum() / len(transData)) * 100
nullDF = pd.DataFrame({'Column Name': transData.columns, 'Null Percentage':nullPercentage})
nullDF = nullDF.sort_values(by='Null Percentage', ascending=False)
colwithmorethan50percentnull = nullPercentage[nullPercentage > 50].index.tolist()

# Create new dataframe which has data columns that has less than 50% null value
modTransData = transData.drop(columns = colwithmorethan50percentnull)

In [165]:
catCols = modTransData.select_dtypes(include=['object']).columns.tolist()
numCols = modTransData.select_dtypes(exclude=['object']).columns.tolist()

In [166]:
print(catCols)

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M6']


In [167]:
print(numCols)

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D10', 'D11', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 

In [168]:
# Impute numerical columns with column mean values
colMeans = modTransData.mean()
modTransData = modTransData.fillna(colMeans)

Categorical columns having null values: 'ProductCD', 'card4', 'card6', 'P_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M6'

In [169]:
# Encode categorical variables
labelEncoders = {}
for col in catCols:
    LE = LabelEncoder()
    modTransData[col] = LE.fit_transform(modTransData[col].astype(str))
    labelEncoders[col] = LE

In [170]:
# Standardize numerical features
scaler = StandardScaler()
modTransData[numCols] = scaler.fit_transform(modTransData[numCols])

In [171]:
modTransData.isnull().sum()[modTransData.isnull().sum()!=0]

Series([], dtype: int64)

In [172]:
modTransData

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,-1.732048,-0.190417,-1.577987,-0.278167,4,0.821695,-3.629961e-16,-0.282202,1,-1.393802,...,-0.227588,-0.222876,-0.249776,-0.229654,-0.048378,-0.032816,-0.058051,-0.055289,-0.088857,-0.074144
1,-1.732042,-0.190417,-1.577986,-0.443327,4,-1.457558,2.646603e-01,-0.282202,2,-2.367147,...,-0.227588,-0.222876,-0.249776,-0.229654,-0.048378,-0.062213,-0.058051,-0.055289,-0.088857,-0.074144
2,-1.732036,-0.190417,-1.577972,-0.317889,4,-1.068263,8.138473e-01,-0.282202,4,-0.809796,...,-0.227588,-0.222876,-0.249776,-0.229654,-0.048378,-0.062213,-0.058051,-0.055289,-0.088857,-0.074144
3,-1.732030,-0.190417,-1.577965,-0.355521,4,1.679858,1.305561e+00,-0.282202,2,-2.002143,...,0.556719,-0.222876,-0.249776,-0.229654,-0.026352,0.290551,0.224768,-0.055289,-0.088857,-0.074144
4,-1.732024,-0.190417,-1.577964,-0.355521,1,-1.102133,9.671088e-01,-0.282202,2,-2.367147,...,-0.227588,-0.222876,-0.249776,-0.229654,-0.048378,-0.062213,-0.058051,-0.055289,-0.088857,-0.074144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,1.732024,-0.190417,1.827666,-0.359702,4,-0.683253,-3.629961e-16,-0.282202,4,0.650221,...,-0.227588,0.277647,0.026701,0.181131,-0.048378,-0.062213,-0.058051,-0.055289,-0.088857,-0.074144
590536,1.732030,-0.190417,1.827666,-0.399424,4,0.111252,-8.784150e-01,-0.282202,2,0.601554,...,-0.227588,-0.222876,-0.249776,-0.229654,-0.048378,-0.062213,-0.058051,-0.055289,-0.088857,-0.074144
590537,1.732036,-0.190417,1.827673,-0.435174,4,0.436277,1.484366e+00,-0.282202,2,0.601554,...,-0.227588,-0.222876,-0.249776,-0.229654,-0.048378,-0.062213,-0.058051,-0.055289,-0.088857,-0.074144
590538,1.732042,-0.190417,1.827675,-0.075376,4,-0.422906,7.563743e-01,-0.282202,2,0.601554,...,0.452145,3.091327,3.610522,2.490356,-0.048378,0.499094,-0.058051,-0.055289,-0.088857,-0.074144


In [173]:
# Split the dataset into training and testing sets
X = modTransData.drop(columns=['isFraud']).values
y = modTransData['isFraud'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [174]:
# Define the GNN Model
class GNNModel(nn.Module):
    def __init__(self, num_features, hidden_dim):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.fc(x).squeeze(1)
        return x

In [175]:
# Create an instance of the GNNModel class
model = GNNModel(num_features=X_train.shape[1], hidden_dim=64)

In [176]:
# Define a range of hyperparameters to search
paramGrid = {
    'hiddenDim': [32, 64, 128],
    'learningRate': [0.001, 0.01, 0.1]
}

In [177]:
# Create adjancy matrix
numNodes = len(colNames)
adjMatrix = np.zeros((numNodes, numNodes), dtype=int)
for i in range(numNodes):
    for j in range(numNodes):
        if i != j:
            adjMatrix[i, j] = 1
        
print(adjMatrix)

[[0 1 1 ... 1 1 1]
 [1 0 1 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 ...
 [1 1 1 ... 0 1 1]
 [1 1 1 ... 1 0 1]
 [1 1 1 ... 1 1 0]]


In [180]:
bestScore = 0
bestModel = None
# Iterate over all hyperparameter combinations
for params in ParameterGrid(paramGrid):
    # Create a new model instance with the current hyperparameters
    model = GNNModel(num_features=X_train.shape[1], hidden_dim=params['hiddenDim']).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params['learningRate'])
    criterion = nn.BCEWithLogitsLoss()
    hidDim = params['hiddenDim']
    learnRate = params['learningRate']

    # Training loop (as in your original code)
    epochs = 10
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the validation data (you should have a validation set)
    model.eval()
    with torch.no_grad():
        val_data = Data(x=torch.tensor(X_val, dtype=torch.float32).to(device),
                        edge_index=torch.tensor(adjMatrix.nonzero(), dtype=torch.long).to(device),
                        y=torch.tensor(y_val, dtype=torch.float32).to(device))
        y_pred = model(val_data).cpu().numpy()
        threshold = 0.5
        y_val_binary = (y_val > threshold).astype(int)
        y_pred_binary = (y_pred > threshold).astype(int)
        
        # Calculate a performance metric (e.g., ROC AUC) for the current hyperparameters
        roc_auc = roc_auc_score(y_val_binary, y_pred)
        print(f'For Hidden Dimension {hidDim} and learning rate {learnRate}: ROC AUC Score: {roc_auc} ')

        # Update the best model and score if the current model is better
        if roc_auc > bestScore:
            bestScore = roc_auc
            bestModel = model

For Hidden Dimension 32 and learning rate 0.001: ROC AUC Score: 0.7301494368845162 
For Hidden Dimension 32 and learning rate 0.01: ROC AUC Score: 0.769512028179264 
For Hidden Dimension 32 and learning rate 0.1: ROC AUC Score: 0.7705418715413875 
For Hidden Dimension 64 and learning rate 0.001: ROC AUC Score: 0.681853861814113 
For Hidden Dimension 64 and learning rate 0.01: ROC AUC Score: 0.7675464194233521 
For Hidden Dimension 64 and learning rate 0.1: ROC AUC Score: 0.7681324940182755 
For Hidden Dimension 128 and learning rate 0.001: ROC AUC Score: 0.6933037115108868 
For Hidden Dimension 128 and learning rate 0.01: ROC AUC Score: 0.7677131559381201 
For Hidden Dimension 128 and learning rate 0.1: ROC AUC Score: 0.7680374078092223 


In [181]:
bestModel.eval()
with torch.no_grad():
    test_data = Data(x=torch.tensor(X_test, dtype=torch.float32).to(device),
                     edge_index=torch.tensor(adjMatrix.nonzero(), dtype=torch.long).to(device),
                     y=torch.tensor(y_test, dtype=torch.float32).to(device))
    y_pred = bestModel(test_data).cpu().numpy()
    threshold = 0.5
    y_test_binary = (y_test > threshold).astype(int)
    y_pred_binary = (y_pred > threshold).astype(int)
    
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    f1 = f1_score(y_test_binary, y_pred_binary)
    roc_auc = roc_auc_score(y_test_binary, y_pred)
    nodeEmbeddings = bestModel(data).cpu().numpy()

    print(bestModel)
    print(f"Best Model - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

GNNModel(
  (conv1): GCNConv(219, 32)
  (conv2): GCNConv(32, 32)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)
Best Model - Accuracy: 0.8804, F1 Score: 0.1969, ROC AUC: 0.7685


In [153]:
# Perform dimensionality reduction using t-SNE
nodeEmbeddings2d = nodeEmbeddings.reshape(-1, 1)
tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=0)
embeddings_2d = tsne.fit_transform(nodeEmbeddings2d)

# Visualize the 2D embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=data.y.cpu().numpy(), cmap=plt.get_cmap('viridis'))
plt.colorbar()
plt.title("Node Embeddings Visualization (t-SNE)")
plt.show()

ValueError: n_components=2 must be between 1 and min(n_samples, n_features)=1 with svd_solver='randomized'

In [154]:
nodeEmbeddings

array([-119303.64 , -119303.64 , -119303.64 , ..., -202027.83 ,
       -213673.81 ,  -42791.832], dtype=float32)

In [156]:
len(nodeEmbeddings)

472432