<a href="https://colab.research.google.com/github/purvasingh96/Deep-Reinforcement-Learning/blob/master/4.%20Deep%20Q%20Networks/Credit_Card_Fraud_Detection_via_DQNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading data and Making *gym-fraud-detection* environment

In [0]:
import shutil

shutil.rmtree('/content/gym-fraud-detection')

In [1]:
!unzip dataset.zip

Archive:  dataset.zip
   creating: dataset/
  inflating: dataset/creditcard.csv  


In [2]:
import os
print(os.getcwd())

/content


In [0]:
!unzip gym-fraud-detection.zip

In [0]:
import os
os.chdir('gym-fraud-detection')

In [0]:
pip install -e .

In [0]:
import gym
import gym_fraud_detection

In [0]:
os.chdir('/content')
env = gym.make('gym-fraud-detection-v0')

In [51]:
print(env.action_space)
print(env.observation_space)

Discrete(2)
Discrete(284807)


# Download proper libraries

In [0]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


In [0]:
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Exploring the custom gym environment

In [14]:
print(env.action_space.n)

2


In [0]:
actions = []
rewards = []
count = 5

while True:
  action = env.action_space.sample()
  reward = env.step(action)
  actions.append(action)
  rewards.append(reward)
  count -= 1
  if count==0:
    break


In [16]:
for i in range(5):
  print("action : ", actions[i], "labelled data : ", env.label_for(i))
  print("reward : ", rewards[i])

action :  0 labelled data :  0.0
reward :  (1, 1, False, '{"true_positive_rate": 0, "false_positive_rate": 0, "true_negative_rate": 0, "false_negative_rate": 0}')
action :  0 labelled data :  0.0
reward :  (2, 1, False, '{"true_positive_rate": 0, "false_positive_rate": 0, "true_negative_rate": 0, "false_negative_rate": 0}')
action :  0 labelled data :  0.0
reward :  (3, 1, False, '{"true_positive_rate": 0, "false_positive_rate": 0, "true_negative_rate": 0, "false_negative_rate": 0}')
action :  0 labelled data :  0.0
reward :  (4, 1, False, '{"true_positive_rate": 0, "false_positive_rate": 0, "true_negative_rate": 0, "false_negative_rate": 0}')
action :  1 labelled data :  0.0
reward :  (5, -1, False, '{"true_positive_rate": 0, "false_positive_rate": 1, "true_negative_rate": 0, "false_negative_rate": 0}')


# Exploring data and creating pytorch model

link : https://www.kaggle.com/dakshmiglani/pytorch-credit-card-fraud-prediction-99-8

In [0]:
import torch, torch.nn as nn, torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch.utils.data as data_utils

In [18]:
df = pd.read_csv('./dataset/creditcard.csv')
df.head(1) # give us a sneek preview of the dataset xD

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0


In [0]:
X = df.iloc[:, :-1].values 
y = df.iloc[:, -1].values

sc = StandardScaler()
X = sc.fit_transform(X)

## 2. Defining the Q-Network

In [0]:
class DQN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(30, 16)
        self.fc2 = nn.Linear(16, 18)
        self.fc3 = nn.Linear(18, 20)
        self.fc4 = nn.Linear(20, 24)
        self.fc5 = nn.Linear(24, 2)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.dropout(x, p=0.25)
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = torch.sigmoid(self.fc5(x))
        return x

# Define DQN Agent

In [0]:
import numpy as np
import random 
from collections import namedtuple, deque 

import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e5)  #replay buffer size
BATCH_SIZE = 1         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR = 5e-4               # learning rate
UPDATE_EVERY = 4        # how often to update the network
EPSILON = 0.8           # probability of chosing on-policy action

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [0]:
class Agent():
  def __init__(self, action_size, seed):
    self.action_size = action_size
    self.seed = random.seed(seed)


    # Q - Network
    self.qnet_local = DQN().double().to(device)
    self.qnet_target = DQN().double().to(device)

    self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=0.001)

    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    self.t_step = 0

  def step(self, state, action, reward, next_step, done):
    print(state, action, reward, next_step, done)
    self.memory.add(state, action, reward, next_step, done)


    # learn every 4 timesteps
    self.t_step = (self.t_step+1)%UPDATE_EVERY
    if self.t_step == 0:
      experience = self.memory.sample()
      self.learn(experience, GAMMA)


  def epsilon_greedy_action(self, state):
    print('success')
    state = state.to(device)
    self.qnet_local.eval()
    with torch.no_grad():
      print('Actual output of qnetwork : {}'.format(self.qnet_local(state)))
      action_values = self.qnet_local(state).max(1)[1]#.view(1, 1)
      print('action_values : ', action_values)
    self.qnet_local.train()

    if random.random() < 0.8:
      print('Predicting action based on QNetwork')
      return action_values
    else:
      print('Chosing a random action')
      return random.choice(np.arange(self.action_size))
  
  def learn(self, experiences, gamma):
    print('Started learning')
    states, actions, rewards, next_state, done = experiences
    criterion = torch.nn.BCELoss()
    self.qnet_local.double().train()
    self.qnet_target.double().eval()

    predicted_targets = self.qnet_local(states).double().gather(1, actions)
    print('Predicted targets : {}'.format(predicted_targets))
    with torch.zero_grad():
      labels_next = self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
    
    labels = rewards + (gamma * labels_next)
    print('Working fine before loss extraction')
    loss = criterion(predicted_targets, labels).to(device)
    print("Training loss :: {}".format(loss))
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # perform soft update
    self.soft_update(self, self.qnet_local, self.qnet_target, TAU)
  
  def soft_update(self, local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
      target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data)




In [0]:
class ReplayBuffer():
  def __init__(self, action_size, buffer_size, batch_size, seed):
    self.action_size = action_size
    self.memory = deque(maxlen=buffer_size)
    self.batch_size = batch_size
    self.experiences = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
    self.seed = random.seed(seed)
  
  def add(self, state, action, reward, next_state, done):
    experience = self.experiences(state, action, reward, next_state, done)
    self.memory.append(experience)

  def sample(self):
    experiences = random.sample(self.memory, k=self.batch_size)
    
    states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
    actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
    rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
    next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
    dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None])).float().to(device)

    return (states, actions, rewards, next_states, dones)
  
  def __len__(self):
      return len(self.memory)


# Training the agent

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1, random_state=1)

X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train).double()

train = data_utils.TensorDataset(X_train, Y_train)
train_loader = data_utils.DataLoader(train, batch_size=1, shuffle=True)

In [224]:
# check reward strategy once
# add probability to epsilon_greedy
deep_agent = Agent(action_size=2, seed=0)
num_episodes = 200
max_t = 1000

for i in range(num_episodes):
  state = env.reset()
  env.state_index = 0
  score = 0
  for state_idx, data in enumerate(train_loader, 0):
    inputs, labels = data
    print('labels : ', labels)
    action = deep_agent.epsilon_greedy_action(inputs)
    print(action.item())#int())
    next_state, reward, done, _ = env.step(action.item())
    deep_agent.step(state, action, reward, next_state, done)
    state = next_state
    score += reward



labels :  tensor([0.], dtype=torch.float64)
success
Actual output of qnetwork : tensor([[0.4489, 0.4456]], device='cuda:0', dtype=torch.float64)
action_values :  tensor([0], device='cuda:0')
Chosing a random action
1
None 1 -1 1 False
labels :  tensor([0.], dtype=torch.float64)
success
Actual output of qnetwork : tensor([[0.4409, 0.4424]], device='cuda:0', dtype=torch.float64)
action_values :  tensor([1], device='cuda:0')
Predicting action based on QNetwork
1
1 tensor([1], device='cuda:0') -1 2 False
labels :  tensor([0.], dtype=torch.float64)
success
Actual output of qnetwork : tensor([[0.4397, 0.4367]], device='cuda:0', dtype=torch.float64)
action_values :  tensor([0], device='cuda:0')
Chosing a random action
1
2 1 -1 3 False
labels :  tensor([0.], dtype=torch.float64)
success
Actual output of qnetwork : tensor([[0.4426, 0.4345]], device='cuda:0', dtype=torch.float64)
action_values :  tensor([0], device='cuda:0')
Predicting action based on QNetwork
0
3 tensor([0], device='cuda:0') 1 

RuntimeError: ignored

In [179]:
from gym import spaces

x = spaces.Discrete(2)
print(x.dtype)

int64
