# **Reinforcement Learning from Human Feedback for Cancer Cell Detection through Dynamic Policies (on M2 Max)**


# 🤖  <- Agent

`Allowing several phases of iterative human feedback, combined with different algorithms that identify RGB values, shape, texture, extraneous information, etc., can help shape the environmental reward signal and help the agent decide when to favor exploration over exploitation and vice versa.`

In [None]:
# Prints a fixed string; presumably a smoke test that the kernel runs cells.
print("test")

## Manage and Preprocess image dataset for screening ~

In [None]:
# Dataset locations.
# TODO: get better datasets and categorical classifications.
# NOTE(review): `tf` is not imported in the visible cells -- confirm that
# `import tensorflow as tf` runs before this cell.
train_dir = '/kaggle/input/bttai-nybg-2024/BTTAIxNYBG-train/BTTAIxNYBG-train'
valid_dir = '/kaggle/input/bttai-nybg-2024/BTTAIxNYBG-validation/BTTAIxNYBG-validation'

batch_size = 32
img_height = 224
img_width = 224

# Rescaling and augmentation options (rescale, rotation_range, zoom_range, ...)
# are ImageDataGenerator arguments; `image_dataset_from_directory` does not
# accept them. The training pipeline therefore uses an ImageDataGenerator,
# which also matches how the validation data is loaded below.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,  # Rescale pixel values to [0, 1]
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[1.0, 1.1],  # Random brightness factor within [1.0, 1.1]
    fill_mode='nearest',
    validation_split=0.2,  # Hold out 20% of train_dir; only 'training' is used here
)

train_ds = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    subset='training',
    seed=123,
)

# Validation images are only rescaled, never augmented.
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

valid_ds = valid_datagen.flow_from_directory(
    valid_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=False  # Keep a stable order so predictions line up with files
)

# Setup RL Environment ~

In [None]:
import gym
from gym import spaces # hope this is open source
import numpy as np
import cv2  # For image processing

class CancerCellDetectionEnv(gym.Env):
    """Gym environment that rewards correct cancerous / non-cancerous calls.

    Each episode picks one random image from ``images``. On every step the
    agent receives that image resized to ``patch_size`` x ``patch_size`` and
    answers with action 0 (non-cancerous) or 1 (cancerous). The reward is +1
    when the action equals the image's label and -1 otherwise. An episode
    ends after ``max_steps`` steps.

    Args:
        images: Sequence of images (arrays accepted by ``cv2.resize``).
        labels: Sequence of labels, one per image, compared against actions.
        max_steps: Number of steps per episode (defaults to 100).

    Raises:
        ValueError: If ``images`` is empty, contains ``None`` entries, or its
            length differs from ``labels``.
    """

    def __init__(self, images, labels, max_steps=100):
        super().__init__()
        if len(images) == 0:
            raise ValueError('images must contain at least one image')
        if len(images) != len(labels):
            raise ValueError(
                f'images and labels differ in length: {len(images)} != {len(labels)}'
            )
        # cv2.imread yields None for unreadable files; fail here instead of
        # deep inside cv2.resize during an episode.
        unreadable = [index for index, image in enumerate(images) if image is None]
        if unreadable:
            raise ValueError(f'images contains None at indices {unreadable}')

        self.images = images  # List of images
        self.labels = labels  # Corresponding labels
        self.current_image_index = 0
        self.current_step = 0
        self.max_steps = max_steps  # Maximum steps per episode
        self.patch_size = 64  # Side length of the square observation

        # Populated by reset(); None means no episode has started yet.
        self.current_image = None
        self.current_label = None

        # Actions: 0 = non-cancerous, 1 = cancerous
        self.action_space = spaces.Discrete(2)

        # Observations: image patches of shape (patch_size, patch_size, 3)
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(self.patch_size, self.patch_size, 3),
            dtype=np.uint8,
        )

    def _require_episode(self):
        """Raise a clear error when the environment is used before reset()."""
        if self.current_image is None:
            raise RuntimeError('call reset() before using the environment')

    def reset(self):
        """Start a new episode on a random image and return its observation."""
        self.current_image_index = np.random.randint(len(self.images))
        self.current_step = 0
        self.current_image = self.images[self.current_image_index]
        self.current_label = self.labels[self.current_image_index]
        return self.get_observation()

    def step(self, action):
        """Score ``action`` and advance the episode by one step.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``info`` is an
            empty dict and ``done`` becomes True after ``max_steps`` steps.
        """
        self._require_episode()
        reward = self.calculate_reward(action)

        self.current_step += 1
        done = self.current_step >= self.max_steps

        # NOTE(review): the image is not changed between steps, so every
        # observation within an episode is identical -- confirm this is intended.
        observation = self.get_observation()

        return observation, reward, done, {}

    def get_observation(self):
        """Return the current image resized to the observation patch size."""
        self._require_episode()
        # The whole image is used for simplicity rather than a cropped ROI.
        return cv2.resize(self.current_image, (self.patch_size, self.patch_size))

    def calculate_reward(self, action):
        """Return +1 if ``action`` equals the current label, otherwise -1."""
        # The label is assumed to describe the whole image.
        if action == self.current_label:
            return 1  # Positive reward for correct classification
        return -1  # Negative reward for incorrect classification

    def render(self, mode='human'):
        """Show the current (unresized) image in an OpenCV window."""
        self._require_episode()
        cv2.imshow('Cancer Cell Detection', self.current_image)
        cv2.waitKey(1)

    def close(self):
        """Close every OpenCV window opened by render()."""
        cv2.destroyAllWindows()

# Usage Example: run a few episodes with random actions.
if __name__ == '__main__':
    image_paths = ['path_to_image1.jpg', 'path_to_image2.jpg']
    labels = [0, 1]  # Corresponding labels (e.g., 0 = non-cancerous, 1 = cancerous)

    # cv2.imread does not raise on a missing/unreadable file; it returns None,
    # which would only fail later inside cv2.resize. Check right away.
    images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        images.append(image)

    env = CancerCellDetectionEnv(images, labels)

    try:
        for episode in range(5):
            observation = env.reset()
            done = False
            total_reward = 0

            while not done:
                action = env.action_space.sample()  # Random action for testing
                observation, reward, done, info = env.step(action)
                total_reward += reward

                env.render()

            print(f'Episode {episode + 1}: Total Reward = {total_reward}')
    finally:
        # Always release the OpenCV windows, even if an episode fails.
        env.close()


# Proximal Policy Optimization for RLHF Adaptive/Dynamic Policies ~

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Policy network: a small CNN that scores each action.
class PolicyNetwork(nn.Module):
    """Two conv layers followed by two linear layers producing action scores.

    Args:
        input_shape: Indexable ``(channels, height, width)`` of the input.
        num_actions: Size of the output vector (one score per action).
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        in_channels = input_shape[0]
        # Both convs use 3x3 kernels with stride 1 and padding 1, so the
        # spatial size is unchanged and the flattened size is 64 * H * W.
        flattened_size = 64 * input_shape[1] * input_shape[2]

        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(flattened_size, 256)
        self.fc2 = nn.Linear(256, num_actions)

    def forward(self, x):
        """Return one row of ``num_actions`` scores per batch element."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = torch.relu(conv(features))
        batch_size = features.size(0)
        flattened = features.view(batch_size, -1)
        hidden = torch.relu(self.fc1(flattened))
        return self.fc2(hidden)

# Value network: a small CNN that outputs a single scalar per input.
class ValueNetwork(nn.Module):
    """Two conv layers followed by two linear layers producing one value.

    Args:
        input_shape: Indexable ``(channels, height, width)`` of the input.
    """

    def __init__(self, input_shape):
        super().__init__()
        channels = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]

        # 3x3 kernels with stride 1 and padding 1 keep height and width
        # unchanged, so fc1 sees 64 feature maps of size height x width.
        self.conv1 = nn.Conv2d(channels, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * height * width, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of value estimates."""
        conv_out = torch.relu(self.conv2(torch.relu(self.conv1(x))))
        flat = conv_out.view(conv_out.size(0), -1)
        return self.fc2(torch.relu(self.fc1(flat)))

# Define the PPO algorithm
class PPOAgent:
    """PPO agent with a clipped surrogate objective and separate networks.

    Trajectories are lists of ``(state, action, reward, next_state, done)``
    tuples. A state may be a batched ``(N, C, H, W)`` tensor, an unbatched
    ``(C, H, W)`` tensor, or a non-tensor ``(H, W, C)`` image array such as
    the environment's ``(64, 64, 3)`` uint8 observation.

    Args:
        policy_net: Module mapping a state batch to per-action logits.
        value_net: Module mapping a state batch to one value per state.
        lr: Learning rate for both Adam optimizers.
        gamma: Discount factor used for the bootstrapped return.
        eps_clip: Clipping range of the probability ratio.
    """

    def __init__(self, policy_net, value_net, lr=3e-4, gamma=0.99, eps_clip=0.2):
        self.policy_net = policy_net
        self.value_net = value_net
        self.optimizer_policy = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.optimizer_value = optim.Adam(self.value_net.parameters(), lr=lr)
        self.gamma = gamma
        self.eps_clip = eps_clip

    def _device(self):
        """Return the policy network's device (value net assumed co-located)."""
        return next(self.policy_net.parameters()).device

    def _to_tensor(self, state):
        """Convert a state into a batched float32 tensor on the agent's device.

        Non-tensor 3-D inputs are treated as ``(H, W, C)`` images and moved to
        channel-first layout. uint8 data is scaled from [0, 255] to [0, 1].
        """
        was_tensor = isinstance(state, torch.Tensor)
        tensor = torch.as_tensor(state)
        is_byte_image = tensor.dtype == torch.uint8
        if not was_tensor and tensor.dim() == 3:
            # HWC -> CHW; contiguous() avoids a channels-last layout that
            # would break a later `.view(batch, -1)` inside the networks.
            tensor = tensor.permute(2, 0, 1).contiguous()
        if tensor.dim() == 3:
            tensor = tensor.unsqueeze(0)
        tensor = tensor.to(device=self._device(), dtype=torch.float32)
        if is_byte_image:
            tensor = tensor / 255.0
        return tensor

    def select_action(self, state):
        """Sample an action from the current policy.

        Returns:
            Tuple ``(action, log_prob, entropy)`` of tensors, one entry per
            batch element.
        """
        logits = self.policy_net(self._to_tensor(state))
        dist = Categorical(logits=logits)
        action = dist.sample()
        return action, dist.log_prob(action), dist.entropy()

    def update(self, trajectories, epochs=10):
        """Run PPO updates of both networks on the collected transitions.

        The log-probabilities of the behaviour policy are not stored in the
        trajectories, so they are recomputed from the policy network before
        any gradient step is taken. This assumes ``trajectories`` was
        collected with the current policy.

        Args:
            trajectories: List of ``(state, action, reward, next_state, done)``.
            epochs: Number of passes over the transitions (defaults to 10).
        """
        if not trajectories:
            return

        device = self._device()
        returns, advantages = self.compute_advantages(trajectories)
        actions = [
            torch.as_tensor(step[1], dtype=torch.long, device=device).reshape(-1)
            for step in trajectories
        ]

        # Snapshot log pi_old(a|s) before the policy changes.
        with torch.no_grad():
            old_log_probs = [
                Categorical(
                    logits=self.policy_net(self._to_tensor(step[0]))
                ).log_prob(action)
                for step, action in zip(trajectories, actions)
            ]

        for _ in range(epochs):
            for step, action, old_log_prob, advantage, ret in zip(
                trajectories, actions, old_log_probs, advantages, returns
            ):
                # Converted on the fly so only the raw states stay in memory.
                state = self._to_tensor(step[0])

                dist = Categorical(logits=self.policy_net(state))
                new_log_prob = dist.log_prob(action)
                entropy = dist.entropy()

                ratio = torch.exp(new_log_prob - old_log_prob)
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage
                # Clipped surrogate objective plus a small entropy bonus.
                policy_loss = -torch.min(surr1, surr2) - 0.01 * entropy
                self.optimizer_policy.zero_grad()
                policy_loss.mean().backward()
                self.optimizer_policy.step()

                value_loss = (self.value_net(state).reshape(-1) - ret) ** 2
                self.optimizer_value.zero_grad()
                value_loss.mean().backward()
                self.optimizer_value.step()

    def compute_advantages(self, trajectories):
        """Compute one-step bootstrapped returns and advantages.

        For each transition the return is ``reward`` when ``done`` is true,
        otherwise ``reward + gamma * V(next_state)``. The advantage is
        ``return - V(state)``. No gradients are tracked.

        Returns:
            Tuple ``(returns, advantages)`` of lists of 1-D float tensors, in
            the same order as ``trajectories``.
        """
        device = self._device()
        returns = []
        advantages = []
        with torch.no_grad():
            # Each target depends only on its own transition, so a forward
            # pass with append replaces the reversed loop with insert(0, ...).
            for state, _action, reward, next_state, done in trajectories:
                ret = torch.as_tensor(
                    reward, dtype=torch.float32, device=device
                ).reshape(-1)
                if not done:
                    next_value = self.value_net(self._to_tensor(next_state)).reshape(-1)
                    ret = ret + self.gamma * next_value
                value = self.value_net(self._to_tensor(state)).reshape(-1)
                returns.append(ret)
                advantages.append(ret - value)
        return returns, advantages

# Training Loop
input_shape = (3, 64, 64)  # Example input shape (channels, height, width)
num_actions = 2  # Example number of actions (cancer cell or not)

NUM_EPOCHS = 1000  # Number of collect-then-update rounds
EPISODES_PER_EPOCH = 100  # Episodes collected before each PPO update

policy_net = PolicyNetwork(input_shape, num_actions)
value_net = ValueNetwork(input_shape)
ppo_agent = PPOAgent(policy_net, value_net)


def observation_to_tensor(observation):
    """Turn an (H, W, C) uint8 image into a (1, C, H, W) float tensor in [0, 1].

    The environment emits height-width-channel images, while the conv
    networks expect a batched channel-first float tensor.
    """
    # contiguous() avoids a channels-last layout that would break the
    # `.view(batch, -1)` flatten inside the networks.
    chw = torch.as_tensor(observation).permute(2, 0, 1).contiguous()
    return chw.unsqueeze(0).to(torch.float32) / 255.0


for epoch in range(NUM_EPOCHS):
    trajectories = []  # Collected (state, action, reward, next_state, done) tuples
    for _ in range(EPISODES_PER_EPOCH):
        state = observation_to_tensor(env.reset())  # Initialize the environment
        done = False
        while not done:
            action, log_prob, entropy = ppo_agent.select_action(state)
            # The environment compares the action with an integer label, so
            # hand it a plain int rather than a one-element tensor.
            next_observation, reward, done, info = env.step(action.item())
            next_state = observation_to_tensor(next_observation)
            trajectories.append((state, action, reward, next_state, done))
            state = next_state
    ppo_agent.update(trajectories)
    # TODO(Sher): incorporate human feedback into the reward signal.
