## Reinforcement Learning
### Training a taxi driver to pick up and drop off passengers using Q-learning

In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gym
import random
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
# Build the Taxi-v3 environment. `env` is a notebook-level global: the Taxi
# class defined below reads it directly rather than receiving it as an argument.
env = gym.make("Taxi-v3")

In [20]:
# Draw the environment's current grid (taxi, walls and the R/G/Y/B locations).
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [21]:
# Sizes of the discrete action and observation spaces; these become the
# column and row counts of the agent's Q-table.
action_size, state_size = env.action_space.n, env.observation_space.n
print(action_size, state_size)

6 500


In [49]:
class Taxi:
    """Tabular Q-learning agent with an epsilon-greedy exploration policy.

    The agent keeps a ``(state_size, action_size)`` table of action values,
    updates it with the one-step Q-learning rule during ``train`` and follows
    the greedy policy derived from it during ``test``.

    Args:
        action_size: Number of discrete actions (columns of the Q-table).
        state_size: Number of discrete states (rows of the Q-table).
        lr: Learning rate applied to each temporal-difference update.
        gamma: Discount factor applied to the best next-state value.
        eps: Initial exploration probability.
        max_eps: Upper bound used by the exponential epsilon decay.
        min_eps: Lower bound used by the exponential epsilon decay.
        decay_rate: Rate of the exponential epsilon decay per episode.
        environment: Optional environment object exposing ``reset``, ``step``,
            ``render``, ``close`` and ``action_space.sample``. When omitted,
            the notebook-level global ``env`` is looked up at call time.
    """

    def __init__(self, action_size, state_size, lr=0.7, gamma=0.618, eps=1.0,
                 max_eps=1.0, min_eps=0.01, decay_rate=0.01, environment=None):
        self.qtable = np.zeros((state_size, action_size))
        self.action_size = action_size
        self.state_size = state_size
        self.lr = lr
        self.gamma = gamma
        self.eps = eps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.decay_rate = decay_rate
        self.environment = environment

    def _get_env(self):
        """Return the injected environment, or the global ``env`` if none was given."""
        return self.environment if self.environment is not None else env

    def train(self, total_episodes=50000, max_steps=99):
        """Run Q-learning, updating ``self.qtable`` and ``self.eps`` in place.

        Args:
            total_episodes: Number of training episodes to run.
            max_steps: Maximum number of steps taken in a single episode.
        """
        environment = self._get_env()
        for episode in range(total_episodes):
            state = environment.reset()
            for _ in range(max_steps):
                # Epsilon-greedy: exploit the table when the draw exceeds eps,
                # otherwise explore with a randomly sampled action.
                if random.uniform(0, 1) > self.eps:
                    action = np.argmax(self.qtable[state, :])
                else:
                    action = environment.action_space.sample()

                new_state, reward, done, _info = environment.step(action)

                # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
                td_target = reward + self.gamma * np.max(self.qtable[new_state, :])
                self.qtable[state, action] += self.lr * (td_target - self.qtable[state, action])

                state = new_state
                if done:
                    break

            # Exponentially decay exploration from max_eps towards min_eps.
            self.eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(-self.decay_rate * episode)

    def test(self, total_test_episodes=100, max_steps=99, render=True):
        """Roll out the greedy policy and report the mean episode reward.

        Every episode contributes its accumulated reward to the average,
        including episodes that reach ``max_steps`` without finishing.

        Args:
            total_test_episodes: Number of evaluation episodes.
            max_steps: Maximum number of steps taken in a single episode.
            render: When true, render the environment before every step.

        Returns:
            The mean total reward per episode, or ``0.0`` if no episode ran.
        """
        environment = self._get_env()
        rewards = []

        for episode in range(total_test_episodes):
            state = environment.reset()
            total_rewards = 0
            print("EPISODE ", episode)
            for _ in range(max_steps):
                if render:
                    environment.render()
                # Take the action with the maximum expected future reward in this state.
                action = np.argmax(self.qtable[state, :])
                state, reward, done, _info = environment.step(action)
                total_rewards += reward
                if done:
                    break
            # Record the episode whether or not it terminated, so unfinished
            # episodes are not silently counted as a zero score.
            rewards.append(total_rewards)

        environment.close()
        average = sum(rewards) / len(rewards) if rewards else 0.0
        print("Score over time: " + str(average))
        return average

In [50]:
# Create the agent with a zero-initialised Q-table and default hyperparameters.
taxi = Taxi(action_size,state_size)

In [51]:
# Train with the defaults: 50000 episodes of at most 99 steps each.
taxi.train()

In [52]:
# Evaluate the greedy policy over 5 episodes, rendering every step.
taxi.test(5)

EPISODE  0
+---------+
|[35mR[0m: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : 