This notebook is an experimental reinforcement-learning model.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
# Kaggle environments.
!git clone https://github.com/Kaggle/kaggle-environments.git
!cd kaggle-environments && pip install .

# GFootball environment.
!apt-get update -y
!apt-get install -y libsdl2-gfx-dev libsdl2-ttf-dev

# Make sure the branch used in `git clone` matches the version in the `wget` URL below!
!git clone -b v2.6 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib

!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.6.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .

# keras reinforcement learning
!pip install reinforcement_learning_keras

# Setting up the model

In [None]:
import keras
import math
from keras.models import Sequential
from keras.layers import Dense, Dropout

class RLNAgent:
    """Keras network that scores 18 actions from a 103-value game state.

    The agent stores every step of an episode with `remember` and is trained
    afterwards with `replay`, which fits the network one stored step at a
    time towards a noisy, reward-adjusted copy of the stored prediction.
    """

    def __init__(self):
        self.memory = []           # (state, predict, action, reward, next_state) per step
        self.rewards = []          # reward of each remembered step, same order as memory
        self.gamma = 0.3           # NOTE(review): not read anywhere in this class
        self.epsilon = 0.7         # share of random noise mixed into training targets
        self.epsilon_decay = 0.85  # multiplier applied to epsilon after each replay
        self.epsilon_min = 0.05    # epsilon stops decaying once it is at or below this
        self.learning_rate = 0.0002
        self._build_model()

    def _build_model(self):
        """Build and compile the dense network and store it in `self.model`."""
        model = Sequential()
        model.add(Dense(157, input_dim=103, activation='linear'))
        model.add(Dense(101, activation='tanh'))
        model.add(Dense(83, activation='tanh'))
        model.add(Dense(29, activation='tanh'))
        model.add(Dense(18, activation='tanh'))
        # `learning_rate` is the supported keyword; the older `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(
            loss='mean_squared_error',
            optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate),
        )
        self.model = model

    def state_as_sample103(self, obs):
        """Flatten a raw observation dict into a (1, 103) feature array.

        The features are concatenated in this order: active player position
        and direction, ball position and direction, active player position
        relative to the ball, all left-team positions and directions, all
        right-team positions and directions, and a 3-value one-hot of
        `ball_owned_team` (-1, 0, anything else).

        Raises whatever `reshape` raises if the observation does not flatten
        to exactly 103 values.
        """
        sample103 = np.concatenate((
            np.array(obs['left_team'][obs['active']]).flatten(),
            np.array(obs['left_team_direction'][obs['active']]).flatten(),
            np.array(obs['ball']).flatten(),
            np.array(obs['ball_direction']).flatten(),
            # NOTE(review): `[0:1]` broadcasts the ball's x over both player
            # coordinates; `[0:2]` may have been intended. Left unchanged
            # because the inference agents written to main.py must compute
            # the same features as the ones used for training.
            np.array(obs['left_team'][obs['active']]).flatten() - np.array(obs['ball']).flatten()[0:1],
            np.array(obs['left_team']).flatten(),
            np.array(obs['left_team_direction']).flatten(),
            np.array(obs['right_team']).flatten(),
            np.array(obs['right_team_direction']).flatten(),
            (lambda x: [1, 0, 0] if x == -1 else ([0, 1, 0] if x == 0 else [0, 0, 1]))(obs['ball_owned_team'])
        ))
        return sample103.reshape((1, 103))

    def remember(self, state, predict, action, reward, next_state):
        """Append one played step to the episode memory and reward history."""
        self.memory.append((state, predict, action, reward, next_state))
        self.rewards.append(reward)

    def act_predict(self, state):
        """Return the network output for `state` (one score per action)."""
        return self.model.predict(state)

    def replay(self, batch_size, maxreward):
        """Train the network on every remembered step except the first.

        `batch_size` and `maxreward` are accepted for interface compatibility
        but are not used. Step 0 is skipped because the target uses the
        reward difference to the previous step.
        """
        memory_length = len(self.memory)

        for i in range(1, memory_length):
            state, predict, action, _, _ = self.memory[i]

            # Blend the stored prediction with uniform noise.
            target_f = (predict * (1 - self.epsilon)) + (np.random.rand(1, 18) * self.epsilon)

            # Exponentially down-weighted sum of the rewards of this step and
            # up to 29 following steps (about 1.5 s of play according to the
            # original author).
            followup_rewards = sum(
                self.rewards[j] * math.exp((i - j) * 2)
                for j in range(i, min(i + 30, memory_length))
            )

            # Adjust the score of the action actually taken, capped at 0.8.
            target = (
                target_f[0][action]
                + (followup_rewards * 0.01)
                + (self.rewards[i] - self.rewards[i - 1]) * 0.5
            )
            target_f[0][action] = min(target, 0.8)

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, path='rl_model'):
        """Save the network to `path` (defaults to 'rl_model')."""
        self.model.save(path)

    def load(self, path='rl_model'):
        """Replace the network with the one stored at `path`."""
        self.model = keras.models.load_model(path)

## Making the simulation environment

In [None]:
from kaggle_environments import make
import copy
import random

# Football environment used for training: no video capture, the full
# 11-vs-11 Kaggle scenario, and notebook mode switched on.
env = make(
    "football",
    configuration=dict(
        save_video=False,
        scenario_name="11_vs_11_kaggle",
        running_in_notebook=True,
    ),
)

## Rule-based agent as the starting point for reinforcement learning

In [None]:
def closed_contestant_num(obs, controlled_player_pos):
    """Count right-team players close to the controlled player.

    A player counts as close when it is less than 0.08 away on x and less
    than 0.04 away on y. The first entry of `obs["right_team"]` is never
    counted (presumably the goalkeeper; verify against the environment).
    """
    return sum(
        1
        for rival in obs["right_team"][1:]
        if abs(controlled_player_pos[0] - rival[0]) < 0.08
        and abs(controlled_player_pos[1] - rival[1]) < 0.04
    )

def rule_based_agent(obs):
    """Pick an action index with hand-written rules.

    Without the ball, the controlled player moves towards it: 5 or 1 when
    the ball is more than 0.05 away on x, 7 or 3 when it is more than 0.05
    away on y, otherwise 16. With the ball, the player returns 12 when past
    x = 0.5 with fewer than 4 opponents close by, 3 or 7 when past x = 0.5,
    beyond |y| = 0.3 and more than 5 opponents are close, and 5 otherwise.
    """
    pos = obs['left_team'][obs['active']]
    has_ball = obs['ball_owned_player'] == obs['active'] and obs['ball_owned_team'] == 0

    if not has_ball:
        # Chase the ball, horizontal offset first.
        ball = obs['ball']
        if ball[0] > pos[0] + 0.05:
            return 5
        if ball[0] < pos[0] - 0.05:
            return 1
        if ball[1] > pos[1] + 0.05:
            return 7
        if ball[1] < pos[1] - 0.05:
            return 3
        return 16

    nearby = closed_contestant_num(obs, pos)
    if pos[0] > 0.5:
        if nearby < 4:
            return 12
        if nearby > 5:
            if pos[1] > 0.3:
                return 3
            if pos[1] < -0.3:
                return 7
    return 5

# Playing games and training the model

In [None]:
def _shaped_reward(obs, next_obs, action, reward):
    """Return the environment reward plus the hand-written shaping terms.

    `obs` / `next_obs` are the raw player observations before and after the
    step, `action` is the action index that was played and `reward` is the
    environment reward for the step.
    """
    player = obs['left_team'][obs['active']]
    next_player = next_obs['left_team'][next_obs['active']]
    ball = obs['ball']
    next_ball = next_obs['ball']

    # Winning the ball from the other team earns 0.05.
    if obs['ball_owned_team'] == 1 and next_obs['ball_owned_team'] == 0:
        reward += 0.05

    # While we own the ball, reward moving it to the right and penalise
    # distance from the horizontal centre line.
    if obs['ball_owned_team'] == 0:
        reward += (next_ball[0] - ball[0]) * 0.1 - abs(next_ball[1]) * 0.01

    # Manhattan distance between the active player and the ball.
    distance = abs(player[0] - ball[0]) + abs(player[1] - ball[1])
    next_distance = abs(next_player[0] - next_ball[0]) + abs(next_player[1] - next_ball[1])

    # Penalise standing (almost) still.
    if abs(next_player[0] - player[0]) < 0.01 and abs(next_player[1] - player[1]) < 0.01:
        reward -= 0.5

    # When we do not own the ball afterwards, penalise being far from it and
    # moving further away.
    if next_obs['ball_owned_team'] != 0:
        reward += - distance - (next_distance - distance) * 2

    # Action 12 (shot): encouraged with the ball past x = 0.5, discouraged in
    # our own half.
    if obs['ball_owned_team'] == 0 and player[0] > 0.5 and action == 12:
        reward += 0.2
    if player[0] < 0 and action == 12:
        reward -= 0.05

    # Leaving the allowed rectangle costs 2.
    if player[0] > 0.98 or player[0] < -0.98 or player[1] < -0.39 or player[1] > 0.39:
        reward -= 2

    return reward


agent = RLNAgent()

episodes = 150  # playing the game up to {episodes} times
steps = 2000  # each game with {steps} steps

for e in range(episodes):

    env.reset()
    agent.memory = []
    agent.rewards = []

    trainer = env.train([None, "run_right"])
    trainer.reset()

    obs = env.state[0]['observation']['players_raw'][0]
    maxreward = -10

    for time_t in range(steps):    # simulating the game step by step

        state = agent.state_as_sample103(obs)

        if e < 10:
            # The first ten episodes imitate the rule-based coach: its action
            # is played and stored as a one-hot "prediction". The network is
            # not queried because its output would be discarded.
            action = rule_based_agent(obs)
            predict = (np.arange(18) == action).astype(int).reshape((1, 18))
        else:
            predict = agent.act_predict(state)
            # Plain int rather than a NumPy scalar for the environment.
            action = int(np.argmax(predict))

        next_obs, reward, done, info = trainer.step([action])

        # The environment may report no reward; treat that as a heavy penalty.
        reward = -10 if reward is None else reward

        next_obs = next_obs['players_raw'][0]
        next_state = agent.state_as_sample103(next_obs)

        reward = _shaped_reward(obs, next_obs, action, reward)

        agent.remember(state, predict, action, reward, next_state)

        obs = copy.deepcopy(next_obs)

        if maxreward < reward:
            maxreward = reward

        # Stop stepping once the environment reports the episode is over.
        if done:
            break

    print("episode: {}/{}, score: {}, action: {}, probability: {}".format(e + 1, episodes, maxreward, action, np.max(predict,axis = 1)))
    agent.replay(steps, maxreward)

In [None]:
# Persist the trained network (RLNAgent.save writes it to 'rl_model') so the
# main.py agents written below can load it.
agent.save()

In [None]:
%%writefile main.py
# for making a video

import numpy as np # linear algebra
from kaggle_environments.envs.football.helpers import *
import keras
from keras.models import Sequential
from keras.layers import Dense


class QNAgent:
    """Inference-only wrapper around the saved model (local video run)."""

    def __init__(self):
        self.load()

    def act(self, state):
        """Return the index of the highest-scoring action for `state`."""
        scores = self.model.predict(state)
        best_action = np.argmax(scores[0])
        return best_action

    def load(self):
        """Load the trained model from the notebook working directory."""
        self.model = keras.models.load_model('/kaggle/working/rl_model')

    def state_as_sample103(self, obs):
        """Flatten a raw observation dict into a (1, 103) feature array.

        Order: active player position and direction, ball position and
        direction, active player position minus the ball's x, every
        left-team position and direction, every right-team position and
        direction, then a 3-value one-hot of `ball_owned_team`.
        """
        active = obs['active']
        own_pos = np.asarray(obs['left_team'][active]).ravel()
        own_dir = np.asarray(obs['left_team_direction'][active]).ravel()
        ball_pos = np.asarray(obs['ball']).ravel()
        ball_dir = np.asarray(obs['ball_direction']).ravel()

        # One-hot of who owns the ball: nobody (-1), left team (0), otherwise.
        owner = obs['ball_owned_team']
        if owner == -1:
            owner_flags = [1, 0, 0]
        elif owner == 0:
            owner_flags = [0, 1, 0]
        else:
            owner_flags = [0, 0, 1]

        parts = [
            own_pos,
            own_dir,
            ball_pos,
            ball_dir,
            # The one-element slice broadcasts the ball's x over both coordinates.
            own_pos - ball_pos[0:1],
            np.asarray(obs['left_team']).ravel(),
            np.asarray(obs['left_team_direction']).ravel(),
            np.asarray(obs['right_team']).ravel(),
            np.asarray(obs['right_team_direction']).ravel(),
            owner_flags,
        ]
        return np.concatenate(parts).reshape((1, 103))


# Created once at import time; QNAgent.__init__ loads the saved model.
qagent = QNAgent()

# Lookup table from action index to Action member; agent() indexes it with
# the value returned by qagent.act(), so the order of entries matters.
ActionDic = [Action.Idle,
             Action.Left,
             Action.TopLeft,
             Action.Top,
             Action.TopRight,
             Action.Right,
             Action.BottomRight,
             Action.Bottom,
             Action.BottomLeft,
             Action.LongPass,
             Action.HighPass,
             Action.ShortPass,
             Action.Shot,
             Action.Sprint,
             Action.ReleaseDirection,
             Action.ReleaseSprint,
             Action.Slide,
             Action.Dribble,
             Action.ReleaseDribble
            ]

@human_readable_agent
def agent(obs):
    """Return the Action the loaded model scores highest for `obs`."""
    features = qagent.state_as_sample103(obs)
    return ActionDic[qagent.act(features)]

In [None]:
from kaggle_environments import make

# Fresh environment with video capture enabled for a recorded match.
env = make("football", configuration={"save_video": True, "scenario_name": "11_vs_11_kaggle", "running_in_notebook": True})
# Play main.py against the built-in "run_right" agent and keep only the last step.
output = env.run(["/kaggle/working/main.py", "run_right"])[-1]

# Report the final reward, status and info of both sides.
print('Left player: reward = %s, status = %s, info = %s' % (output[0]['reward'], output[0]['status'], output[0]['info']))
print('Right player: reward = %s, status = %s, info = %s' % (output[1]['reward'], output[1]['status'], output[1]['info']))

# Show the recorded match inside the notebook.
env.render(mode="human", width=800, height=600)

In [None]:
%%writefile main.py
# for submission

import numpy as np # linear algebra
from kaggle_environments.envs.football.helpers import *
import keras
from keras.models import Sequential
from keras.layers import Dense


class QNAgent:
    """Inference-only wrapper around the saved model (submission build)."""

    def __init__(self):
        self.load()

    def act(self, state):
        """Return the index of the highest-scoring action for `state`."""
        action_scores = self.model.predict(state)[0]
        return np.argmax(action_scores)

    def load(self):
        """Load the trained model from the agent directory of the simulation."""
        self.model = keras.models.load_model('/kaggle_simulations/agent/rl_model')

    def state_as_sample103(self, obs):
        """Flatten a raw observation dict into a (1, 103) feature array.

        Order: active player position and direction, ball position and
        direction, active player position minus the ball's x, every
        left-team position and direction, every right-team position and
        direction, then a 3-value one-hot of `ball_owned_team`.
        """
        def flat(values):
            # 1-D array view of any nested list / array of numbers.
            return np.asarray(values).ravel()

        idx = obs['active']
        player_pos = flat(obs['left_team'][idx])
        ball_pos = flat(obs['ball'])

        # One-hot of who owns the ball: nobody (-1), left team (0), otherwise.
        owner = obs['ball_owned_team']
        if owner == -1:
            ownership = [1, 0, 0]
        elif owner == 0:
            ownership = [0, 1, 0]
        else:
            ownership = [0, 0, 1]

        features = np.concatenate((
            player_pos,
            flat(obs['left_team_direction'][idx]),
            ball_pos,
            flat(obs['ball_direction']),
            # The one-element slice broadcasts the ball's x over both coordinates.
            player_pos - ball_pos[0:1],
            flat(obs['left_team']),
            flat(obs['left_team_direction']),
            flat(obs['right_team']),
            flat(obs['right_team_direction']),
            ownership,
        ))
        return features.reshape((1, 103))


# Created once at import time; QNAgent.__init__ loads the saved model.
qagent = QNAgent()

# Lookup table from action index to Action member; agent() indexes it with
# the value returned by qagent.act(), so the order of entries matters.
ActionDic = [Action.Idle,
             Action.Left,
             Action.TopLeft,
             Action.Top,
             Action.TopRight,
             Action.Right,
             Action.BottomRight,
             Action.Bottom,
             Action.BottomLeft,
             Action.LongPass,
             Action.HighPass,
             Action.ShortPass,
             Action.Shot,
             Action.Sprint,
             Action.ReleaseDirection,
             Action.ReleaseSprint,
             Action.Slide,
             Action.Dribble,
             Action.ReleaseDribble
            ]

@human_readable_agent
def agent(obs):
    """Return the Action the loaded model scores highest for `obs`."""
    chosen_index = qagent.act(qagent.state_as_sample103(obs))
    return ActionDic[chosen_index]

In [None]:
!tar -czvf submission.tar.gz main.py rl_model