In [1]:
import collections
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import spaces
from gym.utils import seeding
%matplotlib inline

In [5]:
from collections import defaultdict

class RLAgent: 
    """
        Reinforcement Learning Agent Model for training/testing
       using Q-Learning
        
    """
    
    def __init__(self, env):
        self.env = env
        #number of actions
        self.nA = self.env.action_space.n
        #Initialise Q-Table
        self.Q = defaultdict(lambda: np.zeros(self.nA))
    
    # greedy action selection
    def greedy(self, Q, state):
        return np.argmax(self.Q[state])  
    
    #epsilon greed function
    def epsilon_greed(self, epsilon, state):
        if np.random.rand() < epsilon:
            return np.random.randint(self.nA)
        else:
            return self.greedy(self.Q, state)
    
    #Slicing the state to use for Q-table update. 
    #I am maintaining player card sum, dealer card info and usable ace to store in Q-table
    def sliceState(self,state):
        state = tuple((state['player_info']['player_sum_card'], state['dealer_info'], state['player_info']['usable_ace']))
        return state

    def train(self,player1, **params):
        self.player1 = player1
        
        # parameters
        gamma = params.pop('gamma', 0.99)
        alpha = params.pop('alpha', 0.1)
        epsilon= params.pop('epsilon', 0.1)
        maxiter= params.pop('maxiter', 1000) #no. of games
        maxstep= params.pop('maxstep', 1000) #no. of rounds

        #Add player
        #player1 = Player('player1')
        #env.add_player(player1)        
        
        #online train
        #initialise rewards trace
        rtrace = []
        
        for j in range(maxiter):
            self.env.reset()
            
            #initialise rewards and trace as an empty list
            rewards = []
            trace = []
            
            # run simulation for max number of steps 
            for step in range(maxstep): 
        
                #Check if you have enough money left in the account to bet, if not break and start a new game
                valid_bet_flag = self.env.get_valid_bet_amount(player1.player_name)
                if valid_bet_flag['is_round_done']:
                    break
                bet_amount = valid_bet_flag['valid_bet_amount']

                #Get the observations i.e state and use it to decide the bet amount
                state = self.env.init_round(player1.player_name)
                
                #slice the state to get required info only
                state = self.sliceState(state['state'])

                # If you have just $1 left in the bank, ALL-IN 
                if len(bet_amount) == 1 and bet_amount[0] == 1:
                    bet = 1
                else:
                    bet = np.random.choice(bet_amount)
                    self.env.bet_money(player1.player_name, bet)

                #Select action
                action = self.epsilon_greed(epsilon, state)
                
                while True:
                    #Use the selected action to actually take the action in env by calling step
                    result = self.env.step(player1.player_name, action)
                    
                    #Fetch the state, reward and done from result
                    next_state, reward, done = result['state'], result['reward'], result['is_round_done']
                    next_state = self.sliceState(next_state)
                    
                    #Select the next action
                    next_action = self.epsilon_greed(epsilon,next_state)
                    trace.append(next_state)
                    rewards.append(reward)

                    #update the Q-table using Q-learning algorithm
                    self.Q[state][action] = self.Q[state][action] + ( alpha * ( (reward + (gamma * np.max(self.Q[next_state]))) - self.Q[state][action] ) )    
                    
                    state = next_state
                    action = next_action
                    if done:
                        break

            rtrace.append(np.sum(rewards))
        return rtrace, trace

    def test(self, maxiter=1000):
        epsilon = 0
        #Initialise counters to track win, loss and draw
        win = 0
        lose = 0
        draw = 0
        
        # rewards trace
        rtrace = []
        trace = []
        
        #keeping track of number of rounds
        rounds = 0
        
        for j in range(maxiter):         
            self.env.reset()
            
            #Checking if we have enough money left in the account to bet, if not break and start a new game
            valid_bet_flag = self.env.get_valid_bet_amount(self.player1.player_name)
            if valid_bet_flag['is_round_done']:
                break
            bet_amount = valid_bet_flag['valid_bet_amount']

            #Get the observations i.e state and use it to decide how much to bet
            state = self.env.init_round(self.player1.player_name)

            #slice the state to get required info only
            state = self.sliceState(state['state'])

            # If you have just $1 left in the bank, ALL-IN 
            if len(bet_amount) == 1 and bet_amount[0] == 1:
                bet = 1
            else:
                bet = np.random.choice(bet_amount)
                self.env.bet_money(self.player1.player_name, bet)

            #Select action
            action = self.epsilon_greed(epsilon, state)
          
            rewards = []
            
            while True :
                #Use the selected action to actually take the action in env by calling step
                result = self.env.step(self.player1.player_name, action)
                
                next_state, reward, done = result['state'], result['reward'], result['is_round_done']
                next_state = self.sliceState(next_state)
                next_action = self.epsilon_greed(epsilon,next_state)
                trace.append(next_state)
                rewards.append(reward)

                state = next_state
                action = next_action
                rounds += 1

                if(reward>0):
                    win = win + 1
                elif(reward<0):
                    lose = lose + 1
                else:
                    draw = draw + 1

                if done:
                    break
                    
            rtrace.append(np.sum(rewards))
        
        #print("\n No. of rounds Won :  {}  Lost : {}  Draw : {}".format(win,lose,draw))
        #print("\n Win Percentage",(win*100)/rounds, "%")
        #print("\n Lose Percentage",(lose*100)/rounds, "%")
        #print("\n Draw Percentage",(draw*100)/rounds, "%")
        
        return rtrace, trace,  ((win*100)/rounds)       
    

In [11]:
#Running the agent with best hyperparamters of ParameterGrid Experiment
epsilon=0.7
env_f = BlackjackEnv()
player1f = Player('player1')
env_f.add_player(player1f)
agent_f = RLAgent(env_f)
#rtrace,trace = agent_f.train(player1f,epsilon=0.7,alpha = 0.8, gamma =0.9 )
rtrace,trace = agent_f.train(player1f,epsilon=0.5,alpha = 1, gamma =1 , maxiter= 500, maxstep=300)
rtrace_test, trace_test, win = agent_f.test()
print("Win  :", win,"%")

Win  : 32.4390243902439 %


In [None]:
import socket
import os
import subprocess
import ast
import numpy as np
player = 'eric' #enter unique name for your game
s = socket.socket()
# change ip addres to ip adress of your computer or use 'localhost' to practice
#host = '10.xx.xx.xx' 
host = 'localhost'
port = 9999

s.connect((host, port))

while True:
    data = s.recv(1024)
    if data.decode("utf-8") == 'send':
        s.send(str.encode( player))
        client_response = str(s.recv(20480), "utf-8")
        print(client_response, end="")
        break
while True:
    data = s.recv(1024)
    if data.decode("utf-8") == 'sendbet':
        ro=str(s.recv(20480), "utf-8")
        ro=ast.literal_eval(ro)
        print(ro)
# The above observation are stored in dict format. To access specific variables use syntax as 
# ro['state']['player_info']['player_total_balance']
# send whichever variable information stored in ro to your agent to help make the decision
#### Look at the above observations stored in variable ro as dictionary and store bet amount in variable bet#####
        bet=2
        s.send(str.encode( str(bet)))
        
    if data.decode("utf-8") == 'sendaction':
        ro=str(s.recv(20480), "utf-8")
        ro=ast.literal_eval(ro)
        print(ro)
        #follow same procedure as abov to send observations to your agent
        ###### Look at the above observations and action in variable action#####
        
        state = tuple((ro['state']['player_info']['player_sum_card'], ro['state']['dealer_info'], ro['state']['player_info']['usable_ace']))
        action = agent_f.epsilon_greed(0,state)
        #action = 1
        print("action ",action)
        s.send(str.encode( str(action)))
    if data.decode("utf-8") == 'gameover':
        print("Game over wait for others to play and wait for results")
        
        
    

Connection established, Names collected{'state': {'player_info': {'player_sum_card': 13, 'player_card': [7, 6], 'player_total_balance': 10, 'usable_ace': False}, 'dealer_info': 10}}
{'state': {'player_info': {'player_sum_card': 13, 'player_card': [7, 6], 'player_total_balance': 10, 'usable_ace': False}, 'dealer_info': 10}}
action  0
{'state': {'player_info': {'player_sum_card': 13, 'player_card': [10, 3], 'player_total_balance': 8, 'usable_ace': False}, 'dealer_info': 4}}
{'state': {'player_info': {'player_sum_card': 13, 'player_card': [10, 3], 'player_total_balance': 8, 'usable_ace': False}, 'dealer_info': 4}}
action  0
{'state': {'player_info': {'player_sum_card': 18, 'player_card': [10, 8], 'player_total_balance': 6, 'usable_ace': False}, 'dealer_info': 8}}
{'state': {'player_info': {'player_sum_card': 18, 'player_card': [10, 8], 'player_total_balance': 6, 'usable_ace': False}, 'dealer_info': 8}}
action  0
