# policy gradient (part1: 对局数据收集)
 
    agent1 and agent2 are identical
    combine their experiences for training (从两个方面收集到的数据一块训练)

In [19]:
# Environment setup: change into the project directory and add the local
# ./python directory to the import path (presumably where the `dlgo`
# package lives — confirm against the repository layout).
# NOTE(review): the absolute path below is machine-specific; adjust it before
# re-running this notebook elsewhere.
%cd /playground/sgd_deep_learning/sgd_rl/go
import sys
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_rl


In [20]:
import os
import time
import datetime
import torch
from collections import namedtuple

from dlgo import agent
from dlgo import scoring
from dlgo import rl
from dlgo.goboard_fast import GameState, Player, Point

from dlgo.encoders import get_encoder_by_name
from dlgo.networks import cnn_small, resnet18


### 脚本输入参数设定

In [21]:
# Directory where the collected data and the model checkpoint are stored.
data_home_path = 'data/pg/'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(data_home_path, exist_ok=True)


# Script input parameters (a plain class used as a namespace).
class args:
    """Settings read by the self-play cells of this notebook."""
    board_size = 9  # small board keeps computation low so the algorithm can be validated quickly
    num_games = 100  # number of self-play games collected per iteration
    learning_agent = data_home_path + 'agent_checkpoint.pth'  # agent checkpoint path
    experience_out = data_home_path + 'experience.pth'  # where the collected experience is written


# Prints whether an agent checkpoint already exists at the configured path.
print(os.path.exists(args.learning_agent))

# Module-level board size shared by the helper functions in this notebook.
# (A `global` statement at module scope has no effect, so none is used here.)
BOARD_SIZE = args.board_size

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

False
device: cuda


### helper functions

In [22]:
# Column labels used when printing a board (the sequence contains no letter 'I').
COLS = 'ABCDEFGHJKLMNOPQRST'
# Character printed for each point state: empty, black stone, white stone.
STONE_TO_CHAR = {None: '.', Player.black: 'x', Player.white: 'o'}

def avg(items):
    """Return the arithmetic mean of ``items`` as a float, or 0.0 when it is empty."""
    if items:
        return sum(items) / float(len(items))
    return 0.0


def print_board(board):
    """Print ``board`` as text, highest row first, followed by the column labels.

    Each point is looked up with ``board.get(Point(row, col))`` and rendered
    through ``STONE_TO_CHAR``; the printed area is ``BOARD_SIZE`` x ``BOARD_SIZE``.
    """
    columns = range(1, BOARD_SIZE + 1)
    for row in reversed(range(1, BOARD_SIZE + 1)):
        rendered = ''.join(
            STONE_TO_CHAR[board.get(Point(row=row, col=col))] for col in columns
        )
        print('%2d %s' % (row, rendered))
    print('   ' + COLS[:BOARD_SIZE])


class GameRecord(namedtuple('GameRecord', ['moves', 'winner', 'margin'])):
    """Summary of one finished game with the fields ``moves``, ``winner`` and ``margin``."""

def name(player):
    """Return 'B' when ``player`` equals ``Player.black``, otherwise 'W'."""
    return 'B' if player == Player.black else 'W'

### 模拟对局

In [23]:
def simulate_game(black_player, white_player):
    """Play one game between two agents and return its record.

    Args:
        black_player: object whose ``select_move(game_state)`` chooses Black's moves.
        white_player: object whose ``select_move(game_state)`` chooses White's moves.

    Returns:
        GameRecord holding the moves played in order, plus the winner and the
        winning margin reported by ``scoring.compute_game_result``.
    """
    history = []
    state = GameState.new_game(BOARD_SIZE)
    player_for = {Player.black: black_player, Player.white: white_player}

    # Alternate moves until the game state reports that the game is over.
    while not state.is_over():
        chosen = player_for[state.next_player].select_move(state)
        history.append(chosen)
        state = state.apply_move(chosen)

    # Show the final position and its scored result.
    print_board(state.board)
    result = scoring.compute_game_result(state)
    print(result)

    # TODO: clarify what the `moves` and `margin` fields of the namedtuple are used for.
    return GameRecord(moves=history, winner=result.winner, margin=result.winning_margin)

### main loop of self_play

In [24]:
def main_loop():
    """Run self-play games between two policy agents and save the experience.

    Reads its settings from ``args``: the agent checkpoint path, the output
    path for the experience, and the number of games. Two agents are built
    around the same ``cnn_small`` model, loaded from the checkpoint if it exists.
    agent1 plays Black and agent2 plays White in every game. After each game
    the winner's collector is completed with reward +1 and the loser's with -1.
    The two collectors are then combined and serialized to the output path.
    """
    agent_filename = args.learning_agent
    experience_filename = args.experience_out
    num_games = args.num_games

    # Build the agents.
    encoder_name = 'sevenplane'
    model = cnn_small(input_channel_num=7, board_size=BOARD_SIZE)

    if not os.path.exists(agent_filename):
        # No checkpoint yet: build both agents around the new model.
        encoder = get_encoder_by_name(name=encoder_name, board_size=BOARD_SIZE)
        agent1 = agent.load_policy_agent(model=model, encoder=encoder, device=device)
        agent2 = agent.load_policy_agent(model=model, encoder=encoder, device=device)
    else:
        # NOTE(review): both agents receive the same `model` object. The original
        # author assumed this is fine because the state is re-initialised from
        # the checkpoint — confirm against agent.load_policy_agent.
        agent1 = agent.load_policy_agent(model=model, save_path=agent_filename, device=device)
        agent2 = agent.load_policy_agent(model=model, save_path=agent_filename, device=device)

    assert (agent1 is not None) and (agent2 is not None)

    # One collector per agent, so each side's experience gets its own reward.
    collector1 = rl.ExperienceCollector()
    collector2 = rl.ExperienceCollector()
    agent1.set_collector(collector1)
    agent2.set_collector(collector2)

    t1 = time.time()

    for i in range(num_games):
        print('Simulating game %d/%d...' % (i + 1, num_games))
        # Recording is started on the collectors, not on the agents.
        collector1.begin_episode()
        collector2.begin_episode()

        # Policy gradient only uses the `winner` field of the record.
        game_record = simulate_game(agent1, agent2)
        if game_record.winner == Player.black:
            collector1.complete_episode(reward=1)
            collector2.complete_episode(reward=-1)
        else:
            collector2.complete_episode(reward=1)
            collector1.complete_episode(reward=-1)

    print("simulatinon of {} games , cost_time:{:.3f}s.".format(num_games, time.time()-t1))

    # Merge the training data from both sides and write it to disk.
    experience = rl.combine_experience([collector1, collector2])
    experience.serialize(experience_filename)

    print("collect {} samples".format(len(experience)))
    
# Run the self-play collection; each game's final board and result are printed below.
main_loop()

Simulating game 1/100...
 9 .o.o.oo.o
 8 ooooooooo
 7 ooooo.o.o
 6 oo.oooooo
 5 oooooo.oo
 4 .ooooooo.
 3 oo.ooo.oo
 2 ooo.ooooo
 1 .oooooooo
   ABCDEFGHJ
W+88.5
Simulating game 2/100...
 9 xxxx.xxx.
 8 xxxxxxxxx
 7 .xxooxxxx
 6 xxoooxxx.
 5 xoo.oxxxx
 4 o.ooxxxxx
 3 oooxxxx.x
 2 oooxxxxxx
 1 xxxxx.xx.
   ABCDEFGHJ
B+35.5
Simulating game 3/100...
 9 xxxxooo.o
 8 xx.xxoooo
 7 xxx.xoooo
 6 xxxxxoo.o
 5 x.xxxoooo
 4 xxxxxoooo
 3 xxxxxooo.
 2 .x.xxo.oo
 1 xxxxooooo
   ABCDEFGHJ
W+2.5
Simulating game 4/100...
 9 xxxxoo.o.
 8 xx.xooooo
 7 .xxxxxxxo
 6 xxxxxxxxx
 5 x.x.xx.xx
 4 xxxxxxxxx
 3 xxxxxxx.x
 2 xxxxxxxxx
 1 x.xx.x.x.
   ABCDEFGHJ
B+51.5
Simulating game 5/100...
 9 xxxxx.xx.
 8 xx.xxxxxx
 7 xxxxxx.xx
 6 .xxxxxxx.
 5 xx.xxx.xx
 4 x.xxxxxx.
 3 xxxxxxxxx
 2 xxxx.xxxx
 1 .x.xxxxx.
   ABCDEFGHJ
B+73.5
Simulating game 6/100...
 9 ooooooox.
 8 oo.ooooxx
 7 ooooo.oxx
 6 o.oooooxx
 5 ooxooooox
 4 oxxoxxxxx
 3 xxxxxx.xx
 2 xxxxxxxxx
 1 x.xx.xxx.
   ABCDEFGHJ
W+0.5
Simulating game 7/100...
 9 oo

## playground

In [25]:
# Playground: check how GPU tensors and models behave when serialized and deserialized.
import os
import torch
import numpy as np

# Use the GPU when one is available, otherwise the CPU.
# (To force the CPU, assign torch.device('cpu') instead.)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device:", device)

device: cuda


In [26]:
# Save to disk: put a small model, its optimizer and a few payloads of
# different types into one checkpoint file.
model = torch.nn.Linear(12, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

x = torch.randn((1, 3, 4), dtype=torch.float32).to(device)
y = model(torch.nn.Flatten(start_dim=1)(x))
print(y)

# torch.LongTensor(1) allocates one UNINITIALISED element (the argument is a
# size, not a value), so arbitrary memory would be saved. Build an explicit
# int64 tensor of the same shape instead.
y1 = torch.tensor([1], dtype=torch.long)
y2 = 2233
y3 = np.array([1, 2, 3])

print(y3, type(y3))

checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': 100,  # current training epoch
    'x': x,
    'y1': y1,
    'y2': y2,
    'y3': y3,
}

pth = "data/checkpoint/test_pth.pth"
# Derive the directory from `pth` so the two cannot drift apart; exist_ok=True
# keeps the cell re-runnable without a separate existence check.
os.makedirs(os.path.dirname(pth), exist_ok=True)
torch.save(checkpoint, pth)

tensor([[-0.4894]], device='cuda:0', grad_fn=<AddmmBackward0>)
[1 2 3] <class 'numpy.ndarray'>


In [27]:
# Load from disk.

# Create the model instance (its structure must match the one that was saved).
model = torch.nn.Linear(12, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
# NOTE(review): newer PyTorch releases default torch.load to weights_only=True,
# which may reject the NumPy array stored under 'y3' — verify against the
# installed version.
checkpoint = torch.load(pth, map_location=device)

x = checkpoint['x']
print(x)

# Output of the freshly initialised model, before the saved weights are restored.
y_new = model(torch.nn.Flatten(start_dim=1)(x))
print(y_new)

# Load the saved weights; the output should now match the value printed at save time.
model.load_state_dict(checkpoint['model_state_dict'])
y_old = model(torch.nn.Flatten(start_dim=1)(x))
print(y_old)

epoch = checkpoint['epoch']
print(checkpoint['epoch'])

y3 = checkpoint['y3']
print(y3, type(y3))

# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# model.train()  # or model.eval()
# Result: CUDA tensors can be stored directly, and NumPy arrays serialize
# without any problem as well.

tensor([[[ 0.5703,  1.0359,  0.4338,  0.7824],
         [-0.5979,  0.6990, -1.4519,  1.9334],
         [-2.0659, -0.0798, -1.8046,  0.2827]]], device='cuda:0')
tensor([[-0.3260]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-0.4894]], device='cuda:0', grad_fn=<AddmmBackward0>)
100
[1 2 3] <class 'numpy.ndarray'>
