## 正式的数据预处理脚本 （for 监督模型的训练）

In [1]:
# Environment setup.
# Switch the kernel's working directory to the project folder; every relative
# path used later in this notebook (e.g. 'data', './python') is resolved
# against it.
# NOTE(review): the saved output of this cell shows a different directory than
# the %cd target below — confirm the path before re-running.
%cd /playground/sgd_deep_learning/sgd_rl/go
import sys
# Make modules under ./python importable (presumably where the `dlgo`
# package lives — TODO confirm).
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_rl


In [2]:
!rm test_samples.py
!rm data/oneplane/*.npy
!rm data/sevenplane/*.npy


In [3]:
from dlgo.data import  GoDataProcessorFast, GoDataset

# Notebook-wide configuration, read by the preprocessing cells below.
encoder = 'sevenplane'   # encoder name passed to GoDataProcessorFast
train_samples = 2000     # passed as num_samples when loading the 'train' split
root_dir = 'data'        # passed as data_directory to GoDataProcessorFast
# test_samples = 100 # default value, hard-coded in sampling.py

  from .autonotebook import tqdm as notebook_tqdm


### Step1：并发模拟对局数据生成
* 生成 npy 文件，每个文件的 chunksize=1024
* 存储在与 encoder 同名的目录下

In [4]:
# Pre-process the training set.
# Builds a processor for the configured encoder and data directory, then loads
# the 'train' split with `train_samples` samples. Judging by the recorded
# output, this reads a cached index page of KGS game archives — the rest of
# the behaviour lives in dlgo.data and is not visible here.
go_data_process = GoDataProcessorFast(encoder=encoder, data_directory=root_dir)
go_data_process.load_go_data(data_type='train', num_samples=train_samples)

>>> Reading cached index page
KGS-2019_04-19-1255-.tar.gz 1255
KGS-2019_03-19-1478-.tar.gz 1478
KGS-2019_02-19-1412-.tar.gz 1412
KGS-2019_01-19-2095-.tar.gz 2095
KGS-2018_12-19-1992-.tar.gz 1992
KGS-2018_11-19-1879-.tar.gz 1879
KGS-2018_10-19-1209-.tar.gz 1209
KGS-2018_09-19-1587-.tar.gz 1587
KGS-2018_08-19-1447-.tar.gz 1447
KGS-2018_07-19-949-.tar.gz 949
KGS-2018_06-19-1002-.tar.gz 1002
KGS-2018_05-19-1590-.tar.gz 1590
KGS-2018_04-19-1612-.tar.gz 1612
KGS-2018_03-19-833-.tar.gz 833
KGS-2018_02-19-1167-.tar.gz 1167
KGS-2018_01-19-1526-.tar.gz 1526
KGS-2017_12-19-1488-.tar.gz 1488
KGS-2017_11-19-945-.tar.gz 945
KGS-2017_10-19-1351-.tar.gz 1351
KGS-2017_09-19-1353-.tar.gz 1353
KGS-2017_08-19-2205-.tar.gz 2205
KGS-2017_07-19-1191-.tar.gz 1191
KGS-2017_06-19-910-.tar.gz 910
KGS-2017_05-19-847-.tar.gz 847
KGS-2017_04-19-913-.tar.gz 913
KGS-2017_03-19-717-.tar.gz 717
KGS-2017_02-19-525-.tar.gz 525
KGS-2017_01-19-733-.tar.gz 733
KGS-2016_12-19-1208-.tar.gz 1208
KGS-2016_11-19-980-.tar.gz 980


In [5]:
# Pre-process the test set.
# Same processor configuration as for the training set; only the split differs.
go_data_process = GoDataProcessorFast(encoder=encoder, data_directory=root_dir)
go_data_process.load_go_data(data_type='test') # num_samples defaults to 100

>>> Reading cached index page
KGS-2019_04-19-1255-.tar.gz 1255
KGS-2019_03-19-1478-.tar.gz 1478
KGS-2019_02-19-1412-.tar.gz 1412
KGS-2019_01-19-2095-.tar.gz 2095
KGS-2018_12-19-1992-.tar.gz 1992
KGS-2018_11-19-1879-.tar.gz 1879
KGS-2018_10-19-1209-.tar.gz 1209
KGS-2018_09-19-1587-.tar.gz 1587
KGS-2018_08-19-1447-.tar.gz 1447
KGS-2018_07-19-949-.tar.gz 949
KGS-2018_06-19-1002-.tar.gz 1002
KGS-2018_05-19-1590-.tar.gz 1590
KGS-2018_04-19-1612-.tar.gz 1612
KGS-2018_03-19-833-.tar.gz 833
KGS-2018_02-19-1167-.tar.gz 1167
KGS-2018_01-19-1526-.tar.gz 1526
KGS-2017_12-19-1488-.tar.gz 1488
KGS-2017_11-19-945-.tar.gz 945
KGS-2017_10-19-1351-.tar.gz 1351
KGS-2017_09-19-1353-.tar.gz 1353
KGS-2017_08-19-2205-.tar.gz 2205
KGS-2017_07-19-1191-.tar.gz 1191
KGS-2017_06-19-910-.tar.gz 910
KGS-2017_05-19-847-.tar.gz 847
KGS-2017_04-19-913-.tar.gz 913
KGS-2017_03-19-717-.tar.gz 717
KGS-2017_02-19-525-.tar.gz 525
KGS-2017_01-19-733-.tar.gz 733
KGS-2016_12-19-1208-.tar.gz 1208
KGS-2016_11-19-980-.tar.gz 980


### Step2：Dataset 加载中间结果，使用 DataLoader、transform 直接作为训练输入接口

In [8]:
from torch.utils.data import DataLoader

# Directory holding the pre-processed .npy chunks. It is derived from the
# `root_dir` and `encoder` settings of the configuration cell, so switching the
# encoder no longer requires editing this path by hand. With the current
# settings it still evaluates to 'data/sevenplane/'.
preprocess_dir = '{}/{}/'.format(root_dir, encoder)
num_workers = 0  # 0 = batches are loaded in the main process

# Datasets over the intermediate results produced by the preprocessing step.
train_dataset = GoDataset(preprocess_dir, datatype='train')
test_dataset = GoDataset(preprocess_dir, datatype='test')

# DataLoaders are the direct input interface for training.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=num_workers)

# Notes on shuffle=False:
# - Without shuffling there are few problems: only a few files have to be
#   cached at a time (already implemented).
# - However, it biases training towards positions from later in the games and
#   the learning rate drops, so use the Adagrad / Adadelta optimizers.
# - TODO: compare shuffle vs. no shuffle, and check how things behave once the
#   caching design is added.
# - Reshuffling the data at every epoch reduces model overfitting.

In [9]:
# Sanity check: look at a single sample taken straight from the dataset.
# In the recorded run x is a float32 tensor of size [7, 19, 19] and y is a
# scalar int64 tensor.
x, y = train_dataset[0]
print(type(x), x.shape, x.dtype)
print(type(y), y.shape, y, y.dtype)

# Number of samples in each split.
print("train_data:[{}] test_data:[{}].".format(len(train_dataset), len(test_dataset)))

# Pull only the first batch from the loader to check the batched shapes.
# Note: this rebinds the notebook-level x and y to the batch tensors.
for x,y in train_dataloader:
    print(x.shape, y.shape) # shape of the data in each batch
    break

<class 'torch.Tensor'> torch.Size([7, 19, 19]) torch.float32
<class 'torch.Tensor'> torch.Size([]) tensor(263) torch.int64
train_data:[388096] test_data:[12288].
torch.Size([64, 7, 19, 19]) torch.Size([64])


### playground 

In [None]:
import glob

# Playground: list the training feature chunks found under data/oneplane.
# The pattern is the directory and the file glob joined with '/'.
base = '/'.join(['data/oneplane', '*train_features_*.npy'])
# glob.glob already yields every matching path; materialise them in one step.
res = list(glob.glob(base))

print(len(res), res)