
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lstm
qingqing01 committed Oct 19, 2017
2 parents 17e3373 + 63ffe52 commit 694bc64
Showing 20 changed files with 959 additions and 190 deletions.
Binary file modified (not shown): doc/design/cluster_train/src/trainer.graffle
316 changes: 221 additions & 95 deletions doc/howto/usage/cluster/cluster_train_cn.md

Large diffs are not rendered by default.

327 changes: 232 additions & 95 deletions doc/howto/usage/cluster/cluster_train_en.md

Large diffs are not rendered by default.

Binary file added doc/howto/usage/cluster/src/trainer.png
Binary file added doc/howto/usage/cluster/src/trainer_cn.png
100 changes: 100 additions & 0 deletions doc/howto/usage/cluster/src/word2vec/api_train_v2.py
@@ -0,0 +1,100 @@
import gzip
import math

import paddle.v2 as paddle

embsize = 32
hiddensize = 256
N = 5  # 5-grams: four context words predict the fifth


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def main():
    # for local training; set cluster_train = True for distributed training
    cluster_train = False

    if not cluster_train:
        paddle.init(use_gpu=False, trainer_count=1)
    else:
        paddle.init(
            use_gpu=False,
            trainer_count=2,
            port=7164,
            ports_num=1,
            ports_num_for_sparse=1,
            num_gradient_servers=1)
    word_dict = paddle.dataset.imikolov.build_dict()
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                # checkpoint parameters and report test metrics every
                # 100 batches
                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
                               'w') as f:
                    trainer.save_parameter_to_tar(f)
                result = trainer.test(
                    paddle.batch(
                        paddle.dataset.imikolov.test(word_dict, N), 32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)

    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
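
An aside, not part of the commit: the event handler above snapshots parameters every 100 batches as gzipped tar archives. A minimal sketch of restoring such a snapshot, assuming the v2 helper paddle.parameters.Parameters.from_tar and a batch-100.tar.gz left by an earlier run:

import gzip

import paddle.v2 as paddle

# Restore a checkpoint written by trainer.save_parameter_to_tar above;
# the archive name is assumed for illustration.
with gzip.open("batch-100.tar.gz") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)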
123 changes: 123 additions & 0 deletions doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
@@ -0,0 +1,123 @@
import math
import os
import paddle.v2 as paddle
import pickle

embsize = 32
hiddensize = 256
N = 5
cluster_train_file = "./train_data_dir/train/train.txt"
cluster_test_file = "./test_data_dir/test/test.txt"
node_id = os.getenv("OMPI_COMM_WORLD_RANK")
if not node_id:
    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")


def wordemb(inlayer):
    wordemb = paddle.layer.embedding(
        input=inlayer,
        size=embsize,
        param_attr=paddle.attr.Param(
            name="_proj",
            initial_std=0.001,
            learning_rate=1,
            l2_rate=0,
            sparse_update=True))
    return wordemb


def cluster_reader_cluster(filename, node_id):
    def cluster_reader():
        # each trainer reads only its own shard, e.g. train.txt-00002
        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
            for l in f:
                csv_data = [int(cell) for cell in l.split(",")]
                yield tuple(csv_data)

    return cluster_reader


def main():
    # get arguments from the environment
    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
    # normalize the env string to a bool; the raw string "False" would
    # otherwise be truthy
    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") in TRUTH

    if not cluster_train:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
    else:
        paddle.init(
            use_gpu=use_gpu,
            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
            ports_num_for_sparse=int(
                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
            num_gradient_servers=int(
                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
    with open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") as fn:
        word_dict = pickle.load(fn)
    dict_size = len(word_dict)
    firstword = paddle.layer.data(
        name="firstw", type=paddle.data_type.integer_value(dict_size))
    secondword = paddle.layer.data(
        name="secondw", type=paddle.data_type.integer_value(dict_size))
    thirdword = paddle.layer.data(
        name="thirdw", type=paddle.data_type.integer_value(dict_size))
    fourthword = paddle.layer.data(
        name="fourthw", type=paddle.data_type.integer_value(dict_size))
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(dict_size))

    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)

    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
    hidden1 = paddle.layer.fc(input=contextemb,
                              size=hiddensize,
                              act=paddle.activation.Sigmoid(),
                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
                              bias_attr=paddle.attr.Param(learning_rate=2),
                              param_attr=paddle.attr.Param(
                                  initial_std=1. / math.sqrt(embsize * 8),
                                  learning_rate=1))
    predictword = paddle.layer.fc(input=hidden1,
                                  size=dict_size,
                                  bias_attr=paddle.attr.Param(learning_rate=2),
                                  act=paddle.activation.Softmax())

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                result = trainer.test(
                    paddle.batch(
                        cluster_reader_cluster(cluster_test_file, node_id),
                        32))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)

    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
    parameters = paddle.parameters.create(cost)
    adagrad = paddle.optimizer.AdaGrad(
        learning_rate=3e-3,
        regularization=paddle.optimizer.L2Regularization(8e-4))
    trainer = paddle.trainer.SGD(cost,
                                 parameters,
                                 adagrad,
                                 is_local=not cluster_train)
    trainer.train(
        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
        num_passes=30,
        event_handler=event_handler)


if __name__ == '__main__':
    main()
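
An illustration, not from the commit: each trainer reads only the shard matching its MPI rank, and cluster_reader_cluster builds the path by appending a zero-padded rank suffix. With an assumed rank of 2:

# Naming contract between cluster_reader_cluster and the shard files
# produced by prepare.py below; the rank value here is made up.
filename = "./train_data_dir/train/train.txt"
node_id = "2"  # OMPI_COMM_WORLD_RANK for this trainer (assumed)
shard = "-".join([filename, "%05d" % int(node_id)])
print(shard)  # ./train_data_dir/train/train.txt-00002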
41 changes: 41 additions & 0 deletions doc/howto/usage/cluster/src/word2vec/prepare.py
@@ -0,0 +1,41 @@
import os
import pickle

import paddle.v2 as paddle

SPLIT_COUNT = 3
N = 5


def file_len(fd):
    # count lines by exhausting the file object
    i = -1
    for i, l in enumerate(fd):
        pass
    return i + 1


def split_from_reader_by_line(filename, reader, split_count):
    # dump every sample as one line of comma-separated word ids
    with open(filename, "w") as fn:
        for batch_id, batch_data in enumerate(reader()):
            batch_data_str = [str(d) for d in batch_data]
            fn.write(",".join(batch_data_str))
            fn.write("\n")

    with open(filename, "r") as fn:
        total_line_count = file_len(fn)
    per_file_lines = total_line_count / split_count + 1
    # produce filename-00000, filename-00001, ...: one shard per trainer
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)


word_dict = paddle.dataset.imikolov.build_dict()
with open("word_dict.pickle", "w") as dict_f:
    pickle.dump(word_dict, dict_f)

split_from_reader_by_line("train.txt",
                          paddle.dataset.imikolov.train(word_dict, N),
                          SPLIT_COUNT)
split_from_reader_by_line("test.txt",
                          paddle.dataset.imikolov.test(word_dict, N),
                          SPLIT_COUNT)
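
An illustration, not from the commit: split_from_reader_by_line serializes each 5-gram as one line of comma-separated word ids, which is exactly the format cluster_reader_cluster parses back. With made-up ids:

# Round-trip of the per-line format shared by prepare.py and the
# cluster reader; the word ids are invented for the example.
N = 5
line = "12,7,303,45,2"
sample = tuple(int(cell) for cell in line.split(","))
assert len(sample) == N  # four context words plus the predicted word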
4 changes: 4 additions & 0 deletions paddle/parameter/FirstOrderOptimizer.h
@@ -265,6 +265,10 @@ class AdamParameterOptimizer : public ParameterOptimizer {
    addParameterType(PARAMETER_SECOND_MOMENTUM);
  }

  // Refresh the learning rate from the global schedule at each batch start.
  virtual void startBatch(int64_t numSamplesProcessed) {
    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
  }

  virtual void finishBatch() { ++step_; }

  virtual void update(const VectorPtr vecs[],
39 changes: 39 additions & 0 deletions paddle/scripts/cluster_train_v2/fabric/conf.py
@@ -0,0 +1,39 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

HOSTS = [
    "root@10.1.9.7",
    "root@10.1.18.7",
    "root@10.1.32.9",
]
'''
workspace configuration
'''
# root dir for the workspace; can be set to any directory owned by a real
# user account
ROOT_DIR = "/root"
'''
network configuration
'''
# pserver NICs
PADDLE_NIC = "eth0"
# pserver port
PADDLE_PORT = 7164
# number of pserver ports
PADDLE_PORTS_NUM = 1
# number of pserver ports for sparse updates
PADDLE_PORTS_NUM_FOR_SPARSE = 1
# whether the trainer uses the GPU
PADDLE_USE_GPU = "False"
# environment settings for all processes in the cluster job
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
11 changes: 11 additions & 0 deletions paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
@@ -0,0 +1,11 @@
FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
RUN apt-get update && apt-get install -y openssh-server
RUN mkdir /var/run/sshd

RUN echo 'root:root' |chpasswd

RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config

EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
23 changes: 23 additions & 0 deletions (Kubernetes Deployment manifest; file path not rendered)
@@ -0,0 +1,23 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: ssh-servers
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: ssh-servers
    spec:
      containers:
      - name: ssh-servers
        image: docker.paddlepaddlehub.com/paddlessh
        resources:
          limits:
            cpu: 500m
            memory: 1Gi
          requests:
            cpu: 500m
            memory: 1Gi
        ports:
        - containerPort: 22
14 changes: 14 additions & 0 deletions paddle/scripts/cluster_train_v2/fabric/run.sh
@@ -0,0 +1,14 @@
#!/bin/bash

python paddle.py \
--job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
--dot_period=10 \
--ports_num_for_sparse=1 \
--log_period=50 \
--num_passes=5 \
--trainer_count=2 \
--saving_period=1 \
--local=0 \
--config=./trainer_config.py \
--save_dir=./output \
--use_gpu=0
