diff --git a/OpenNMT/onmt/Dataset.py b/OpenNMT/onmt/Dataset.py
index 8b357d7ff8..2651ae6458 100644
--- a/OpenNMT/onmt/Dataset.py
+++ b/OpenNMT/onmt/Dataset.py
@@ -14,7 +14,7 @@ def __init__(self, srcData, tgtData, batchSize, cuda):
         self.cuda = cuda
 
         self.batchSize = batchSize
-        self.numBatches = len(self.src) // batchSize
+        self.numBatches = (len(self.src) + batchSize - 1) // batchSize
 
     def _batchify(self, data, align_right=False):
         max_length = max(x.size(0) for x in data)
diff --git a/time_sequence_prediction/README.md b/time_sequence_prediction/README.md
new file mode 100644
index 0000000000..869618a7e8
--- /dev/null
+++ b/time_sequence_prediction/README.md
@@ -0,0 +1,17 @@
+# Time Sequence Prediction with PyTorch
+This is a simple, illustrative example of modeling a time sequence with an LSTM (Long Short-Term Memory) network. Several sine waves are fed into the model, and the model learns to generate the future waves from the given states. The result is shown below.
+![image](https://cloud.githubusercontent.com/assets/1419566/23689065/1d6e9900-03f3-11e7-958b-80066f2e9472.png)
+
+## Usage
+1. Generate the training data
+```
+python generate_sine_wave.py
+```
+
+2. Train the model and predict the future states
+```
+python train.py
+```
+
+## The model
+Stacked LSTM cells are used to learn the patterns of the input signal.
diff --git a/time_sequence_prediction/generate_sine_wave.py b/time_sequence_prediction/generate_sine_wave.py
new file mode 100644
index 0000000000..8a68a57b5d
--- /dev/null
+++ b/time_sequence_prediction/generate_sine_wave.py
@@ -0,0 +1,12 @@
+import math
+import numpy as np
+import cPickle as pickle
+T = 20
+L = 1000
+N = 100
+np.random.seed(2)
+x = np.empty((N, L), 'int64')
+x[:] = np.array(range(L)) + np.random.randint(-4*T, 4*T, N).reshape(N, 1)
+y = np.sin(x / 1.0 / T).astype('float64')
+pickle.dump(y, open('traindata.pkl', 'wb'))
+
diff --git a/time_sequence_prediction/train.py b/time_sequence_prediction/train.py
new file mode 100644
index 0000000000..60036f9914
--- /dev/null
+++ b/time_sequence_prediction/train.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+import cPickle as pickle
+import torch.optim as optim
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+
+class Sequence(nn.Module):
+    def __init__(self):
+        super(Sequence, self).__init__()
+        self.lstm1 = nn.LSTMCell(1, 51)
+        self.lstm2 = nn.LSTMCell(51, 1)
+
+    def forward(self, input, future = 0):
+        outputs = []
+        h_t = Variable(torch.zeros(input.size(0), 51).double(), requires_grad=False)
+        c_t = Variable(torch.zeros(input.size(0), 51).double(), requires_grad=False)
+        h_t2 = Variable(torch.zeros(input.size(0), 1).double(), requires_grad=False)
+        c_t2 = Variable(torch.zeros(input.size(0), 1).double(), requires_grad=False)
+
+        for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):
+            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
+            h_t2, c_t2 = self.lstm2(c_t, (h_t2, c_t2))
+            outputs += [c_t2]
+        for i in range(future):
+            h_t, c_t = self.lstm1(c_t2, (h_t, c_t))
+            h_t2, c_t2 = self.lstm2(c_t, (h_t2, c_t2))
+            outputs += [c_t2]
+        outputs = torch.stack(outputs, 1).squeeze(2)
+        return outputs
+
+
+
+if __name__ == '__main__':
+    # set random seed to 0
+    np.random.seed(0)
+    torch.manual_seed(0)
+    # load data and make training set
+    data = pickle.load(open('traindata.pkl'))
+    input = Variable(torch.from_numpy(data[3:, :-1]), requires_grad=False)
+    target = Variable(torch.from_numpy(data[3:, 1:]), requires_grad=False)
+    # build the model
+    seq = Sequence()
+    seq.double()
+    criterion = nn.MSELoss()
+    # use LBFGS as optimizer since we can load the whole data to train
+    optimizer = optim.LBFGS(seq.parameters())
+    # begin to train
+    for i in range(20):
+        print i
+        def closure():
+            optimizer.zero_grad()
+            out = seq(input)
+            loss = criterion(out, target)
+            print 'loss:', loss.data.numpy()[0]
+            loss.backward()
+            return loss
+        optimizer.step(closure)
+        # begin to predict
+        future = 1000
+        pred = seq(input[:3], future = future)
+        y = pred.data.numpy()
+        # draw the result
+        plt.figure(figsize=(30,10))
+        plt.title('Predict future values for time sequences\n(Dashed lines are predicted values)', fontsize=30)
+        plt.xlabel('x', fontsize=20)
+        plt.ylabel('y', fontsize=20)
+        plt.xticks(fontsize=20)
+        plt.yticks(fontsize=20)
+        def draw(yi, color):
+            plt.plot(np.arange(input.size(1)), yi[:input.size(1)], color, linewidth = 2.0)
+            plt.plot(np.arange(input.size(1), input.size(1) + future), yi[input.size(1):], color + ':', linewidth = 2.0)
+        draw(y[0], 'r')
+        draw(y[1], 'g')
+        draw(y[2], 'b')
+        plt.savefig('predict%d.pdf'%i)
+        plt.close()
+
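Note on the `onmt/Dataset.py` hunk: switching from floor to ceiling division means a trailing partial batch is now counted in `numBatches` instead of being silently dropped. A minimal sketch of the arithmetic (not part of the patch, with hypothetical example numbers):

```
# Sketch only: illustrates the numBatches change, not code from the patch.
def num_batches_old(num_examples, batch_size):
    return num_examples // batch_size                     # floor: drops a trailing partial batch

def num_batches_new(num_examples, batch_size):
    return (num_examples + batch_size - 1) // batch_size  # ceiling: counts it

assert num_batches_old(130, 64) == 2   # the last 2 examples are never served
assert num_batches_new(130, 64) == 3   # final batch holds the remaining 2 examples
```

The practical effect is that the last batch may be smaller than `batchSize` rather than being skipped entirely.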