# With SplitCrossEntropyLoss parameters being updated
CUDA_VISIBLE_DEVICES=3 python -u main.py --epochs 14 --nlayers 4 --emsize 400 --nhid 2500 --alpha 0 --beta 0 --dropoute 0 --dropouth 0.1 --dropouti 0.1 --dropout 0.1 --wdrop 0.5 --wdecay 0 --bptt 140 --batch_size 32 --optimizer adam --lr 1e-3 --data data/wikitext-103 --save WT103.12hr.QRNN.pt --when 12 --log-interval 10

Loading cached dataset...
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
[WeightDrop (
  (module): LSTM(400, 2500)
), WeightDrop (
  (module): LSTM(2500, 2500)
), WeightDrop (
  (module): LSTM(2500, 2500)
), WeightDrop (
  (module): LSTM(2500, 400)
)]
Using [2800, 20000, 76000]
Args: Namespace(alpha=0.0, batch_size=32, beta=0.0, bptt=140, clip=0.25, cuda=True, data='data/wikitext-103', dropout=0.1, dropoute=0.0, dropouth=0.1, dropouti=0.1, emsize=400, epochs=14, log_interval=10, lr=0.001, model='LSTM', nhid=2500, nlayers=4, nonmono=5, optimizer='adam', resume='', save='WT103.12hr.QRNN.pt', seed=1111, tied=True, wdecay=0.0, wdrop=0.5, when=[12])
| epoch   1 |    10/23041 batches | lr 0.00100 | ms/batch 1426.49 | loss 10.69 | ppl 43880.92 | bpc   15.421
| epoch   1 |    20/23041 batches | lr 0.00100 | ms/batch 936.87 | loss  8.09 | ppl  3266.35 | bpc   11.673
| epoch   1 |    30/23041 batches | lr 0.00100 | ms/batch 843.49 | loss  7.62 | ppl  2034.29 | bpc   10.990
| epoch   1 |    40/23041 batches | lr 0.00100 | ms/batch 794.54 | loss  7.63 | ppl  2067.34 | bpc   11.014
| epoch   1 |    50/23041 batches | lr 0.00100 | ms/batch 837.59 | loss  7.63 | ppl  2064.16 | bpc   11.011
| epoch   1 |    60/23041 batches | lr 0.00100 | ms/batch 825.59 | loss  7.60 | ppl  2007.57 | bpc   10.971
| epoch   1 |    70/23041 batches | lr 0.00100 | ms/batch 864.99 | loss  7.65 | ppl  2091.11 | bpc   11.030
| epoch   1 |    80/23041 batches | lr 0.00100 | ms/batch 870.91 | loss  7.59 | ppl  1983.57 | bpc   10.954
| epoch   1 |    90/23041 batches | lr 0.00100 | ms/batch 799.84 | loss  7.56 | ppl  1917.90 | bpc   10.905
| epoch   1 |   100/23041 batches | lr 0.00100 | ms/batch 845.25 | loss  7.60 | ppl  2001.65 | bpc   10.967
| epoch   1 |   110/23041 batches | lr 0.00100 | ms/batch 935.67 | loss  7.54 | ppl  1877.04 | bpc   10.874
| epoch   1 |   120/23041 batches | lr 0.00100 | ms/batch 825.33 | loss  7.54 | ppl  1879.97 | bpc   10.876
| epoch   1 |   130/23041 batches | lr 0.00100 | ms/batch 847.91 | loss  7.56 | ppl  1912.42 | bpc   10.901
| epoch   1 |   140/23041 batches | lr 0.00100 | ms/batch 841.25 | loss  7.53 | ppl  1853.87 | bpc   10.856
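A note on what the header and the early log lines refer to (the run trains an LSTM despite the QRNN save filename; see model='LSTM' in the Args dump). Each "Applying weight drop of 0.5 to weight_hh_l0" line is DropConnect (--wdrop 0.5) being attached to one layer's hidden-to-hidden weights, "Using [2800, 20000, 76000]" gives the vocabulary split points of the split softmax, and "parameters being updated" means the criterion's own trainable tail-cluster weights are handed to the optimizer along with the model's. Below is a minimal sketch of that wiring, assuming the awd-lstm-lm repository's splitcross and weight_drop modules are importable; the bare nn.LSTM stand-in is hypothetical, not the repo's full RNNModel:

```python
import torch
import torch.nn as nn

# These imports assume the awd-lstm-lm repository is on the path.
from splitcross import SplitCrossEntropyLoss
from weight_drop import WeightDrop

emsize, nhid = 400, 2500  # --emsize 400 --nhid 2500 from the run above

# DropConnect on the hidden-to-hidden weights; attaching this is what prints
# "Applying weight drop of 0.5 to weight_hh_l0" once per layer.
rnn = WeightDrop(nn.LSTM(emsize, nhid), ['weight_hh_l0'], dropout=0.5)

# "Using [2800, 20000, 76000]": split points that carve the WikiText-103
# vocabulary into head and tail clusters. The tail clusters add trainable
# vectors that live on the criterion, not on the model.
criterion = SplitCrossEntropyLoss(emsize, splits=[2800, 20000, 76000], verbose=False)

# The header's point: optimize the union of both parameter sets; with
# model.parameters() alone, the criterion's tail weights would never update.
params = list(rnn.parameters()) + list(criterion.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3, weight_decay=0)
```

Omitting criterion.parameters() from params would not stop the model from training (gradients still flow through the criterion's weights into the model), but the split softmax's tail-cluster parameters would stay at their initial values.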