In [1]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
#Hyberparameters
learning_rate = 0.0005
gamma = 0.98
buffer_limit = 50000
batch_size = 32

In [2]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

In [3]:
class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,1)
        else : 
            return out.argmax().item()

In [4]:
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)

        q_out = q(s)
        q_a = q_out.gather(1,a)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [5]:
def main():
    env = gym.make('CartPole-v1')
    q = Qnet()
    q_target = Qnet()
    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer()

    print_interval = 20
    score = 0.0  
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
        s = env.reset()
        done = False

        while not done:
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0
            memory.put((s,a,r/100.0,s_prime, done_mask))
            s = s_prime

            score += r
            if done:
                break
            
        if memory.size()>2000:
            train(q, q_target, memory, optimizer)

        if n_epi%print_interval==0 and n_epi!=0:
            q_target.load_state_dict(q.state_dict())
            print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                                                            n_epi, score/print_interval, memory.size(), epsilon*100))
            score = 0.0
    env.close()

if __name__ == '__main__':
    main()

n_episode :20, score : 10.6, n_buffer : 212, eps : 7.9%
n_episode :40, score : 9.4, n_buffer : 400, eps : 7.8%
n_episode :60, score : 9.7, n_buffer : 594, eps : 7.7%
n_episode :80, score : 9.4, n_buffer : 783, eps : 7.6%
n_episode :100, score : 9.4, n_buffer : 971, eps : 7.5%
n_episode :120, score : 9.6, n_buffer : 1162, eps : 7.4%
n_episode :140, score : 9.6, n_buffer : 1354, eps : 7.3%
n_episode :160, score : 10.1, n_buffer : 1555, eps : 7.2%
n_episode :180, score : 9.1, n_buffer : 1737, eps : 7.1%
n_episode :200, score : 9.7, n_buffer : 1931, eps : 7.0%
n_episode :220, score : 9.6, n_buffer : 2123, eps : 6.9%
n_episode :240, score : 17.9, n_buffer : 2481, eps : 6.8%
n_episode :260, score : 13.3, n_buffer : 2748, eps : 6.7%
n_episode :280, score : 14.7, n_buffer : 3041, eps : 6.6%
n_episode :300, score : 12.1, n_buffer : 3282, eps : 6.5%
n_episode :320, score : 12.2, n_buffer : 3525, eps : 6.4%
n_episode :340, score : 19.6, n_buffer : 3916, eps : 6.3%
n_episode :360, score : 73.3, n_

n_episode :2760, score : 285.9, n_buffer : 50000, eps : 1.0%
n_episode :2780, score : 327.0, n_buffer : 50000, eps : 1.0%
n_episode :2800, score : 302.9, n_buffer : 50000, eps : 1.0%
n_episode :2820, score : 349.6, n_buffer : 50000, eps : 1.0%
n_episode :2840, score : 198.3, n_buffer : 50000, eps : 1.0%
n_episode :2860, score : 127.7, n_buffer : 50000, eps : 1.0%
n_episode :2880, score : 157.0, n_buffer : 50000, eps : 1.0%
n_episode :2900, score : 183.6, n_buffer : 50000, eps : 1.0%
n_episode :2920, score : 286.6, n_buffer : 50000, eps : 1.0%
n_episode :2940, score : 374.5, n_buffer : 50000, eps : 1.0%
n_episode :2960, score : 369.2, n_buffer : 50000, eps : 1.0%
n_episode :2980, score : 312.9, n_buffer : 50000, eps : 1.0%
n_episode :3000, score : 171.2, n_buffer : 50000, eps : 1.0%
n_episode :3020, score : 251.7, n_buffer : 50000, eps : 1.0%
n_episode :3040, score : 271.6, n_buffer : 50000, eps : 1.0%
n_episode :3060, score : 286.4, n_buffer : 50000, eps : 1.0%
n_episode :3080, score :

n_episode :5460, score : 372.6, n_buffer : 50000, eps : 1.0%
n_episode :5480, score : 444.2, n_buffer : 50000, eps : 1.0%
n_episode :5500, score : 454.8, n_buffer : 50000, eps : 1.0%
n_episode :5520, score : 474.9, n_buffer : 50000, eps : 1.0%
n_episode :5540, score : 421.9, n_buffer : 50000, eps : 1.0%
n_episode :5560, score : 348.5, n_buffer : 50000, eps : 1.0%
n_episode :5580, score : 272.6, n_buffer : 50000, eps : 1.0%
n_episode :5600, score : 410.4, n_buffer : 50000, eps : 1.0%
n_episode :5620, score : 393.1, n_buffer : 50000, eps : 1.0%
n_episode :5640, score : 432.8, n_buffer : 50000, eps : 1.0%
n_episode :5660, score : 429.1, n_buffer : 50000, eps : 1.0%
n_episode :5680, score : 447.9, n_buffer : 50000, eps : 1.0%
n_episode :5700, score : 392.4, n_buffer : 50000, eps : 1.0%
n_episode :5720, score : 446.8, n_buffer : 50000, eps : 1.0%
n_episode :5740, score : 457.8, n_buffer : 50000, eps : 1.0%
n_episode :5760, score : 368.2, n_buffer : 50000, eps : 1.0%
n_episode :5780, score :

n_episode :8160, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8180, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8200, score : 486.1, n_buffer : 50000, eps : 1.0%
n_episode :8220, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8240, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8260, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8280, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8300, score : 499.5, n_buffer : 50000, eps : 1.0%
n_episode :8320, score : 488.4, n_buffer : 50000, eps : 1.0%
n_episode :8340, score : 482.9, n_buffer : 50000, eps : 1.0%
n_episode :8360, score : 461.2, n_buffer : 50000, eps : 1.0%
n_episode :8380, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8400, score : 500.0, n_buffer : 50000, eps : 1.0%
n_episode :8420, score : 484.1, n_buffer : 50000, eps : 1.0%
n_episode :8440, score : 471.1, n_buffer : 50000, eps : 1.0%
n_episode :8460, score : 493.6, n_buffer : 50000, eps : 1.0%
n_episode :8480, score :

In [6]:
line=3

In [8]:
a = int(input())

line = []

for i in range(a):
    a = int(input())
    b = int(input())
    line.append([a,b])

8
0
3
1
4
2
5
4
7
5
8
6
9
7
10
8
11


In [41]:

i = 0
for l in line:
    l.append(i)
    i = i + 1

In [42]:
line

[[0, 3, 0, 0],
 [1, 4, 0, 1],
 [2, 5, 0, 2],
 [4, 7, 0, 3],
 [5, 8, 0, 4],
 [6, 9, 0, 5],
 [7, 10, 0, 6],
 [8, 11, 0, 7]]

In [43]:
a = line.copy()
a.sort(key=lambda x:-x[1])    

In [44]:
for i in range(len(line)):
    if 
    

[[8, 11, 0, 7],
 [7, 10, 0, 6],
 [6, 9, 0, 5],
 [5, 8, 0, 4],
 [4, 7, 0, 3],
 [2, 5, 0, 2],
 [1, 4, 0, 1],
 [0, 3, 0, 0]]

In [11]:
b = []

for i in a:
    b

[[0, 3], [1, 4], [2, 5], [4, 7], [5, 8], [6, 9], [7, 10], [8, 11]]

In [16]:
max(a[:][1])

4

a[:][1]

In [17]:
a[:][1]

[1, 4]

In [18]:
a

[[0, 3], [1, 4], [2, 5], [4, 7], [5, 8], [6, 9], [7, 10], [8, 11]]

In [19]:
a[0]

[0, 3]

In [23]:
a[:][1]

[1, 4]

In [34]:
b=

In [35]:
a

[[8, 11], [7, 10], [6, 9], [5, 8], [4, 7], [2, 5], [1, 4], [0, 3]]