In [2]:
import torch as tc
from torch import nn
from torch.nn import functional as F
import time

In [3]:
device = tc.device('cuda:2')

In [26]:
class LargefcNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, device, dropout=0.2):
        super(LargefcNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.end = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.dropout(self.relu(self.fc1(x)))
        for i in range(100):
            x = self.dropout(self.relu(self.fc2(x)))
        x = self.end(x)
        return x


model1 = LargefcNet(100, 3000, 100, tc.device(device),
                    dropout=0.4).to(tc.device(device))
model2 = LargefcNet(100, 3000, 100, tc.device(device),
                    dropout=0.4).to(tc.device(device))
model3 = LargefcNet(100, 3000, 100, tc.device(device),
                    dropout=0.4).to(tc.device(device))

input1 = tc.randn(100, 100).to(tc.device(device))
input2 = tc.randn(100, 100).to(tc.device(device))
input3 = tc.randn(100, 100).to(tc.device(device))


In [28]:
# this one is for gpu loading
[model1, model2, model3] = [model1.to(tc.device(device)), model2.to(tc.device(device)), model3.to(tc.device(device))]
[input1, input2, input3] = [input1.to(tc.device(device)), input2.to(tc.device(device)), input3.to(tc.device(device))]
output1 = model1(input1)
output2 = model2(input2)
output3 = model3(input3)

start_time = time.time()
for i in range(10):
    output1 = model1(input1)
print(f'output1: {time.time() - start_time}')

start_time = time.time()
for i in range(10):
    output2 = model2(input2)
print(f'output2: {time.time() - start_time}')

start_time = time.time()
for i in range(10):
    output3 = model3(input3)
print(f'output3: {time.time() - start_time}')

#method 1
start_time = time.time()
for i in range(10):
    output1 = model1(input1)
    output2 = model2(input2)
    output3 = model3(input3)
print(f'for loop output1, output2, output3: {time.time() - start_time}')

#method 2
[model1, model2, model3] = [model1.to(tc.device('cuda:0')), model2.to(tc.device('cuda:1')), model3.to(tc.device('cuda:2'))]
[input1, input2, input3] = [input1.to(tc.device('cuda:0')), input2.to(
    tc.device('cuda:1')), input3.to(tc.device('cuda:2'))]
start_time = time.time()
for i in range(10):
    output1 = model1(input1)
    output2 = model2(input2)
    output3 = model3(input3)
print(f'for loop output1, output2, output3 sending another device: {time.time() - start_time}')

[model1, model2, model3] = [model1.to(tc.device(device)), model2.to(tc.device(device)), model3.to(tc.device(device))]
[input1, input2, input3] = [input1.to(tc.device(device)), input2.to(tc.device(device)), input3.to(tc.device(device))]
#method 3
start_time = time.time()
for i in range(10):
    outputs = [model(input) for model, input in zip(
        [model1, model2, model3], [input1, input2, input3])]
print(f'list together outputs: {time.time() - start_time}')

# parallel method 1 it will run model1(input1) on gpu 0, model2(input2) on gpu 1, model3(input3) on gpu 2 and all calculation will be start and end with simultaneous
start_time = time.time()
for i in range(10):
    outputs = tc.nn.parallel.parallel_apply(
        [model1, model2, model3], [input1, input2, input3])
print(f'parallel_apply: {time.time() - start_time}')


output1: 0.35519957542419434
output2: 0.36833763122558594
output3: 0.3359034061431885
for loop output1, output2, output3: 1.0291898250579834
for loop output1, output2, output3 sending another device: 0.4717831611633301
list together outputs: 1.2263202667236328
parallel_apply: 1.0801422595977783


In [6]:


# Generate some sample data
x = tc.randn((30002, 9000), requires_grad=True).to(device)
y1 = tc.randn((30002, 9000), requires_grad=True).to(device)
y2 = tc.randn((30002, 9000), requires_grad=True).to(device)

# Calculate the losses separately
start_time = time.time()
loss1 = tc.sum((x - y1) ** 2)
loss2 = tc.sum((x - y2) ** 2)
loss1.backward()
loss2.backward()
end_time = time.time()
print(f"Time for separate backward: {end_time - start_time}")

x.detach()
y1.detach()
y2.detach()

# Calculate the losses separately
start_time = time.time()
loss1 = tc.sum((x - y1) ** 2)
loss2 = tc.sum((x - y2) ** 2)
loss_sum = loss1 + loss2
loss_sum.backward()
end_time = time.time()
print(f"Time for summing and backwarding: {end_time - start_time}")

x.detach()
y1.detach()
y2.detach()


# Calculate the losses separately
start_time = time.time()
loss1 = tc.sum((x - y1) ** 2)
loss2 = tc.sum((x - y2) ** 2)
tc.autograd.backward([loss1, loss2])
end_time = time.time()
print(f"Time for tc.autograd.backward: {end_time - start_time}")



Time for separate backward: 3.1925063133239746
Time for summing and backwarding: 2.6383376121520996
Time for tc.autograd.backward: 2.6009981632232666
