In [None]:
!pip install -q transformers pillow gTTS torch torchvision

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from gtts import gTTS
import torch, torch.nn as nn, torch.optim as optim
from google.colab import files
from IPython.display import Audio, display
import torchvision.transforms as T
import torchvision
from torchvision.utils import make_grid
import matplotlib.pyplot as plt

# Pick the compute device once: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the BLIP captioning processor and model from the same checkpoint,
# then move the model onto the selected device.
processor = BlipProcessor.from_pretrained(
    'Salesforce/blip-image-captioning-base'
)
model = BlipForConditionalGeneration.from_pretrained(
    'Salesforce/blip-image-captioning-base'
)
model = model.to(device)

def caption_image(pil_img):
    """Return a text caption for ``pil_img`` produced by the BLIP model.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        pil_img: Image handed to ``processor`` via its ``images`` argument
            (the notebook passes an RGB ``PIL.Image``).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    encoded = processor(images=pil_img, return_tensors='pt')
    encoded = encoded.to(device)
    # Cap the caption length at 20 newly generated tokens.
    token_ids = model.generate(**encoded, max_new_tokens=20)
    first_sequence = token_ids[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Ask the user for an image through Colab's upload widget.
uploaded = files.upload()
if not uploaded:
    # An empty mapping (e.g. the dialog was dismissed) would otherwise surface
    # as an opaque IndexError on the next line; fail with a clear message.
    raise RuntimeError('No file was uploaded; re-run this cell and choose an image.')

# Only the first uploaded file is used; any additional uploads are ignored.
name = next(iter(uploaded))
img = Image.open(name).convert('RGB')
display(img)

cap = caption_image(img)
print('Caption:', cap)

# Synthesize the caption as speech and embed a player as the cell's output.
tts = gTTS(cap)
tts.save('caption.mp3')
Audio('caption.mp3')

In [None]:
class G(nn.Module):
    """Fully connected GAN generator: latent vector in, 1x28x28 image out.

    Two linear layers with a ReLU in between; the closing Tanh keeps every
    output value inside [-1, 1].

    Args:
        nz: Size of the last dimension of the latent input to ``forward``.
    """

    def __init__(self, nz=64):
        super().__init__()
        hidden_width = 256
        pixel_count = 28 * 28
        layers = [
            nn.Linear(nz, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, pixel_count),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Map latent codes ``z`` to a batch of images shaped (N, 1, 28, 28)."""
        flat_pixels = self.net(z)
        return flat_pixels.view(-1, 1, 28, 28)

class D(nn.Module):
    """Fully connected GAN discriminator for 28x28 single-channel images.

    Flattens each input, applies a LeakyReLU hidden layer and finishes with
    a Sigmoid, so every image is mapped to one value in [0, 1].
    """

    def __init__(self):
        super().__init__()
        pixel_count = 28 * 28
        hidden_width = 256
        layers = [
            nn.Flatten(),
            nn.Linear(pixel_count, hidden_width),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_width, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score per image in ``x``, shaped (N, 1)."""
        scores = self.net(x)
        return scores

In [None]:
# Preprocessing: convert each image to a tensor, then apply (x - 0.5) / 0.5
# on its single channel.
preprocessing_steps = [
    T.ToTensor(),
    T.Normalize((0.5,), (0.5,)),
]
transform = T.Compose(preprocessing_steps)

# Download MNIST into the working directory and serve it in shuffled
# mini-batches of 128.
mnist = torchvision.datasets.MNIST(
    root='.',
    download=True,
    transform=transform,
)
loader = torch.utils.data.DataLoader(
    mnist,
    batch_size=128,
    shuffle=True,
)

In [None]:
# Build both networks, one Adam optimiser each, and the shared BCE loss.
gen = G().to(device)
disc = D().to(device)
optG = optim.Adam(gen.parameters(), lr=0.0005)
optD = optim.Adam(disc.parameters(), lr=0.0005)
loss_fn = nn.BCELoss()

# Pull a single mini-batch of real images; the class labels are not used.
batch, _ = next(iter(loader))
batch = batch.to(device)
B = batch.size(0)

# Targets for the BCE loss: 1 means "real", 0 means "fake".
real_labels = torch.ones(B, 1).to(device)
fake_labels = torch.zeros(B, 1).to(device)

# Sample latent noise and generate one fake batch of the same size.
z = torch.randn(B, 64).to(device)
fake = gen(z)

# Discriminator step: `detach` keeps this backward pass out of the generator.
real_loss = loss_fn(disc(batch), real_labels)
fake_loss = loss_fn(disc(fake.detach()), fake_labels)
d_loss = real_loss + fake_loss
optD.zero_grad()
d_loss.backward()
optD.step()

# Generator step: score the same fakes again (now with gradients flowing
# into the generator) against the "real" target.
output = disc(fake)
g_loss = loss_fn(output, real_labels)
optG.zero_grad()
g_loss.backward()
optG.step()

In [None]:
# Draw 16 latent vectors and render the generator's outputs as a 4x4 grid.
z = torch.randn(16,64).to(device)
# Sampling is inference only: skip autograd bookkeeping, as the VAE sampling
# cell does, so no graph is built for tensors that are only plotted.
with torch.no_grad():
    samples = gen(z).cpu()
# The generator ends in Tanh, so shift its [-1, 1] outputs to [0, 1] for display.
samples = (samples + 1)/2
grid = make_grid(samples, nrow=4)
# Move the channel axis last, the layout imshow expects for colour images.
plt.imshow(grid.permute(1,2,0))
plt.axis('off')
plt.show()

In [None]:
class VAE(nn.Module):
    """Small fully connected variational autoencoder for 28x28 images.

    The encoder flattens the input into a 128-unit hidden layer, from which
    two linear heads produce a 20-dimensional mean and log-variance. The
    decoder maps a 20-dimensional code back to 784 values squashed into
    [0, 1] by a Sigmoid.
    """

    def __init__(self):
        super().__init__()
        pixel_count = 28 * 28
        hidden_width = 128
        latent_dim = 20
        self.enc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(pixel_count, hidden_width),
            nn.ReLU(),
        )
        self.mu = nn.Linear(hidden_width, latent_dim)
        self.logvar = nn.Linear(hidden_width, latent_dim)
        self.dec = nn.Sequential(
            nn.Linear(latent_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, pixel_count),
            nn.Sigmoid(),
        )

    def encode(self, x):
        """Return the ``(mean, log-variance)`` pair predicted for ``x``."""
        hidden = self.enc(x)
        return self.mu(hidden), self.logvar(hidden)

    def reparam(self, mu, lv):
        """Sample ``mu + eps * exp(0.5 * lv)`` with ``eps`` drawn from N(0, I)."""
        noise = torch.randn_like(mu)
        std = torch.exp(0.5 * lv)
        return mu + noise * std

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it to (N, 1, 28, 28).

        Only the reconstruction is returned; the mean and log-variance are
        not exposed by this method.
        """
        mu, lv = self.encode(x)
        z = self.reparam(mu, lv)
        decoded = self.dec(z)
        return decoded.view(-1, 1, 28, 28)

In [None]:
# One optimisation step of the VAE on a single MNIST mini-batch.
vae = VAE().to(device)
opt = optim.Adam(vae.parameters(), lr=0.001)

batch,_ = next(iter(loader))
batch=batch.to(device)

# The loader normalises pixels to [-1, 1], but the decoder ends in a Sigmoid
# and can only emit values in [0, 1]. Map the batch back to [0, 1] so the
# reconstruction target is actually reachable.
target = (batch + 1) / 2

# Run encode -> reparameterise -> decode explicitly so the posterior
# parameters are available for the KL term (VAE.forward returns only the
# reconstruction).
mu, lv = vae.encode(target)
recon = vae.dec(vae.reparam(mu, lv)).view(-1,1,28,28)

n_images = target.size(0)
# Per-image squared error, summed over pixels and averaged over the batch.
recon_loss = ((target-recon)**2).sum() / n_images
# KL(q(z|x) || N(0, I)) in closed form. Without it nothing pulls the latent
# codes toward the unit Gaussian that the sampling cell draws from.
kl_loss = -0.5 * torch.sum(1 + lv - mu.pow(2) - lv.exp()) / n_images
loss = recon_loss + kl_loss

opt.zero_grad(); loss.backward(); opt.step()
print('VAE trained once, loss =', loss.item())

In [None]:
# Decode 16 latent vectors drawn from N(0, I) and show them as a 4x4 grid.
z = torch.randn(16,20).to(device)
with torch.no_grad():
    # Bring the images back to host memory: matplotlib converts its input to
    # a NumPy array, which fails for tensors that still live on the GPU.
    gen_imgs = vae.dec(z).view(-1,1,28,28).cpu()

# The decoder's Sigmoid already yields values in [0, 1], so no rescaling is needed.
grid = make_grid(gen_imgs, nrow=4)
# Move the channel axis last, the layout imshow expects for colour images.
plt.imshow(grid.permute(1,2,0))
plt.axis('off')
plt.show()