In [167]:
import pandas as pd
import glob
import numpy as np
from torch.utils.data import Dataset
from PIL import Image


In [168]:
# !pip install lightning-bolts

In [169]:
BASEPATH = "../data/"

In [170]:
train_df = pd.read_csv(BASEPATH+"ADE20K-QA/vqa_train_df.tsv", sep="\t")
val_df = pd.read_csv(BASEPATH+"ADE20K-QA/vqa_val_df.tsv", sep="\t")

In [171]:
train_df.head()

Unnamed: 0,img_id,question,answer
0,ADE_train_00003661,What part of the room is a photo frame attache...,wall
1,ADE_train_00003661,"Along with a window, what is on the inside of ...",curtain
2,ADE_train_00003661,"Along with blankets, what is on the bed?",pillow
3,ADE_train_00003661,In this picture I can see the inside view of w...,room
4,ADE_train_00003661,What do I see through the window?,sky


In [175]:
train_img_paths = glob.glob(BASEPATH+"ADE20K-pairs/final-pairs/unsplit/train/*.jpg")
val_img_paths = glob.glob(BASEPATH+"ADE20K-pairs/final-pairs/unsplit/val/*.jpg")

train_img_ids = [path.split(".jpg")[0].split("/")[-1] for path in train_img_paths]
val_img_ids = [path.split(".jpg")[0].split("/")[-1] for path in val_img_paths]


len(train_img_ids), len(val_img_ids)

(20210, 2000)

In [176]:
x = pd.DataFrame([(i,j) for i, j in zip(train_img_ids, train_img_paths)], columns=['img_id','img_path'])
y = pd.DataFrame([(i,j) for i, j in zip(val_img_ids, val_img_paths)], columns=['img_id','img_path'])

s = set(train_df.img_id)
t = set(x.img_id)
u = set(val_df.img_id)
v = set(y.img_id)

train_id_to_path = dict(zip(train_img_ids, train_img_paths))
val_id_to_path = dict(zip(val_img_ids, val_img_paths))
s==t, u==v

(True, True)

In [177]:
%cd ../antarctic-captions
# !ls

/project/dataset-exploration/clip-finetune-ade20k/antarctic-captions


In [178]:
import os
from CLIP import clip
import torch
from tqdm import tqdm

# CHECKPOINT = "../training-scripts/open_clip/logs/low-lr-low-wd-rn50/checkpoints/epoch_6.pt"

device = "cuda:1" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50",device=device,jit=True) #Must set jit=False for training

In [179]:
# checkpoint = torch.load(CHECKPOINT)

# # # Use these 3 lines if you use default model setting(not training setting) of the clip. For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 
# checkpoint['state_dict']["input_resolution"] = model.input_resolution #default is 224
# checkpoint['state_dict']["context_length"] = model.context_length # default is 77
# checkpoint['state_dict']["vocab_size"] = model.vocab_size 

# model.load_state_dict(checkpoint['state_dict'])

In [180]:
print(device)

cuda:1


In [181]:
def read_image(filepath):
    return preprocess(Image.open(filepath)).unsqueeze(0).to(device)

def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

def generate_splits(names):
    splits = []
    split_threshold = int(len(names)/6)
    print("size of each split: ", split_threshold)
    for i in range(6):
        first = i*split_threshold
        last = (i+1)*split_threshold
        if i==5:
            last=len(names)
        splits.append(names[first:last])

    return splits

def read_split(split):
    img_ids, names = split.img_id, split.img_path
    images = []
    img_emb = []
    for i, filepath in enumerate(names):
        if not i % 200:
            print(i)
        img = read_image(filepath)
        images.append(img)
    with torch.no_grad():
        for x in batch(images, 256):
            img_emb.append(model.encode_image(torch.cat(x)))
    #img_ids = torch.cat(tuple(img_ids)).cpu().numpy()
    img_emb = torch.cat(img_emb).cpu().numpy()
    return img_ids, names, img_emb

def read_all(splits):
    img_ids = []
    image_embeddings = []
    filenames = []
    for i, s in enumerate(splits):
        print("Now encoding split #", i)
        img_ids_, names, embeddings = read_split(s)
        img_ids.extend(img_ids_)
        filenames.extend(names)
        image_embeddings.extend(embeddings)
        
    return img_ids, filenames, image_embeddings

In [182]:
train_ids, _, train_img_embeddings = read_all(generate_splits(x))
val_ids, _, val_img_embeddings = read_all(generate_splits(y))
# images = [read_image(filepath) for filepath in img_names]

size of each split:  3368
Now encoding split # 0
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
Now encoding split # 1
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
Now encoding split # 2
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
Now encoding split # 3
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
Now encoding split # 4
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
Now encoding split # 5
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
size of each split:  333
Now encoding split # 0
0
200
Now encoding split # 1
0
200
Now encoding split # 2
0
200
Now encoding split # 3
0
200
Now encoding split # 4
0
200
Now encoding split # 5
0
200


In [184]:
train_img_embedding_pairs = dict()
for i, img_id in enumerate(train_ids): 
    train_img_embedding_pairs[img_id] = train_img_embeddings[i]

val_img_embedding_pairs = dict()
for i, img_id in enumerate(val_ids): 
    val_img_embedding_pairs[img_id] = val_img_embeddings[i]

np.save("../outputs/clip-embeddings/pretrained-img-embeddings-train.npy", train_img_embedding_pairs)
np.save("../outputs/clip-embeddings/pretrained-img-embeddings-val.npy", val_img_embedding_pairs)


In [185]:
# train_img_embedding_pairs = np.load("../outputs/clip-embeddings/finetuned-img-embeddings-train.npy", allow_pickle=True)
# train_img_embedding_pairs = train_img_embedding_pairs.item()

# val_img_embedding_pairs = np.load("../outputs/clip-embeddings/finetuned-img-embeddings-val.npy", allow_pickle=True)
# val_img_embedding_pairs = val_img_embedding_pairs.item()

train_img_embedding_pairs = np.load("../outputs/clip-embeddings/pretrained-img-embeddings-train.npy", allow_pickle=True)
train_img_embedding_pairs = train_img_embedding_pairs.item()

val_img_embedding_pairs = np.load("../outputs/clip-embeddings/pretrained-img-embeddings-val.npy", allow_pickle=True)
val_img_embedding_pairs = val_img_embedding_pairs.item()





In [186]:
def get_text_features(texts):
    all_text_features = []
    for i, x in enumerate(batch(texts, 3000)):
        print("Batch: ", i)
        with torch.no_grad():
            text_inputs = torch.cat([clip.tokenize(text) for text in x]).to(device)
            text_features = model.encode_text(text_inputs)
            all_text_features.extend(text_features.cpu().numpy())

    return all_text_features

In [187]:
get_text_features(train_df.question[0:5])[0].shape

Batch:  0


(1024,)

In [188]:
a = get_text_features([train_df.question[0]])
b = train_img_embedding_pairs[train_df.img_id[0]]
np.concatenate((a, b), axis=None).shape

Batch:  0


(2048,)

In [189]:
def get_X_y(df, train=True):
    text_features_all = get_text_features(df.question)
    print(len(text_features_all))
    X = []
    y = []
    for i, row in enumerate(df.itertuples()):
        if not i % 500: print("i = ", i)
        img_id = row.img_id
        if train:
            img_features = train_img_embedding_pairs[img_id]
        else:
            img_features = val_img_embedding_pairs[img_id]
        text_features = text_features_all[i]
        features = np.concatenate((text_features, img_features), axis=None)
        label = row.answer
        
        X.append(features)
        y.append(label)

    return X, np.array(y)

In [190]:
X_train, y_train = get_X_y(train_df)

Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
Batch:  5
Batch:  6
Batch:  7
Batch:  8
Batch:  9
Batch:  10
Batch:  11
Batch:  12
Batch:  13
Batch:  14
Batch:  15
Batch:  16
Batch:  17
Batch:  18
Batch:  19
Batch:  20
Batch:  21
Batch:  22
Batch:  23
Batch:  24
Batch:  25
Batch:  26
Batch:  27
Batch:  28
Batch:  29
Batch:  30
Batch:  31
Batch:  32
Batch:  33
Batch:  34
Batch:  35
Batch:  36
Batch:  37
Batch:  38
Batch:  39
Batch:  40
Batch:  41
Batch:  42
Batch:  43
Batch:  44
Batch:  45
Batch:  46
Batch:  47
Batch:  48
Batch:  49
Batch:  50
Batch:  51
Batch:  52
Batch:  53
Batch:  54
Batch:  55
Batch:  56
Batch:  57
Batch:  58
Batch:  59
Batch:  60
Batch:  61
Batch:  62
Batch:  63
Batch:  64
Batch:  65
Batch:  66
Batch:  67
Batch:  68
Batch:  69
Batch:  70
Batch:  71
Batch:  72
Batch:  73
Batch:  74
Batch:  75
Batch:  76
Batch:  77
Batch:  78
Batch:  79
Batch:  80
Batch:  81
Batch:  82
Batch:  83
Batch:  84
Batch:  85
Batch:  86
Batch:  87
Batch:  88
Batch:  89
Batch:  90
Batch:  9

i =  261000
i =  261500
i =  262000
i =  262500
i =  263000
i =  263500
i =  264000
i =  264500
i =  265000
i =  265500
i =  266000
i =  266500
i =  267000
i =  267500
i =  268000
i =  268500
i =  269000
i =  269500
i =  270000
i =  270500
i =  271000
i =  271500
i =  272000
i =  272500
i =  273000
i =  273500
i =  274000
i =  274500
i =  275000
i =  275500
i =  276000
i =  276500
i =  277000
i =  277500
i =  278000
i =  278500
i =  279000
i =  279500
i =  280000
i =  280500
i =  281000
i =  281500
i =  282000
i =  282500
i =  283000
i =  283500
i =  284000
i =  284500
i =  285000
i =  285500
i =  286000
i =  286500
i =  287000
i =  287500
i =  288000
i =  288500
i =  289000
i =  289500
i =  290000
i =  290500
i =  291000
i =  291500
i =  292000
i =  292500
i =  293000
i =  293500
i =  294000
i =  294500
i =  295000
i =  295500
i =  296000
i =  296500
i =  297000
i =  297500
i =  298000
i =  298500
i =  299000
i =  299500
i =  300000
i =  300500
i =  301000
i =  301500
i =  302000
i = 

i =  616500
i =  617000
i =  617500
i =  618000
i =  618500
i =  619000
i =  619500
i =  620000
i =  620500
i =  621000
i =  621500


In [191]:
X_val, y_val = get_X_y(val_df, train=False)

Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
Batch:  5
Batch:  6
Batch:  7
Batch:  8
Batch:  9
Batch:  10
Batch:  11
Batch:  12
Batch:  13
Batch:  14
Batch:  15
Batch:  16
Batch:  17
Batch:  18
Batch:  19
Batch:  20
Batch:  21
Batch:  22
68761
i =  0
i =  500
i =  1000
i =  1500
i =  2000
i =  2500
i =  3000
i =  3500
i =  4000
i =  4500
i =  5000
i =  5500
i =  6000
i =  6500
i =  7000
i =  7500
i =  8000
i =  8500
i =  9000
i =  9500
i =  10000
i =  10500
i =  11000
i =  11500
i =  12000
i =  12500
i =  13000
i =  13500
i =  14000
i =  14500
i =  15000
i =  15500
i =  16000
i =  16500
i =  17000
i =  17500
i =  18000
i =  18500
i =  19000
i =  19500
i =  20000
i =  20500
i =  21000
i =  21500
i =  22000
i =  22500
i =  23000
i =  23500
i =  24000
i =  24500
i =  25000
i =  25500
i =  26000
i =  26500
i =  27000
i =  27500
i =  28000
i =  28500
i =  29000
i =  29500
i =  30000
i =  30500
i =  31000
i =  31500
i =  32000
i =  32500
i =  33000
i =  33500
i =  34000
i =  34500
i =  

In [192]:
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from PIL import Image


class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, targets):
        self.data = [torch.FloatTensor(item) for item in data]
        self.targets = torch.LongTensor(targets)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]
        return x, y

In [193]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder


In [194]:
enc = LabelEncoder()


t = pd.DataFrame({'x': X_train, 
                   'y': y_train}).sample(frac=1).reset_index(drop=True)
# t = t[~t.y.isin(['yes','no'])]

v = pd.DataFrame({'x': X_val, 
                   'y': y_val}).sample(frac=1).reset_index(drop=True)
# v = v[~v.y.isin(['yes','no'])]

# x_t, x_v, y_t, y_v = train_test_split(x_no_binary_train.x, x_no_binary_train.y, test_size=0.10, random_state=42)

x_t = t.x
y_t = t.y
y_t = enc.fit_transform(y_t)

x_v = v.x
y_v = v.y
y_v = enc.transform(y_v)

# X_test = x_no_binary_test.x
# y_test = enc.transform(x_no_binary_test.y)

In [195]:

# x_t, x_v, y_t, y_v = train_test_split(X_train, y_train, test_size=0.10, random_state=42)

# enc = LabelEncoder()

# y_t = enc.fit_transform(y_t)
# y_v = enc.transform(y_v)

# X_test = X_val
# y_test = enc.transform(y_val)

In [196]:
train_dataset = CustomDataset(x_t, y_t)
val_dataset = CustomDataset(x_v, y_v)
test_dataset = CustomDataset(x_v, y_v)

In [197]:

'''
STEP 1: LOADING DATASET
'''

# train_dataset = dsets.MNIST(root='./data', 
#                             train=True, 
#                             transform=transforms.ToTensor(),
#                             download=True)

# test_dataset = dsets.MNIST(root='./data', 
#                            train=False, 
#                            transform=transforms.ToTensor())

# train_dataset = CustomDataset(X_train, y_t)
# test_dataset = CustomDataset(X_val, y_v)

'\nSTEP 1: LOADING DATASET\n'

In [198]:

'''
STEP 2: MAKING DATASET ITERABLE
'''

batch_size = 512
n_iters = 10000
num_epochs = 20 #n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

In [199]:
n_iters / len(train_dataset)/ batch_size

3.140274294166828e-05

In [200]:
'''
STEP 3: CREATE MODEL CLASS
'''
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size, output_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        out = self.linear(x)
        return out
    
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, h1, h2, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function 1: 2048 --> 1600
        self.fc1 = nn.Linear(input_dim, h1) 
        # Non-linearity 1
        self.relu1 = nn.ReLU()

#         # Linear function 2: 1600 --> 1000
#         self.fc2 = nn.Linear(h1, h2)
#         # Non-linearity 2
#         self.relu2 = nn.ReLU()

#         # Linear function 3: 100 --> 100
#         self.fc3 = nn.Linear(hidden_dim, hidden_dim)
#         # Non-linearity 3
#         self.relu3 = nn.ReLU()

        # Linear function 4 (readout): 1000 --> 754
        self.fc4 = nn.Linear(h1, output_dim)  

    def forward(self, x):
        # Linear function 1
        out = self.fc1(x)
        # Non-linearity 1
        out = self.relu1(out)

        # Linear function 2
#         out = self.fc2(out)
#         # Non-linearity 2
#         out = self.relu2(out)

#         # Linear function 2
#         out = self.fc3(out)
#         # Non-linearity 2
#         out = self.relu3(out)

        # Linear function 4 (readout)
        out = self.fc4(out)
        return out

In [201]:

'''
STEP 4: INSTANTIATE MODEL CLASS
'''
input_dim = 2048
h1 = 1600
h2 = 1000
output_dim = len(set(y_v))


vqa_model = FeedforwardNeuralNetModel(input_dim, h1, h2, output_dim)
# model = LogisticRegressionModel(input_dim, output_dim)

#######################
#  USE GPU FOR MODEL  #
#######################

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
vqa_model.to(device)

'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.CrossEntropyLoss()


'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.001

optimizer = torch.optim.Adam(vqa_model.parameters(), lr=learning_rate)

In [202]:
len(train_dataset), len(val_dataset), len(test_dataset)

(621960, 68761, 68761)

In [203]:

'''
STEP 7: TRAIN THE MODEL
'''
iter = 0
for epoch in range(num_epochs):
    print("Epoch: ", epoch)
    for i, (features, labels) in enumerate(train_loader):

        #######################
        #  USE GPU FOR MODEL  #
        #######################
        features = features.requires_grad_().to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = vqa_model(features)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for features, labels in val_loader:
                #######################
                #  USE GPU FOR MODEL  #
                #######################
                features = features.to(device)

                # Forward pass only to get logits/output
                outputs = vqa_model(features)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                #######################
                #  USE GPU FOR MODEL  #
                #######################
                # Total correct predictions
                if torch.cuda.is_available():
                    correct += (predicted.cpu() == labels.cpu()).sum()
                else:
                    correct += (predicted == labels).sum()

            accuracy = 100 * correct.item() / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))
            
# print("Test Set")
# # Calculate Accuracy         
# correct = 0
# total = 0
# # Iterate through test dataset
# for features, labels in test_loader:
#     #######################
#     #  USE GPU FOR MODEL  #
#     #######################
#     features = features.to(device)

#     # Forward pass only to get logits/outputpredicted
#     outputs = vqa_model(features)

#     # Get predictions from the maximum value
#     _, predicted = torch.max(outputs.data, 1)

#     # Total number of labels
#     total += labels.size(0)

#     #######################
#     #  USE GPU FOR MODEL  #
#     #######################
#     # Total correct predictions
#     if torch.cuda.is_available():
#         correct += (predicted.cpu() == labels.cpu()).sum()
#     else:
#         correct += (predicted == labels).sum()

# accuracy = 100 * correct.item() / total

# # Print Loss
# print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))



Epoch:  0
Iteration: 500. Loss: 0.8088433742523193. Accuracy: 77.31708381204461
Iteration: 1000. Loss: 0.6553743481636047. Accuracy: 79.4447433865127
Epoch:  1
Iteration: 1500. Loss: 0.7887970209121704. Accuracy: 80.54565814924158
Iteration: 2000. Loss: 0.7032681703567505. Accuracy: 81.2044618315615
Epoch:  2
Iteration: 2500. Loss: 0.6179889440536499. Accuracy: 81.5811288375678
Iteration: 3000. Loss: 0.6018741726875305. Accuracy: 81.93452683934207
Iteration: 3500. Loss: 0.48334023356437683. Accuracy: 82.10177280725993
Epoch:  3
Iteration: 4000. Loss: 0.5481206178665161. Accuracy: 81.92580096275505
Iteration: 4500. Loss: 0.5482903122901917. Accuracy: 82.18903157313012
Epoch:  4
Iteration: 5000. Loss: 0.43066781759262085. Accuracy: 81.90398627128751
Iteration: 5500. Loss: 0.4971775412559509. Accuracy: 82.28647052835183
Iteration: 6000. Loss: 0.4635138511657715. Accuracy: 82.44644493244718
Epoch:  5
Iteration: 6500. Loss: 0.37151405215263367. Accuracy: 82.58896758336849
Iteration: 7000. L

In [204]:
def top_k_accuracy(vqa_, loader, k=1):
    correct = 0
    total = 0
    for features, labels in loader:
        features = features.to(device)
#         print(features.shape)
        outputs = vqa_(features)
        _, predicted = torch.topk(outputs.data, k=k, dim=1)
        
        total += labels.size(0)
        
        for counter, label in enumerate(labels.cpu()):
            correct+=label in predicted.cpu()[counter]
        
    accuracy = 100 * correct / total
    print('Accuracy @ {}: {}'.format(k, accuracy))
    return accuracy




In [205]:
top_k_accuracy(vqa_model, test_loader, k = 1)
top_k_accuracy(vqa_model, test_loader, k = 5)
top_k_accuracy(vqa_model, test_loader, k = 10)

Accuracy @ 1: 81.98542778609968
Accuracy @ 5: 94.48960893529762
Accuracy @ 10: 96.05154084437399


96.05154084437399

In [206]:
color = pd.read_csv("../data/ADE20K-QA/categorized/what_color_q_val.csv")

color = get_X_y(color, train=False)

color = pd.DataFrame({'x': color[0], 
                   'y': enc.transform(color[1])}).sample(frac=1).reset_index(drop=True)

color_dataset = CustomDataset(color.x, color.y)

color_loader = torch.utils.data.DataLoader(dataset=color_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

top_k_accuracy(vqa_model, color_loader, k = 1)
top_k_accuracy(vqa_model, color_loader, k = 5)
top_k_accuracy(vqa_model, color_loader, k = 10)


Batch:  0
440
i =  0
Accuracy @ 1: 47.27272727272727
Accuracy @ 5: 82.72727272727273
Accuracy @ 10: 92.95454545454545


92.95454545454545

In [207]:
binary = pd.read_csv("../data/ADE20K-QA/categorized/binary_q_val.csv")

binary = get_X_y(binary, train=False)

binary = pd.DataFrame({'x': binary[0], 
                   'y': enc.transform(binary[1])}).sample(frac=1).reset_index(drop=True)

binary_dataset = CustomDataset(binary.x, binary.y)

binary_loader = torch.utils.data.DataLoader(dataset=binary_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

top_k_accuracy(vqa_model, binary_loader, k = 1)
# top_k_accuracy(vqa_model, binary_loader, k = 5)
# top_k_accuracy(vqa_model, binary_loader, k = 10)


Batch:  0
Batch:  1
Batch:  2
Batch:  3
Batch:  4
Batch:  5
Batch:  6
Batch:  7
Batch:  8
Batch:  9
Batch:  10
Batch:  11
Batch:  12
Batch:  13
Batch:  14
Batch:  15
Batch:  16
Batch:  17
Batch:  18
56007
i =  0
i =  500
i =  1000
i =  1500
i =  2000
i =  2500
i =  3000
i =  3500
i =  4000
i =  4500
i =  5000
i =  5500
i =  6000
i =  6500
i =  7000
i =  7500
i =  8000
i =  8500
i =  9000
i =  9500
i =  10000
i =  10500
i =  11000
i =  11500
i =  12000
i =  12500
i =  13000
i =  13500
i =  14000
i =  14500
i =  15000
i =  15500
i =  16000
i =  16500
i =  17000
i =  17500
i =  18000
i =  18500
i =  19000
i =  19500
i =  20000
i =  20500
i =  21000
i =  21500
i =  22000
i =  22500
i =  23000
i =  23500
i =  24000
i =  24500
i =  25000
i =  25500
i =  26000
i =  26500
i =  27000
i =  27500
i =  28000
i =  28500
i =  29000
i =  29500
i =  30000
i =  30500
i =  31000
i =  31500
i =  32000
i =  32500
i =  33000
i =  33500
i =  34000
i =  34500
i =  35000
i =  35500
i =  36000
i =  36500
i =  

90.87971146463835

In [208]:
where = pd.read_csv("../data/ADE20K-QA/categorized/where_q_val.csv")

where = get_X_y(where, train=False)

where = pd.DataFrame({'x': where[0], 
                   'y': enc.transform(where[1])}).sample(frac=1).reset_index(drop=True)

where_dataset = CustomDataset(where.x, where.y)

where_dataloader = torch.utils.data.DataLoader(dataset=where_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

top_k_accuracy(vqa_model, where_dataloader, k = 1)
top_k_accuracy(vqa_model, where_dataloader, k = 5)
top_k_accuracy(vqa_model, where_dataloader, k = 10)


Batch:  0
713
i =  0
i =  500
Accuracy @ 1: 62.41234221598878
Accuracy @ 5: 83.16970546984572
Accuracy @ 10: 87.9382889200561


87.9382889200561

In [209]:
how_many = pd.read_csv("../data/ADE20K-QA/categorized/how_many_q_val.csv")

how_many = get_X_y(how_many, train=False)

how_many = pd.DataFrame({'x': how_many[0], 
                   'y': enc.transform(how_many[1])}).sample(frac=1).reset_index(drop=True)

how_many_dataset = CustomDataset(how_many.x, how_many.y)

how_many_dataloader = torch.utils.data.DataLoader(dataset=how_many_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

top_k_accuracy(vqa_model, how_many_dataloader, k = 1)
top_k_accuracy(vqa_model, how_many_dataloader, k = 5)
top_k_accuracy(vqa_model, how_many_dataloader, k = 10)


Batch:  0
256
i =  0
Accuracy @ 1: 47.265625
Accuracy @ 5: 83.984375
Accuracy @ 10: 92.1875


92.1875

In [316]:
enc.inverse_transform(predicted.cpu().numpy())

array(['building', 'one', 'white', 'chair', 'tree', 'rack',
       'glass window', 'board', 'cabinet', 'candle stand', 'candle stand',
       'kitchen platform', 'wall', 'tree', 'cloud', 'sky', 'floor',
       'floor', 'bush', 'floor', 'board', 'rock', 'table', 'cloudy',
       'stone', 'door', 'pillow', 'plant', 'table', 'dustbin', 'city',
       'right', 'pillow', 'chair', 'rod', 'monitor', 'flower vase',
       'right', 'poster', 'building', 'road', 'table', 'paper', 'middle',
       'glass window', 'white', 'tree', 'window', 'roof', 'door', 'lamp',
       'small', 'floor', 'table', 'grass', 'pillar', 'cupboard',
       'chimney', 'food', 'wooden', 'flush tank', 'frame', 'cloud',
       'roof', 'floor', 'two', 'sky', 'flower', 'glass', 'candle',
       'cloud', 'glass', 'building', 'road', 'lamp', 'sofa chair', 'wire',
       'decor', 'pole', 'brown', 'desk', 'light', 'text', 'photo',
       'building', 'sky', 'frame', 'right', 'floor', 'wall', 'table',
       'yellow', 'number', 'p

In [317]:
enc.inverse_transform(labels.cpu().numpy())

array(['tree', 'two', 'white', 'person', 'tree', 'shed', 'blind',
       'button', 'stool', 'candle', 'light', 'kitchen', 'wall', 'water',
       'cloud', 'sky', 'floor', 'mat', 'tree', 'pipe', 'board', 'rock',
       'table', 'cloudy', 'tree', 'door', 'blanket', 'building', 'table',
       'pole', 'city', 'right', 'table', 'bottle', 'big', 'screen',
       'flower', 'right', 'board', 'number', 'road', 'table', 'frame',
       'front', 'glass', 'white', 'tree', 'curtain', 'ceiling', 'cabin',
       'bed', 'small', 'floor', 'lamp', 'water', 'floor', 'curtain',
       'tree', 'tin', 'wooden', 'flush tank', 'pillar', 'cloud', 'roof',
       'front', 'two', 'sky', 'flower', 'glass', 'candle', 'cloud',
       'trolley', 'building', 'road', 'pen holder', 'lamp', 'bathtub',
       'floor', 'footpath', 'red', 'center', 'light', 'card', 'frame',
       'fort', 'sky', 'frame', 'right', 'table', 'floor', 'basket',
       'pink', 'two', 'man', 'chess', 'couch', 'chair', 'area', 'front',
       'wi

In [24]:
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms
import torchvision.datasets as dsets

xxxx = dsets.MNIST(root='~/.cache/', 
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)


In [29]:
type(xxxx[0])

tuple

In [None]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
torch.cuda.empty_cache()

In [26]:
len(set(train_df.answer))

756

In [27]:
from pl_bolts.models.regression import LogisticRegression
from pl_bolts.datamodules import SklearnDataModule
import pytorch_lightning as pl

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()


In [29]:
y_train = enc.fit_transform(y_train)
y_val = enc.transform(y_val)

array([694, 389, 678, ..., 443, 443, 443])

In [51]:

dm = SklearnDataModule(X = X_train, y = y_train, x_test = X_val, y_test = y_val, 
                       val_split=0.1, test_split=0, num_workers=12, batch_size=256)
# y_val = enc.transform(y_val)
# dm_v = SklearnDataModule(X_val, y_val)

model = LogisticRegression(input_dim=2048, num_classes=len(set(y_train)), learning_rate=0.001)
model.train_dataloader = dm.train_dataloader
model.val_dataloader = dm.val_dataloader
model.test_dataloader = dm.test_dataloader

In [52]:
trainer = pl.Trainer(gpus="1", precision=16, max_epochs=5)

trainer.fit(model)
# trainer.fit(model, train_dataloader=dm_t.train_dataloader())#, val_dataloader=dm_t.val_dataloader())

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name   | Type   | Params
----------------------------------
0 | linear | Linear | 1.5 M 
----------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
6.196     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [62]:
# def test_epoch_end(self, outputs): 
#       results = process_outputs(outputs)
#       self.test_results = results
#       return results 

# trainer.test(model, datamodule)
# results = model.test_results

trainer.test(model,test_dataloaders=dm.test_dataloader())

  cpuset_checked))
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


[{}]

In [32]:
from cuml.linear_model import LogisticRegression

from cuml.multiclass import MulticlassClassifier

ModuleNotFoundError: No module named 'cuml'

In [318]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1, solver='saga')

In [None]:
classifier.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
predictions = classifier.predict(X_val)
accuracy = np.mean((y_val == predictions).astype(np.float)) * 100.
print(f"Accuracy = {accuracy:.3f}")