# Generate Bottom-up-Features using Detection Transformer (DETR)

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git timm

In [None]:
import glob,os
from google.colab import drive
drive.mount('/content/drive')
!unzip '/content/drive/MyDrive/images.zip'

In [None]:
from transformers import DetrFeatureExtractor, DetrForObjectDetection,DetrConfig,pipeline
import numpy as np
import torch
from PIL import Image
import requests

In [None]:
# The model below is built from a config, not from a checkpoint, so every weight that is
# not loaded from pretrained files is randomly initialised. Seed the RNG so that a
# restarted/resumed session produces features from the same network.
torch.manual_seed(0)
config=DetrConfig(num_queries = 36,d_model = 2048) # 36 features per image of 2048 dimensions
model = DetrForObjectDetection(config)
# A directly constructed module starts in training mode (only from_pretrained switches to
# eval mode). Disable dropout so the same image always yields the same features.
model.eval()
feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1_0-14fe96d1.pth" to /root/.cache/torch/hub/checkpoints/resnet50_a1_0-14fe96d1.pth


Downloading:   0%|          | 0.00/274 [00:00<?, ?B/s]

In [None]:
def create_BU(img, out_dir='/content/drive/MyDrive/SAS_results'):
  """Extract DETR bottom-up features for one image and save them as ``<name>.npy``.

  Args:
    img: Path to the image file.
    out_dir: Directory that receives the ``.npy`` file. Defaults to the Google Drive
      results folder so results are stored directly on Drive.

  The saved object is a dict with the keys ``boxes``, ``class``, ``features`` and
  ``scores`` (NumPy arrays). Because it is a dict, NumPy stores it pickled; read it
  back with ``np.load(path, allow_pickle=True).item()``.
  """
  # splitext copes with extensions of any length (".jpeg", ".png"); slicing off the last
  # four characters does not.
  name = os.path.splitext(os.path.basename(img))[0]

  # Release the file handle promptly, and force 3-channel RGB so grayscale / RGBA /
  # palette images match the 3-channel input the extractor and backbone work with.
  with Image.open(img) as raw_image:
    image = raw_image.convert('RGB')

  inputs = feature_extractor(images=image, return_tensors="pt")
  # Inference only: do not build an autograd graph for every image.
  with torch.no_grad():
    outputs = model(**inputs)
  # PIL reports (width, height); post_process wants (height, width).
  target_sizes = torch.tensor([image.size[::-1]])
  results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]

  dct = {
    'boxes': np.array(results['boxes'].tolist()),
    'class': np.array(results['labels'].tolist()),
    # [0] drops the batch dimension: one row of decoder features per object query.
    'features': np.array(outputs['last_hidden_state'].tolist()[0]),
    'scores': np.array(results['scores'].tolist()),
  }

  # Write to an explicit path instead of os.chdir(): changing the working directory on
  # every call would silently break later cells that rely on relative paths.
  os.makedirs(out_dir, exist_ok=True)
  with open(os.path.join(out_dir, f'{name}.npy'), 'wb') as f:
      np.save(f, dct)



In [None]:
from tqdm import tqdm
# Run the feature extraction over every file in the unzipped image folder, showing progress.
image_paths = glob.glob('/content/images/*')
for image_path in tqdm(image_paths):
  create_BU(image_path)

After the bottom-up (BU) feature files have been created, copy them from the `SAS_results` folder to the `Data_for_SAS/bottom_up_features_36_info` folder.

### **Run the following command in terminal to train the SAS model**
---



In [None]:
# Train the SAS model: train.py is pointed at the Data_for_SAS data directory and given
# "output" as its save path. Run from the directory that contains train.py.
!python train.py --data_dir Data_for_SAS --save_path output 

### **Run the following command to evaluate the model and generate the audio for the test set**

In [None]:
# Same entry point and paths as the training command, with --only_val added; used here to
# evaluate and generate the test audio (the flag's exact behaviour is defined in train.py).
!python train.py --data_dir Data_for_SAS --save_path output --only_val

# Automatic Speech Recognition (ASR)

BLEU and METEOR are text-based metrics, so the generated audio must first be transcribed to text with an ASR model before its quality can be evaluated.

In [None]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the task's default model, so the
# transcripts (and every metric derived from them) stay reproducible if the library's
# default ever changes.
# NOTE: this CTC model emits upper-case text without punctuation, so normalise case
# before comparing against reference captions.
clss=pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

Import the generated audio files.

In [None]:
!unzip Audio_Base.zip
!unzip Audio_DETR.zip

In [None]:
import glob,os
# Transcribe every generated clip, keeping [file name, recognised text] pairs.
audio_res = [
  [os.path.basename(clip_path), clss(clip_path)['text']]
  for clip_path in glob.glob('./Audio_Base/*')
]

In [None]:
# Ascending order by file name (then transcript); glob returns files in arbitrary order.
audio_res = sorted(audio_res)

View the generated texts

In [None]:
# Display the [file name, transcript] pairs as a quick sanity check of the ASR output.
audio_res 

# BLEU / METEOR

In [None]:
# Load the reference captions used for scoring into `original_captions`.
# NOTE(review): unpickling can execute arbitrary code -- only load a captions.pkl that you
# created yourself or otherwise trust.
import pickle # The captions.pkl contains only the captions for 100 test texts
with open('captions.pkl', 'rb') as f:
  original_captions = pickle.load(f)

In [None]:
import nltk
# Fetch the WordNet corpus and the Open Multilingual WordNet package.
# Presumably required by the METEOR scorer used later in the notebook -- TODO confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from nltk.translate.meteor_score import meteor_score

# original_captions is consumed in consecutive groups of this many references per clip.
# NOTE(review): this relies on the sorted file order of audio_res matching the caption
# order in captions.pkl -- verify.
REFS_PER_CLIP = 5

# Cumulative BLEU-n uses uniform weights over the 1..n-gram precisions. Weights such as
# (0, 1, 0, 0) would instead score the 2-gram precision in isolation.
BLEU_WEIGHTS = (
  (1, 0, 0, 0),
  (1/2, 1/2, 0, 0),
  (1/3, 1/3, 1/3, 0),
  (1/4, 1/4, 1/4, 1/4),
)


def _tokens(text):
  """Return lower-cased word tokens from a raw string or an already-tokenised sequence."""
  words = text.split() if isinstance(text, str) else text
  return [word.lower() for word in words]


# BLEU matches tokens case-sensitively, so hypotheses and references are lower-cased
# first. Without this, upper-case ASR transcripts only match the occasional capitalised
# reference token, which shows up as a small BLEU-1, zero higher-order BLEU and a much
# larger METEOR (METEOR lower-cases internally).
# Smoothing keeps a sentence with no matching higher-order n-gram from collapsing to ~0.
smooth = SmoothingFunction().method1

BLEU_1,BLEU_2,BLEU_3,BLEU_4,METEOR=[],[],[],[],[]
st,en=0,REFS_PER_CLIP
for candidate in audio_res:
  references = [_tokens(ref) for ref in original_captions[st:en]]
  if not references:
    raise ValueError(
      f'No reference captions left for {candidate[0]!r}: '
      f'original_captions has {len(original_captions)} entries'
    )
  hypothesis = _tokens(candidate[1])

  for scores, weights in zip((BLEU_1, BLEU_2, BLEU_3, BLEU_4), BLEU_WEIGHTS):
    scores.append(
      sentence_bleu(references, hypothesis, weights=weights, smoothing_function=smooth)
    )

  METEOR.append(meteor_score(references, hypothesis))
  st+=REFS_PER_CLIP
  en+=REFS_PER_CLIP

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
import numpy as np
# Report the mean of each metric as a percentage, in the order BLEU-1..BLEU-4, METEOR.
print("Audio_Base")
for metric_scores in (BLEU_1, BLEU_2, BLEU_3, BLEU_4, METEOR):
  print(np.mean(metric_scores)*100)

Audio_Base
6.944107221797573
1.8411425192689353e-306
1.8411425192689353e-306
1.8411425192689353e-306
21.269182534287605
