# Our project uses 3 models
1. Speech_model: takes a sound and returns a word 
2. CNN_model (Xception): Takes a picture and returns a vector representation of the image 
3. RNN_model: Takes a vector representation of an image and a sequence and returns a caption



# Sound to text model

For this part of the project we used this notebook as inspiration: https://github.com/douglas125/SpeechCmdRecognition

Cite: 
@ARTICLE{2018arXiv180808929C,
   author = {{Coimbra de Andrade}, D. and {Leo}, S. and {Loesener Da Silva Viana}, M. and 
	{Bernkopf}, C.},
    title = "{A neural attention model for speech command recognition}",
  journal = {ArXiv e-prints},
archivePrefix = "arXiv",
   eprint = {1808.08929},
 keywords = {Electrical Engineering and Systems Science - Audio and Speech Processing, Computer Science - Sound},
     year = 2018,
    month = aug,
   adsurl = {http://adsabs.harvard.edu/abs/2018arXiv180808929C},
  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}

## Download and import the files needed from the github project

In [None]:
# Install kapre 0.1.7 from a local wheel file found under ../input
!pip install {"../input/kapree/kapre-0.1.7-py3-none-any.whl"}

# This file is used to download audio from various datasets and converts the audio into numpy arrays
!wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/SpeechDownloader.py

# A generator for reading and serving audio files
!wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/SpeechGenerator.py

# Utility functions for audio files
!wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/audioUtils.py

# The actual speech model
!wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/SpeechModels.py

import SpeechDownloader
import SpeechGenerator
import SpeechModels
import audioUtils

## Install the needed requirements

In [None]:
!wget pip install tensorflow>=2
!wget pip install kapre==0.2
!wget pip install pandas>=0.25
!wget pip install librosa>=0.8
!wget pip install tqdm
!wget pip install matplotlib

## Import other libraries that we need 

In [None]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt  
import pickle

## Download and prepare Google Speech Dataset

In [None]:
# Value forwarded as the `version` argument of PrepareGoogleSpeechCmd.
# NOTE(review): the original note said this selects the pre-trained setup; it is
# presumably the version of the dataset - confirm against SpeechDownloader.
version = 2

# Prepare the data for the '35word' task.
# gscInfo: paths to the training, test and validation files
# nCategs: the number of classes
gscInfo, nCategs = SpeechDownloader.PrepareGoogleSpeechCmd(task='35word', version=version)

print("gscInfo: ", gscInfo)
print("nCategs: ", nCategs)

# Store both results on disk so that they can be reloaded without preparing the data again
with open('gscInfo', 'wb') as f:
    pickle.dump(gscInfo, f)

with open('nCategs', 'wb') as f:
    pickle.dump(nCategs, f)

### Load the pickled data structures so that we don't have to download them again 

In [None]:
# Read back the two objects stored in the files 'gscInfo' and 'nCategs'
with open('gscInfo', 'rb') as f:
    gscInfo = pickle.load(f)

with open('nCategs', 'rb') as f:
    nCategs = pickle.load(f)

# test if it worked 
#print(numpy.array_equal(gscInfo,gscInfo2))
#print(numpy.array_equal(nCategs,nCategs2))

## Explanation of how the pre-trained model we use was trained

It was trained using ...

## Build the test and validation generator (dont need train since we use a pre-trained model)

In [None]:
# Shuffle flag handed to the validation generator.
# NOTE(review): the original comment relates this to the number of validation
# samples not being a multiple of batch_size - confirm against SpeechGenerator.
shuffle = True

# One generator for validation ...
validation_speech_generator = SpeechGenerator.SpeechGen(
    gscInfo['val']['files'],
    gscInfo['val']['labels'],
    shuffle=shuffle,
)

# ... and one for testing: unshuffled, with a batch size equal to the number of test files
test_speech_generator = SpeechGenerator.SpeechGen(
    gscInfo['test']['files'],
    gscInfo['test']['labels'],
    shuffle=False,
    batch_size=len(gscInfo['test']['files']),
)

print("validation generator: ", validation_speech_generator)
print("test generator: ", test_speech_generator)

print("Length of validation generator: ", len(validation_speech_generator))
print("Length of test generator: ", len(test_speech_generator))

## Look at validation data

In [None]:
# Take the item at index 5 of the validation generator and print its two parts
X, Y = validation_speech_generator[5]
print("Features: ", X)
print("Labels: ", Y)

## Create the speech model and load the pre-trained weights

In [None]:
# why is the input length None? Can it be anything then?
speech_model = SpeechModels.AttRNNSpeechModel(nCategs, inputLength = None)

# what does this one do?? thought we had already trained?
# NOTE(review): compile() presumably only configures optimizer, loss and metrics
# (no training is started by this call) - confirm
speech_model.compile(optimizer='adam', loss=['sparse_categorical_crossentropy'], metrics=['sparse_categorical_accuracy'])

# guessing this is where we load the weights that they obtained by training
# TODO: CHANGE THE PATH LATER, ONCE EVERYTHING HAS BEEN DOWNLOADED
speech_model.load_weights('../input/pre-trained-speech-model/model-attRNN_pre_trained.h5')

## Test if the model works by looking at the test data
{'unknown': 0,
'silence': 0,
'_unknown_': 0,
'_silence_': 0,
'_background_noise_': 0,
'yes': 2,
'no': 3,
'up': 4,
'down': 5,
'left': 6,
'right': 7,
'on': 8,
'off': 9,
'stop': 10,
'go': 11,
'zero': 12,
'one': 13,
'two': 14,
'three': 15,
'four': 16,
'five': 17,
'six': 18,
'seven': 19,
'eight': 20,
'nine': 1,
'backward': 21,
'bed': 22,
'bird': 23,
'cat': 24,
'dog': 25,
'follow': 26,
'forward': 27,
'happy': 28,
'house': 29,
'learn': 30,
'marvin': 31,
'sheila': 32,
'tree': 33,
'visual': 34,
'wow': 35}

['nine', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go',
           'zero', 'one', 'two', 'three', 'four', 'five', 'six', 
           'seven',  'eight', 'backward', 'bed', 'bird', 'cat', 'dog',
           'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree',
           'visual', 'wow']




In [None]:
# Item 0 of the test generator (which was built with batch_size equal to the number of test files)
X_test, Y_test = test_speech_generator[0]

In [None]:
# For every test sample keep the index with the highest model output
predictions = np.argmax(speech_model.predict(X_test, verbose=1), axis=1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Class names handed to the plot, 'unknown' first
classes = [
    'unknown', 'nine', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go',
    'zero', 'one', 'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'backward', 'bed', 'bird', 'cat', 'dog',
    'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree',
    'visual', 'wow',
]

# Confusion matrix of the true test labels against the predicted ones (raw counts, not normalized)
cm = confusion_matrix(Y_test, predictions)
audioUtils.plot_confusion_matrix(cm, classes, normalize=False)


# Use the code from Lab 2 to create the model that generates captions 

## Import Everything 

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt

# regex
import re

# merge paths
import os

import numpy as np 

# create datasets
import pandas as pd 

# to get file names in a folder 
from glob import glob

# The Image module provides a class with the same name which is used to represent a PIL image. The module also provides a number of factory functions, including functions to load images from files, and to create new images.
from PIL import Image

# to save a model as a file 
from pickle import load, dump

# to print a model 
from keras.utils import plot_model

# one-hot encoding 
from keras.utils import to_categorical

# It takes as input a list of tensors, all of the same shape, and returns a single tensor (also of the same shape).
from keras.layers.merge import add

# Loads a model saved via model.save(), Model is the actual model 
from keras.models import Model, load_model

# Dense: fully connected layer that perform classification on the features extracted by the convolutional layers and down-sampled by the pooling layers in the CNN and that are used in the RNN. 
# 
# LSTM: RNN may suffer from the vanishing gradient problem - LSTM solves this problem, LSTM knows what to store and what to throw away. 
# Dropout: The Dropout layer randomly sets input units to 0 with a frequency of rate at each step during training time, which helps prevent overfitting. Inputs not set to 0 are scaled up by 1/(1 - rate) such that the sum over all inputs is unchanged.
# Embedding: Use to create our own word embeddings using a tokenizer 
# Input: is used to instantiate a Keras tensor.
from keras.layers import Dense, LSTM, Dropout, Embedding, Input

# Pads sequences to the same length.
from keras.preprocessing.sequence import pad_sequences

## Define global variables

In [None]:
# Markers placed in front of and behind every caption
start, end = "<start>", "<end>"

# Folder with the images and CSV file with their captions
pathToImageFolder = '../input/flickr-image-dataset/flickr30k_images/flickr30k_images/'
pathToImageCaptionCSV = '../input/flickr-image-dataset/flickr30k_images/results.csv'

# training params: how many rows of the caption CSV are used (see fetchData)
pictures_used_when_training = 100

# In terms of artificial neural networks, an epoch refers to one cycle through the full training dataset
epochs = 10

# dimensions (width and height) every picture is resized to
xdim = ydim = 299

## Define functions used when preprocessing data

We use the pretrained CNN model "Xception": https://keras.io/api/applications/xception/

Quote from the website: 
Optionally loads weights pre-trained on ImageNet. 
the default input image size for this model is 299x299.

In [None]:
def fetchData():
    """Read the caption CSV and collect the image names and their captions.

    Only the first ``pictures_used_when_training`` rows of the CSV file at
    ``pathToImageCaptionCSV`` are used. Each caption has its periods removed,
    is stripped of surrounding whitespace and is wrapped in the ``start`` and
    ``end`` markers.

    Returns:
        A tuple ``(all_img_name_vector, all_captions)``:
        the unique image file names in order of first appearance, and a
        dictionary with the image file name as key and the list of its
        captions as value.
    """
    # read the captions into a pandas DataFrame and give the columns known names
    captions = pd.read_csv(pathToImageCaptionCSV, delimiter='|')
    captions.columns = ['imageName', 'captionNumber', 'caption']

    # captions grouped per image; the insertion order of the keys doubles as
    # the (deterministic) list of unique image names
    all_captions = {}

    subset = captions.head(n=pictures_used_when_training)
    # columns are read by label: positional access such as row[2] on a
    # label-indexed Series is deprecated in pandas
    for im_ID, raw_caption in zip(subset['imageName'], subset['caption']):
        # a row without a caption is read as NaN (a float); it cannot be
        # cleaned as text, so it is skipped instead of crashing
        if not isinstance(raw_caption, str):
            continue

        caption = start + " " + raw_caption.replace(".", "").strip() + " " + end
        all_captions.setdefault(im_ID, []).append(caption)

    all_img_name_vector = list(all_captions)
    return all_img_name_vector, all_captions

def createFeatures(img_name_vector, CNN_model, features_path='features.p'):
    """Return the feature representation of every picture in ``img_name_vector``.

    The features are cached in a pickle file. The same path is used for the
    existence check, for reading and for writing, so a freshly computed cache
    is found again on the next call. Pictures missing from the cache are
    computed with ``CNN_model`` and added to it.

    Args:
        img_name_vector: iterable with the file names of the pictures.
        CNN_model: object whose ``predict(image)`` result is stored as the
            feature of a picture; only used for pictures not yet cached.
        features_path: location of the pickle cache (default ``'features.p'``).

    Returns:
        A dictionary with the file name as key and the feature as value,
        containing exactly the names in ``img_name_vector``.
    """
    # start from the cached features when a cache file exists
    # NOTE: pickle must only be used on trusted files
    features = {}
    if os.path.exists(features_path):
        with open(features_path, "rb") as cache_file:
            features = load(cache_file)

    # create the feature of each picture that is not cached yet
    # (represent the picture as a vector) using the CNN
    cache_changed = False
    for im_ID in img_name_vector:
        if im_ID in features:
            continue
        image = loadPicture(im_ID)
        features[im_ID] = CNN_model.predict(image)
        cache_changed = True

    # write the cache back only when something new was computed
    if cache_changed:
        with open(features_path, "wb") as cache_file:
            dump(features, cache_file)

    # only return the images that are in the name vector
    return {im_ID: features[im_ID] for im_ID in img_name_vector}

def loadPicture(im_ID):
    """Load a picture and bring it to the standard input format.

    The picture ``pathToImageFolder + im_ID`` is opened, converted to RGB,
    resized to ``xdim`` x ``ydim``, given a leading batch axis and passed
    through ``normalize``.

    Args:
        im_ID: file name of the picture inside ``pathToImageFolder``.

    Returns:
        The result of ``normalize`` applied to the batch holding this picture.
    """
    imagePath = pathToImageFolder + im_ID

    # the context manager closes the image file again; convert("RGB") makes
    # sure that grayscale or RGBA files also end up with three channels
    with Image.open(imagePath) as original:
        resized = original.convert("RGB").resize((xdim, ydim))

    # add the batch axis in front
    batch = np.expand_dims(resized, axis=0)
    return normalize(batch)

def normalize(image):
    """Rescale values so that 0 becomes -1 and 255 becomes 1."""
    scaled = image / 127.5
    return scaled - 1.0

def extractTokens(all_captions, num_words):
    """Create a tokenizer, fit it on the captions and save it to 'tokenizer.p'.

    The tokenizer vectorizes a text corpus by turning each text into a
    sequence of integers.

    Args:
        all_captions: list of caption texts the vocabulary is built from.
        num_words: value passed as ``num_words`` to the tokenizer.

    Returns:
        The fitted tokenizer.
    """
    tkz = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        oov_token="<unk>",
        # "<" and ">" are not part of the filter, presumably so that the
        # "<start>", "<end>" and "<unk>" markers stay intact.
        # The backslash is written as "\\" because "\]" is an invalid escape
        # sequence; the resulting string is unchanged.
        filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ',
    )

    # Updates internal vocabulary based on a list of texts.
    tkz.fit_on_texts(all_captions)

    # the context manager makes sure the file is flushed and closed
    with open('tokenizer.p', 'wb') as tokenizer_file:
        dump(tkz, tokenizer_file)

    return tkz

def flatten(dictionary):
    """Concatenate all value lists of ``dictionary`` into one flat list.

    Args:
        dictionary: mapping whose values are iterables.

    Returns:
        A new list with the elements of every value, in the order of the
        dictionary's keys. An empty dictionary gives an empty list.
    """
    return [item for values in dictionary.values() for item in values]

## Preprocess data

In [None]:
all_img_name, all_captions = fetchData()

# CNN that produces the feature representation of the images
CNN_model = tf.keras.applications.Xception(include_top=False, pooling='avg')

# Print the shape of the CNN output for one sample picture
# (according to the original note it is a 2048 feature vector)
image = loadPicture("10002456.jpg")
print(CNN_model.predict(image).shape)

features = createFeatures(all_img_name, CNN_model)

# tokenizer fitted on all captions
flat_all_captions = flatten(all_captions)
tkz = extractTokens(flat_all_captions, 5000)

# number of entries in the tokenizer's word index, plus one
vocabSize = 1 + len(tkz.word_index)

# length of the longest caption, counted in whitespace-separated words
caption_max_length = max(len(caption.split()) for caption in flat_all_captions)

## Define functions used when building and training

In [None]:
def buildRNN(vocabSize, caption_max_length):
    """Build and compile the caption model.

    The model has two inputs, an image feature vector of length 2048 and a
    caption prefix of length ``caption_max_length``, and one softmax output
    over ``vocabSize`` entries.

    Args:
        vocabSize: size of the embedding input and of the output layer.
        caption_max_length: length of the (padded) caption input.

    Returns:
        The compiled model (categorical cross-entropy loss, adam optimizer).
    """
    # image branch: 2048 features -> dropout with rate 0.5 (against
    # overfitting) -> fully connected layer with 256 relu units
    image_input = Input(shape=(2048,))
    image_dropped = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)

    # caption branch: word indexes -> embedding of size 256 (index 0 is
    # masked) -> dropout with rate 0.5 -> LSTM with 256 units
    caption_input = Input(shape=(caption_max_length,))
    caption_embedded = Embedding(vocabSize, 256, mask_zero=True)(caption_input)
    caption_dropped = Dropout(0.5)(caption_embedded)
    caption_encoded = LSTM(256)(caption_dropped)

    # merge the two branches by element-wise addition and predict the next word
    merged = add([image_dense, caption_encoded])
    hidden = Dense(256, activation='relu')(merged)
    outputs = Dense(vocabSize, activation='softmax')(hidden)

    # tie it together: [image, sequence] -> word
    RNN_model = Model(inputs=(image_input, caption_input), outputs=outputs)
    RNN_model.compile(loss='categorical_crossentropy', optimizer='adam')

    return RNN_model

def data_generator(all_captions, features, tkz, caption_max_length):
    """Endlessly yield training samples, one picture at a time.

    A generator is used so that not all data has to be given to the model at
    once.

    Args:
        all_captions: dictionary with the image name as key and the list of
            its captions as value.
        features: dictionary with the image name as key; element ``[0]`` of
            the value is used as the vector representation of the picture.
        tkz: tokenizer passed on to ``createSequences``.
        caption_max_length: padding length passed on to ``createSequences``.

    Yields:
        ``([input_image, input_sequence], output_word)`` as returned by
        ``createSequences`` for one picture. Nothing is yielded when
        ``all_captions`` is empty.
    """
    # without this guard an empty dictionary would make the endless loop
    # below spin forever without ever yielding a sample
    if not all_captions:
        return

    while True:
        for im_ID, captions in all_captions.items():

            # vector representation of the picture
            feature = features[im_ID][0]

            # create the sequences for all captions of this picture
            input_image, input_sequence, output_word = createSequences(tkz, caption_max_length, captions, feature)

            yield ([input_image, input_sequence], output_word)
            
def createSequences(tkz, caption_max_length, captions, feature):
    """Turn the captions of one picture into training samples.

    Every caption is converted to a sequence of word indexes. For each
    position ``i`` from 1 on, the words before ``i`` (padded to
    ``caption_max_length``) form the input and the one-hot encoded word at
    ``i`` forms the expected output. The one-hot size is read from the
    module-level ``vocabSize``.

    Args:
        tkz: tokenizer used to convert a caption into word indexes.
        caption_max_length: length the input sequences are padded to.
        captions: list of caption texts belonging to the picture.
        feature: vector representation of the picture, repeated per sample.

    Returns:
        Three numpy arrays: the picture features, the padded input
        sequences and the one-hot encoded output words.
    """
    image_inputs, sequence_inputs, targets = [], [], []

    for caption in captions:
        # text -> list of word indexes
        token_ids = tkz.texts_to_sequences([caption])[0]

        for split_at in range(1, len(token_ids)):
            # all words before position split_at, padded to a fixed length
            prefix = pad_sequences([token_ids[:split_at]], maxlen=caption_max_length)[0]

            # the word at position split_at, one-hot encoded
            next_word = to_categorical([token_ids[split_at]], num_classes=vocabSize)[0]

            image_inputs.append(feature)
            sequence_inputs.append(prefix)
            targets.append(next_word)

    return np.array(image_inputs), np.array(sequence_inputs), np.array(targets)

## Build and train the model

In [None]:
model = buildRNN(vocabSize, caption_max_length)

# the generator yields one picture per step, so one epoch is one step per picture
steps_per_epoch = len(all_captions)

# Train one epoch per loop iteration and save the model after each of them
for i in range(epochs):
    filename = "model_" + str(i) + ".h5"

    gen = data_generator(all_captions, features, tkz, caption_max_length)

    # epochs=1: this loop already runs `epochs` times; passing `epochs` here
    # as well would train epochs * epochs epochs in total
    model.fit_generator(gen, epochs=1, steps_per_epoch=steps_per_epoch, verbose=1)

    model.save(filename)

## Load pre-saved model (so we dont have to train it every time)

In [None]:
# Each artefact is loaded exactly once.
# NOTE: pickle must only be used on trusted files.
with open("../input/long-training-overnight/results/tokenizer.p", "rb") as tokenizer_file:
    tkz = load(tokenizer_file)

CNN_model = tf.keras.applications.Xception(pooling = 'avg', include_top = False)
RNN_model = load_model('../input/long-training-overnight/results/model_8.h5')

# ./model_8.h5
# ./tokenizer.p


## Define functions used when testing the model

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` is ``integer``.

    Returns None when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)
    
def generate_caption(model, tkz, img_feature, caption_max_length):
    """Generate a caption for a picture, one word at a time.

    Starting from the ``start`` marker, the model is asked for the most
    probable next word given the picture and the caption so far. Generation
    stops after ``caption_max_length`` words, when the predicted index has no
    word in the tokenizer, or right after ``end`` or ``start`` was appended.

    Args:
        model: model whose ``predict([img_feature, sequence])`` output is
            reduced with argmax to a word index.
        tkz: tokenizer used in both directions (text -> indexes, index -> word).
        img_feature: vector representation of the picture.
        caption_max_length: padding length and maximum number of words added.

    Returns:
        The caption text, beginning with the ``start`` marker.
    """
    caption = start

    for _ in range(caption_max_length):
        # caption so far as word indexes, padded to the fixed input length
        token_ids = tkz.texts_to_sequences([caption])[0]
        padded = pad_sequences([token_ids], maxlen=caption_max_length)

        # index with the highest model output, translated back to a word
        probabilities = model.predict([img_feature, padded], verbose=0)
        word = word_for_id(np.argmax(probabilities), tkz)

        # happens when a tokenizer that does not match the captions has been used
        if word is None:
            break

        caption = caption + ' ' + word

        # stop at the end marker, or when the model wants to write start again
        if word in (end, start):
            break

    return caption

def test_model(CNN_model, RNN_model, tkz, im_ID):
    """Print the generated caption of picture ``im_ID`` and display the picture.

    Args:
        CNN_model: model that creates the vector representation of the image.
        RNN_model: model used to generate the caption.
        tkz: tokenizer used while generating the caption.
        im_ID: file name of the picture inside ``pathToImageFolder``.
    """
    # caption generation is shared with test_model2
    print(test_model2(CNN_model, RNN_model, tkz, im_ID))

    # display image
    plt.imshow(Image.open(pathToImageFolder + im_ID))

def test_model2(CNN_model, RNN_model, tkz, im_ID):
    """Return the generated caption of picture ``im_ID``.

    Args:
        CNN_model: model that creates the vector representation of the image.
        RNN_model: model used to generate the caption.
        tkz: tokenizer used while generating the caption.
        im_ID: file name of the picture inside ``pathToImageFolder``.

    Returns:
        The caption produced by ``generate_caption`` (using the module-level
        ``caption_max_length``).
    """
    # vector representation of the image
    feature = CNN_model.predict(loadPicture(im_ID))

    return generate_caption(RNN_model, tkz, feature, caption_max_length)

## Test the RNN model

In [None]:
# Print the generated caption and show the picture for a few sample images
test_model(CNN_model, RNN_model, tkz, '1355703632.jpg')

In [None]:
test_model(CNN_model, RNN_model, tkz, '1032122270.jpg')

In [None]:
test_model(CNN_model, RNN_model, tkz, '1009434119.jpg')

In [None]:
test_model(CNN_model, RNN_model, tkz, '1714937792.jpg')

# Combine the models to find pictures 

In [None]:
# class index predicted by the speech model for every test sound
word_predictions = np.argmax(speech_model.predict(X_test, verbose=1), axis=1)

In [None]:
# To quickly be able to go from a class index to a word.
# Index 0 is 'unknown' and the spoken words start at index 1 ('nine'), which
# matches the label mapping listed earlier in this notebook and the class list
# used for the confusion matrix. Without the 'unknown' entry every lookup is
# shifted by one word and index 35 ('wow') is out of range.
classes = ['unknown', 'nine', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go',
           'zero', 'one', 'two', 'three', 'four', 'five', 'six',
           'seven',  'eight', 'backward', 'bed', 'bird', 'cat', 'dog',
           'follow', 'forward', 'happy', 'house', 'learn', 'marvin', 'sheila', 'tree',
           'visual', 'wow']

In [None]:
# now use the CNN and RNN model to create a caption for each picture and keep the picture if the
# caption contains the word we chose

# index (in the test data) of the sound we are looking for a picture for
sound_index = 900
print(word_predictions[sound_index])
predicted_word = classes[word_predictions[sound_index]]
real_word = classes[Y_test[sound_index]]

print("We are looking for the sound with index", sound_index, ". We classified this sound as the word:", predicted_word,"(the real class for this word is:",real_word,")")

# stop searching once this many pictures have been found
pictures_Wanted = 3
pictures_looked_at = 0
found_images = 0
found_images_ID = []
found_pictures = 0

for img_name in all_img_name:
    pictures_looked_at += 1
    caption = test_model2(CNN_model, RNN_model, tkz, img_name)

    print("\n\n\nLooking at picture: ", pictures_looked_at)

    # compare against whole words: a substring test on the caption text would
    # also match e.g. "on" inside "person" or "no" inside "snow"
    if predicted_word in caption.split():
        print("found an image!")
        found_pictures += 1
        found_images_ID.append(img_name)

    if found_pictures == pictures_Wanted:
        break

if found_pictures == 0:
    print("\n\nCould not find a picture with the word")
else:
    print("\n\nWe found these pictures for sound : ",sound_index, "(",predicted_word, ")",found_images_ID)
    print("One of them looks like this; ")
    # display the picture that was found last
    img = Image.open(pathToImageFolder + found_images_ID[-1])
    plt.imshow(img)
    
    