In [None]:
from utils import *
from tensorflow.keras.models import Sequential, Model, Model
from tensorflow.keras.layers import LSTM, Embedding, Input, Dense, Add
from tensorflow.keras.utils import plot_model

### Loading data 

In [None]:
# Location of the Flickr8k images and caption files
dataset_image_path ="flickr8k/Images/"
dataset_text_path  ="flickr8k/captions.txt" 
# Wanted shape for images
wanted_shape = (224,224,3)

In [None]:
# To obtain the text dataset corresponding to images
train, infer = False, True

if train:
    df_texts = pd.read_csv(dataset_text_path, sep=",") #["image","caption"] 
elif infer:
    df_texts = pd.read_csv("df_texts.csv", index_col=0) # ["image","caption","cleaned","cleaned_tokenized"]
    print("df_texts loaded")
    print(df_texts.head())

n_img = df_texts.count()/5 # 40455/5 
unique_img = pd.unique(df_texts["image"])# 8091 unique images

### Preprocessing images with pretrained VGG16 : FEATURE MAPS 4096

In [None]:
base_model = VGG16(
    include_top=True, weights='imagenet', input_tensor=None,
    input_shape=wanted_shape, pooling=None, classes=1000
)
# Feature extraction
vgg_model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output) #end the modèle with a 4096 feature layer

In [None]:
charge_image, one_by_one = False, False# false to gain time when testing other parts
# To obtain the feature maps
if train :
    if charge_image:
        feature_maps = np.array([vgg_model.predict(load_img_from_ds(unique_img[i])) for i in range(len(unique_img))])
        print(f"Shape des fm {feature_maps.shape}")
    elif one_by_one:
        feature_maps=[]
        for i in range(len(unique_img)):
            if i!=0:
                print(f"{i}/{len(unique_img)} - time elapsed :{time.time()-a}")
            else:
                print(f"{i}/{len(unique_img)}")
            a=time.time()
            img = load_img_from_ds(unique_img[i])
            feature_map = vgg_model.predict(img)
            feature_maps.append(feature_map)
        feature_maps=np.array(feature_maps)
        #save to csv
        feature_maps_sav=feature_maps[:,0,:]
        df_fm = pd.DataFrame(feature_maps_sav)
        df_fm.to_csv("image_feature_maps.csv")

elif infer:
    df_fm = pd.read_csv("image_feature_maps.csv")
    feature_maps = np.array(df_fm.drop([df_fm.columns[0]], axis=1))
    print(f"Image feature maps loaded : {feature_maps.shape}")

### Preprocessing captions - WORD2VEC : EMBEDDINGS 4096

In [6]:
 if train :
    # Text preprocessing
    df_texts["cleaned"]=[process_sentence(s) for s in df_texts["caption"]]
    df_texts["cleaned_tokenized"]=[word_tokenize(w) for w in df_texts["cleaned"]]
    
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    word2vec_model = gensim.models.Word2Vec([word_tokenize(w) for w in df_texts["cleaned"]], min_count=1, size=4096)
    word2vec_model.save("word2vec.model")
    
    text_features = word2vec(df_texts,word2vec_model.wv)
    np.save("text_feature_maps.npy",text_features)
    df_texts.to_csv("text_feature_maps.csv")

elif infer:
    #word2vec_model = gensim.models.Word2Vec.load("word2vec.model")
    #vocab_size = len(word2vec_model.wv.vocab)
    print("Word2vec model loaded.")
    text_features  = np.load("text_feature_maps.npy", allow_pickle=True)
    print(f"Text features loaded. {text_features.shape}")

Word2vec model loaded.
Text features loaded. (40455,)


### Preparing total model inputs

In [7]:
if(len(feature_maps)==8091):
    n_images_considered = 5#int(len(feature_maps)/9)
    # Reduce memory consumption
    feature_maps = feature_maps[:n_images_considered]
    text_features= text_features[:n_images_considered*5]

print(f"Image feature maps : {feature_maps.shape}\nText features : {text_features.shape}")

Image feature maps : (5, 4096)
Text features : (25,)


In [8]:
dimages = feature_maps

In [9]:
def multiple_feature_maps(dimages):
    Ximage = []
    for image in dimages:
        for i in range (5):
            Ximage.append(image)
    Ximage=np.array(Ximage)
    return Ximage

In [10]:
dfeaturemaps = multiple_feature_maps(dimages)
print(f"Images duplicated {dfeaturemaps.shape}")

Images duplicated (25, 4096)


In [11]:
# Split dataset
def split_test_val_train(df, Ntest, Nval):
    return(df[:Ntest],
           df[Ntest:Ntest+Nval],
           df[Ntest+Nval:])
    """
    return(df[:1000],
           df[1000:20000],
           df[20000:21000])
    """

In [12]:
# Split du dataset
prop_test, prop_val = 0.2, 0.2
N = dfeaturemaps.shape[0]#len(df_texts["cleaned_tokenized"])
Ntest, Nval = int(N*prop_test), int(N*prop_val)

Nimg = len(dimages)
# dt = true image caption cleaned
dt_test, dt_val, dt_train = split_test_val_train(text_features, Ntest, Nval)
# di = true image array
di_test, di_val, di_train = split_test_val_train(dfeaturemaps, Ntest, Nval)
# fnm = image_name
fnm_test, fnm_val, fnm_train = split_test_val_train(df_texts["image"], Ntest, Nval)

In [13]:
print(f" Train: text {dt_train.shape}, image {di_train.shape}, fnm {fnm_train.shape}")
print(f" Test : text {dt_test.shape}, image {di_test.shape}, fnm {fnm_test.shape}")
print(f" Val  : text {dt_val.shape}, image {di_val.shape}, fnm {fnm_val.shape}")

 Train: text (15,), image (15, 4096), fnm (40445,)
 Test : text (5,), image (5, 4096), fnm (5,)
 Val  : text (5,), image (5, 4096), fnm (5,)


In [14]:
def stringtoarray(text):
    txt = []
    text = text.replace("[", "")
    text = text.replace("]", "")
    text = text.split()
    for i in text:
        txt.append(float(i))
    return txt

In [15]:
def finalpreprocessing(dftext, dfimage, vocab_size):
    print("# captions/images = {}".format(len(dftext)))
    
    maxlen = np.max([len(text) for text in dftext])
    Xtext, Ximage, ytext = [], [], []
    step = 0
    for text, image in zip(dftext, dfimage):
        step += 1
        # Commented because our array is a numpy ndarray dtype=float32
        #text = stringtoarray(text)
        for i in range(0, len(text)):
            in_text, out_text = text[:i], text[i]
            in_text = pad_sequences([in_text], maxlen=maxlen).flatten()
            out_text = to_categorical(out_text, num_classes = vocab_size)
            
            Xtext.append(in_text)
            Ximage.append(image)
            ytext.append(out_text)
    print(f"Number of step/associated image and caption {step}")
    
    Xtext = np.array(Xtext)
    Ximage = np.array(Ximage)
    ytext = np.array(ytext)
    
    return(Xtext, Ximage, ytext, maxlen)

In [16]:
vocab_size = 8747#len(word2vec_model.wv.vocab)

Xtext_train, Ximage_train, ytext_train, maxlen = finalpreprocessing(dt_train, di_train, vocab_size) 
Xtext_val, Ximage_val, ytext_val, _ = finalpreprocessing(dt_val, di_val, vocab_size)

print(f"Vocab size {vocab_size}")
print(f"Training set : \n \tInput image : {Ximage_train.shape}\n\tInput text : {Xtext_train.shape}\n\tOutput text : {ytext_train.shape}")

# captions/images = 15
Number of step/associated image and caption 15
# captions/images = 5
Number of step/associated image and caption 5
Vocab size 8747
Training set : 
 	Input image : (132, 4096)
	Input text : (132,)
	Output text : (132, 4096, 8747)


In [19]:
for i in range(len(Xtext_train)):
    if(1 in Xtext_train[i]):
        print("Maeva a raison !")
print("end")

end


### RNN Model 

In [21]:
dim_embedding=64

# image input
input_img = Input(shape=(Ximage_train.shape[1],), name="InputImage")
input_img = ( Dense(units=256,activation='relu',name="CompressedImageFeatures"))(input_img)
# text input
input_txt = Input(shape=(maxlen,), name="InputSequence")
input_txt = Embedding(vocab_size,dim_embedding, mask_zero=True)(input_txt)
input_txt = LSTM(units=256, activation="relu", name="CaptionFeatures")(input_txt)

# Common part
common = Add()([input_txt, input_img])
common = Dense(256, activation='relu')(common)
common = Dense(vocab_size, activation='softmax')(common)

#Model
total_model  = Model(inputs=[input_img, input_txt],outputs=common)

model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

Note that input tensors are instantiated via `tensor = tf.keras.Input(shape)`.
The tensor that caused the issue was: CompressedImageFeatures/Relu_1:0
Note that input tensors are instantiated via `tensor = tf.keras.Input(shape)`.
The tensor that caused the issue was: CaptionFeatures/strided_slice_7:0


ValueError: Graph disconnected: cannot obtain value for tensor Tensor("InputSequence_1:0", shape=(None, 12), dtype=float32) at layer "embedding_1". The following previous layers were accessed without issue: []

In [None]:
plot_model(model, to_file="model.png")

### Model training 

In [None]:
hist = model.fit([Ximage_train, Xtext_train], ytext_train, epochs=5, verbose=2, batch_size=64, validation_data=([Ximage_val, Xtext_val], ytext_val))

### Model evaluation

In [None]:
for label in ["loss", "val_loss"]:
    plt.plot(hist.history[label], label=label)]
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()

In [None]:
'''
PREDICTION
'''

# 1 couche 256 LSTM ?
# A partir de combien de couce=hes c est ok 1 8 16 32 256 
# Temps d entrainement : compromis 
# Voir si dimensions pas trop grandes ?
# GRU ! :D mieux (3 params au lieu de 4)
# simpleRNN ? 
# Etude comparative : 3 RNN (simple, LSTM, GRU & Etude de perf)
# Limiter Dataset ! => entrainements en O(heure)

#tf.keras.utils.get_file(origin="lien", fname="nom_que_tu_veux_donner_au_fichier.zip", extract=True)