# Ensemble Learning to predict Video Memorability Scores

Importing the necessary packages

In [20]:
#importing all the necessary packages
import pandas as pd
import scipy
from keras import Sequential
from keras import layers
from keras import regularizers
import numpy as np
from string import punctuation
from collections import Counter
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from matplotlib import pyplot
import os
import glob
import io
from sklearn import ensemble
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import mlxtend
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [4]:
pip install PyPrind

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pyprind

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/souradipgoswami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

## Model 1: Random Forest Regressor

Load the Captions Dataset and Ground truth dataset

In [89]:
#function to load the captions into a data frame from a text file
def read_caps(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df
# load the captions
caption_path = '/Users/souradipgoswami/Desktop/dev-set_video-captions.txt'
df_caption=read_caps(caption_path)
#load ground truth
labels=pd.read_csv('/Users/souradipgoswami/Desktop/ground-truth.csv')

Steps to remove stopwords and punctuations from the Caption

In [90]:
#loading the nltk stopwords of English
stopwords = nltk.corpus.stopwords.words('english')
print(f'Length of Stopwords: {len(stopwords)}')

Length of Stopwords: 179


In [91]:
pbar = pyprind.ProgBar(len(df_caption['caption']), title='Counting word occurrences')
for i, caption in enumerate(df_caption['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in caption]).lower()
    #removing stopwords
    rmv_stopwords= ' '.join([word for word in text.split() if word not in stopwords])
    df_caption.loc[i,'caption'] = rmv_stopwords #updating the original captions 
    pbar.update()

Counting word occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [92]:
#Converting word to bag of captions using vectorizer
vectorizer = CountVectorizer(analyzer = "word",max_features=3112) 
captions_bag = vectorizer.fit_transform(df_caption.caption).toarray()
type(captions_bag)

numpy.ndarray

In [94]:
captions_bag.shape

(6000, 3112)

Preparing train and validation data and applying the algorithm

In [95]:
X = captions_bag
y = labels[['short-term_memorability','long-term_memorability']].values

In [96]:
# Splitting the dataset into the Training set and validation set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [97]:
#defining the Random Forest Regressor
captions_rf = RandomForestRegressor(n_estimators=200,random_state=42)

In [98]:
#fitting the regressor
captions_rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [99]:
#predicting the validation set
prediction1 = captions_rf.predict(X_test)

Creating the function for generating the scores

In [26]:
#Function to calculate Spearman coefficient scores
def Get_score(Y_pred,Y_true):
    '''Calculate the Spearmann"s correlation coefficient'''
    Y_pred = np.squeeze(Y_pred)
    Y_true = np.squeeze(Y_true)
    if Y_pred.shape != Y_true.shape:
        print('Input shapes don\'t match!')
    else:
        if len(Y_pred.shape) == 1:
            Res = pd.DataFrame({'Y_true':Y_true,'Y_pred':Y_pred})
            score_mat = Res[['Y_true','Y_pred']].corr(method='spearman',min_periods=1)
            print('The Spearman\'s correlation coefficient is: %.3f' % score_mat.iloc[1][0])
        else:
            for ii in range(Y_pred.shape[1]):
                Get_score(Y_pred[:,ii],Y_true[:,ii])

In [14]:
#generating the score
Get_score(prediction1, y_test)

The Spearman's correlation coefficient is: 0.414
The Spearman's correlation coefficient is: 0.179


# Model2: Multilayer Perceptron Model

Load the captions and ground truth dataset

In [140]:
def read_caps1(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df


# load the captions
caption_path1 = '/Users/souradipgoswami/Desktop/dev-set_video-captions.txt'
df_caption1=read_caps1(caption_path1)
#load ground truth
labels1=pd.read_csv('/Users/souradipgoswami/Desktop/ground-truth.csv')

Preprocessing the datasets to remove punctuations and apply one hot encoding 

In [141]:
## Lets count the words and remove punctuations
counts = Counter()
for i, caption in enumerate(df_caption1['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in caption]).lower()
    df_caption1.loc[i,'caption'] = text
    counts.update(text.split())

In [142]:
len_token = len(counts) # create length of token
tokenizer = Tokenizer(num_words=len_token) # use tokenizer

In [143]:
tokenizer.fit_on_texts(list(df_caption1.caption.values))

In [144]:
## Convert the integers into binary 0 and 1 format
one_hot_enc = tokenizer.texts_to_matrix(list(df_caption1.caption.values),mode='binary')
print(one_hot_enc)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


Defining the predictor variables and selecting the train and validation sets and applying the ML

In [145]:
#defining the predictor and response variables
predictor1 = labels1[['short-term_memorability','long-term_memorability']].values
Y = predictor1
X = one_hot_enc

In [146]:
#dcreating the training and validation data
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X,Y, test_size=0.2, random_state=42)

In [147]:
#defining and fitting the model
model = Sequential()
#adding drop out to prevent overfitting
model.add(layers.Dropout(0.8, input_shape=(len_token,)))
# two layers of 20 neurons each with selu activation function
model.add(layers.Dense(20,activation='selu',kernel_regularizer=regularizers.l2(0.001), kernel_initializer='zeros'))
model.add(layers.Dense(20,activation='selu',kernel_regularizer=regularizers.l2(0.001)))
model.add(layers.Dropout(0.8))
#final layer with 2 neurons and activation function as sigmoid
model.add(layers.Dense(2,activation='sigmoid'))
#using adamax as optimizer and Mean squared error as loss function
model.compile(optimizer='adamax',loss='mse',metrics=['accuracy'])
#fitting the model with 30 epochs
_model = model.fit(X_train1,Y_train1,epochs=30, validation_data=(X_test1,Y_test1))

Train on 4800 samples, validate on 1200 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [149]:
#predicting using 1200 validation samples
prediction2 = model.predict(X_test1)
prediction2

array([[0.87019867, 0.7860913 ],
       [0.85349554, 0.77410007],
       [0.86272764, 0.78063405],
       ...,
       [0.87508655, 0.78975636],
       [0.86597896, 0.7829913 ],
       [0.8611691 , 0.77951384]], dtype=float32)

In [37]:
#generating the scores
Get_score(prediction2, Y_test1)

The Spearman's correlation coefficient is: 0.401
The Spearman's correlation coefficient is: 0.211


# Model 3: Recurrent Neural Network

Load the captions and ground truth

In [38]:
def read_caps2(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df


# load the captions
caption_path2 = '/Users/souradipgoswami/Desktop/dev-set_video-captions.txt'
df_caption2=read_caps1(caption_path2)
#load ground truth
labels2=pd.read_csv('/Users/souradipgoswami/Desktop/ground-truth.csv')

Preprocessing the data to remove punctuations and apply sequence embedding

In [39]:
sequences = tokenizer.texts_to_sequences(list(df_caption2.caption.values))

In [40]:
## set max length of all dimensions to 50
max_length=50
X_seq = np.zeros((len(sequences),max_length))
for i in range(len(sequences)):
    n = len(sequences[i])
    if n==0:
        print(i)
    else:
        X_seq[i,-n:] = sequences[i]
X_seq.shape

(6000, 50)

Defining the predictor variables, training and validation sets and applying the ML

In [41]:
predictor2 = labels2[['short-term_memorability','long-term_memorability']].values
Y = predictor2
X = X_seq
## Train-Validation Split
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X,Y, test_size=0.2, random_state=42)

In [54]:
modelRNN=Sequential() # Create Sequential NN model

## add Embedding layer for RNN to map our data into a format suitable for LSTM layer
modelRNN.add(layers.Embedding(input_dim=5191, output_dim=20, input_length=50, init='uniform'))

## add LSTM layer for some hidden layer and memory into the network
modelRNN.add(layers.LSTM(200, activation='selu', recurrent_initializer='uniform', kernel_initializer='zeros', kernel_regularizer=regularizers.l2(0.001)))
modelRNN.add(layers.Dropout(0.8))

modelRNN.add(layers.Dense(50, activation='selu', kernel_regularizer=regularizers.l2(0.001)))
modelRNN.add(layers.Dropout(0.8))

## Output layer of 2 neurons for each score with sigmoid activation 
modelRNN.add(layers.Dense(2, activation='sigmoid'))

## Compile the model 
# Optimizer Adamax
modelRNN.compile(optimizer='adamax',loss='mse',metrics=['accuracy'])
_modelResult = modelRNN.fit(X_train2,Y_train2,epochs=30, validation_data=(X_test2,Y_test2))

  after removing the cwd from sys.path.


Train on 4800 samples, validate on 1200 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [55]:
#predicting the scores using 1200 samples
prediction3 = modelRNN.predict(X_test2)

In [56]:
#generating the Spearman correlation coefficient
Get_score(prediction3, Y_test2)

The Spearman's correlation coefficient is: 0.369
The Spearman's correlation coefficient is: 0.189


# Ensemble Model using Simple Average

In [57]:
#adding up the model prediction and averaging them
prediction_test1 = np.add(prediction1,prediction2)
prediction_test = np.add(prediction_test1,prediction3)

In [58]:
#predicitng the scores using simple average
prediction_testfinal = np.divide(prediction_test, 3)

In [59]:
#generating the spearman correlation coefficient
Get_score(prediction_testfinal, Y_test2)

The Spearman's correlation coefficient is: 0.450
The Spearman's correlation coefficient is: 0.213


The Simple Average ensemble suggests that it performs better than the individual models i.e. Random Forest regressor, MLP and RNN

Let us now look into the Stacking ensemble using Support Vector Regressor and Random Forest Regressor. 

# Support Vector Regression

In [60]:
#loading the caption and ground truth
def read_caps(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df
# load the captions
caption_path = '/Users/souradipgoswami/Desktop/dev-set_video-captions.txt'
df_caption=read_caps(caption_path)
#load ground truth
labels=pd.read_csv('/Users/souradipgoswami/Desktop/ground-truth.csv')

Steps to remove stopwords and punctuations

In [61]:
#loading the nltk stopwords of English
stopwords = nltk.corpus.stopwords.words('english')
print(f'Length of Stopwords: {len(stopwords)}')

Length of Stopwords: 179


In [62]:
pbar = pyprind.ProgBar(len(df_caption['caption']), title='Counting word occurrences')
for i, caption in enumerate(df_caption['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in caption]).lower()
    #removing stopwords
    rmv_stopwords= ' '.join([word for word in text.split() if word not in stopwords])
    df_caption.loc[i,'caption'] = rmv_stopwords #updating the original captions 
    pbar.update()

Counting word occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [63]:
#generating the caption bag using vectorizer
vectorizer = CountVectorizer(analyzer = "word",max_features=3112) 
captions_bag = vectorizer.fit_transform(df_caption.caption).toarray()
type(captions_bag)

numpy.ndarray

In [68]:
X = captions_bag
y_short = labels[['short-term_memorability']].values.ravel()
y_long = labels[['long-term_memorability']].values.ravel()
# Splitting the dataset into the Training set and Test set for short term scores
X_train,X_test,y_shorttrain,y_shorttest = train_test_split(X,y_short,test_size=0.2,random_state=42)
# Splitting the dataset into the Training set and Test set for long term scores
X_train,X_test,y_longtrain,y_longtest = train_test_split(X,y_long,test_size=0.2,random_state=42)

In [66]:
#creating a simple support vector regression model
svr = SVR(kernel='rbf')
#fitting the model for short-term memorability
svr.fit(X_train, y_shorttrain)
#predicting the short-term scores
pred = svr.predict(X_test)
#generating the spearman's correlation coefficient for short videos
Get_score(pred, y_shorttest)

The Spearman's correlation coefficient is: 0.419


In [69]:
#fitting the model for long-term memorability
svr.fit(X_train, y_longtrain)
#predicting the long-term scores
pred = svr.predict(X_test)
#generating the spearman's correlation coefficient for short videos
Get_score(pred, y_longtest)

The Spearman's correlation coefficient is: 0.179


# Stacking Ensemble Model using Random Forest and Support Vector Regression

In [2]:
def read_caps(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df
# load the captions
caption_path = '/Users/souradipgoswami/Desktop/dev-set_video-captions.txt'
df_caption=read_caps(caption_path)
#load ground truth
labels=pd.read_csv('/Users/souradipgoswami/Desktop/ground-truth.csv')

Steps to remove the stopwords and punctuations

In [7]:
#loading the nltk stopwords of English
stopwords = nltk.corpus.stopwords.words('english')
print(f'Length of Stopwords: {len(stopwords)}')

Length of Stopwords: 179


In [8]:
pbar = pyprind.ProgBar(len(df_caption['caption']), title='Counting word occurrences')
for i, caption in enumerate(df_caption['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in caption]).lower()
    #removing stopwords
    rmv_stopwords= ' '.join([word for word in text.split() if word not in stopwords])
    df_caption.loc[i,'caption'] = rmv_stopwords #updating the original captions 
    pbar.update()

Counting word occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [9]:
#generating the caption bag using vectorizer
vectorizer = CountVectorizer(analyzer = "word",max_features=3112) 
captions_bag = vectorizer.fit_transform(df_caption.caption).toarray()
type(captions_bag)

numpy.ndarray

In [10]:
X_stack = captions_bag
y_short1 = labels[['short-term_memorability']].values.ravel()
y_long1 = labels[['long-term_memorability']].values.ravel()
# Splitting the dataset into the Training set and Test set for short term scores
X_trainstack,X_teststack,y_shorttrain1,y_shorttest1 = train_test_split(X_stack,y_short1,test_size=0.2,random_state=42)
# Splitting the dataset into the Training set and Test set for long term scores
X_trainstack,X_teststack,y_longtrain1,y_longtest1 = train_test_split(X_stack,y_long1,test_size=0.2,random_state=42)

In [22]:
#Meta regressor model
gradboost1 = ensemble.GradientBoostingRegressor(n_estimators=200,learning_rate=0.01)
#Base models
rf1= RandomForestRegressor()
svr1 = SVR(kernel='rbf')

In [24]:
#Defining the stackingcv regressor model
stack = StackingCVRegressor(regressors=(svr1, rf1,gradboost1),
                            meta_regressor=gradboost1, cv=10,
                            use_features_in_secondary=True,
                            store_train_meta_features=True,
                            shuffle=True)

Fitting the model and predicting the short-term score

In [27]:
stack.fit(X_trainstack, y_shorttrain1)
pred = stack.predict(X_teststack)
#generating the Spearman's correlation coefficient
Get_score(pred, y_shorttest1)

The Spearman's correlation coefficient is: 0.452


Fitting the model and predicting the long-term score

In [28]:
stack.fit(X_trainstack, y_longtrain1)
pred = stack.predict(X_teststack)
#generating the Spearman's correlation coefficient
Get_score(pred, y_longtest1)

The Spearman's correlation coefficient is: 0.180


# End of Code for Training and Validation

# Test the Models with test data

# Random Forest Regressor

In [100]:
#function to load the captions into a data frame from a text file
def read_caps(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df
# load the captions
caption_path = '/Users/souradipgoswami/Desktop/test-set-1_video-captions.txt'
df_caption=read_caps(caption_path)
#load ground truth
labels=pd.read_csv('/Users/souradipgoswami/Desktop/ground_truth_template.csv')

In [101]:
#loading the nltk stopwords of English
stopwords = nltk.corpus.stopwords.words('english')
print(f'Length of Stopwords: {len(stopwords)}')

Length of Stopwords: 179


In [102]:
pbar = pyprind.ProgBar(len(df_caption['caption']), title='Counting word occurrences')
for i, caption in enumerate(df_caption['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in caption]).lower()
    #removing stopwords
    rmv_stopwords= ' '.join([word for word in text.split() if word not in stopwords])
    df_caption.loc[i,'caption'] = rmv_stopwords #updating the original captions 
    pbar.update()

Counting word occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [103]:
#Converting word to bag of captions using vectorizer
vectorizer = CountVectorizer(analyzer = "word") 
captions_bag = vectorizer.fit_transform(df_caption.caption).toarray()
type(captions_bag)

numpy.ndarray

In [112]:
test_pred1=captions_rf.predict(captions_bag)

# Multi-layer Perceptron Model

In [157]:
def read_caps1(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df


# load the captions
caption_path1 = '/Users/souradipgoswami/Desktop/test-set-1_video-captions.txt'
df_caption1=read_caps1(caption_path1)
#load ground truth
labels1=pd.read_csv('/Users/souradipgoswami/Desktop/ground_truth_template.csv')

In [158]:
## Lets count the words and remove punctuations
counts = Counter()
for i, caption in enumerate(df_caption1['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in caption]).lower()
    df_caption1.loc[i,'caption'] = text
    counts.update(text.split())

In [159]:
len_token = 5191 # create length of token
tokenizer = Tokenizer(num_words=len_token) # use tokenizer

In [160]:
tokenizer.fit_on_texts(list(df_caption1.caption.values))

In [161]:
## Convert the integers into binary 0 and 1 format
one_hot_enc = tokenizer.texts_to_matrix(list(df_caption1.caption.values),mode='binary')
print(one_hot_enc)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [164]:
test_pred2=model.predict(one_hot_enc)
test_pred2

array([[0.8489255 , 0.7709423 ],
       [0.8933321 , 0.8041558 ],
       [0.8798032 , 0.7933656 ],
       ...,
       [0.87109864, 0.7867634 ],
       [0.8692729 , 0.7854035 ],
       [0.8735341 , 0.78858244]], dtype=float32)

# Recurrent Neural Network

In [166]:
def read_caps2(fname):
    """Load the captions into a dataframe"""
    video = []
    caption = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            video.append(pairs[0])
            caption.append(pairs[1])
        df['video']=video
        df['caption']=caption
    return df


# load the captions
caption_path2 = '/Users/souradipgoswami/Desktop/test-set-1_video-captions.txt'
df_caption2=read_caps1(caption_path2)
#load ground truth
labels2=pd.read_csv('/Users/souradipgoswami/Desktop/ground_truth_template.csv')

In [167]:
sequences = tokenizer.texts_to_sequences(list(df_caption2.caption.values))

In [168]:
# set max length of all dimensions to 50
max_length=50
X_seq = np.zeros((len(sequences),max_length))
for i in range(len(sequences)):
    n = len(sequences[i])
    if n==0:
        print(i)
    else:
        X_seq[i,-n:] = sequences[i]
X_seq.shape

(2000, 50)

In [170]:
test_pred3=modelRNN.predict(X_seq)
test_pred3

array([[0.9414482 , 0.92183447],
       [0.932033  , 0.9053104 ],
       [0.9579474 , 0.94750667],
       ...,
       [0.88413733, 0.8230193 ],
       [0.84103835, 0.7474295 ],
       [0.8640305 , 0.78808594]], dtype=float32)

# Test for Simple Average

In [171]:
#adding up the model prediction and averaging them
prediction_sa = np.add(test_pred1,test_pred2)
prediction_safinal = np.add(prediction_sa,test_pred3)
prediction_finalsa = np.divide(prediction_safinal, 3)

In [180]:
pred_sa_new=pd.DataFrame()
pred_sa_new['video']=labels1['video']
pred_sa_new['short-term_memorability'] = prediction_finalsa[:,0]
pred_sa_new['nb_short-term_annotations']=labels1['nb_short-term_annotations']
pred_sa_new['long-term_memorability'] = prediction_finalsa[:,1]
pred_sa_new['nb_long-term_annotations']=labels1['nb_long-term_annotations']
pred_sa_new.head()

Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations
0,7494,0.884959,33,0.812964,12
1,7495,0.898505,34,0.824853,10
2,7496,0.897284,32,0.849102,13
3,7497,0.893106,33,0.812605,10
4,7498,0.871964,33,0.779047,10


In [184]:
#exporting the results for Simple Average
pred_sa_new.to_csv("Results_Simple Average.csv",index=False)