#Predicting Memorability

In this notebook I have combined the videos captions and C3D features to predict video memorability and run them through an XGboost regressor model

##1. Set up and Libraries

In [0]:
from google.colab import drive   #Mount Google drive
import os
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/Machine Learning/Project/')

In [0]:
!pip install pyprind

In [0]:
#import panda and other libraries required

import pandas as pd
from keras import Sequential
from keras import layers
from keras import regularizers
import numpy as np
from string import punctuation
import pyprind
from collections import Counter
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras import preprocessing
import os

#Randomising for reproducability
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

In [0]:
def Get_score(Y_pred,Y_true):    #Spearman function
    '''Calculate the Spearmann"s correlation coefficient'''
    Y_pred = np.squeeze(Y_pred)
    Y_true = np.squeeze(Y_true)
    if Y_pred.shape != Y_true.shape:
        print('Input shapes don\'t match!')
    else:
        if len(Y_pred.shape) == 1:
            Res = pd.DataFrame({'Y_true':Y_true,'Y_pred':Y_pred})
            score_mat = Res[['Y_true','Y_pred']].corr(method='spearman',min_periods=1)
            print('The Spearman\'s correlation coefficient is: %.3f' % score_mat.iloc[1][0])
        else:
            for ii in range(Y_pred.shape[1]):
                Get_score(Y_pred[:,ii],Y_true[:,ii])

In [0]:
!pip install xgboost  #xgboost model and the libraries needed for it below

from numpy import loadtxt   
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import ensemble



##2. Captions

Loading Captions

In [0]:
# load labels and captions 
def read_caps(fname):
    """Load the captions into a dataframe"""
    vn = []
    cap = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            vn.append(pairs[0])
            cap.append(pairs[1])
        df['video']=vn
        df['caption']=cap
    return df


# load the captions txt file
cap_path = './dev-set_video-captions.txt'
df_cap=read_caps(cap_path)

# load the ground truth values from csv file
label_path = './'
labels=pd.read_csv(label_path+'dev-set_ground-truth.csv')

In [0]:
counts = Counter()
# setup prograss tracker
pbar = pyprind.ProgBar(len(df_cap['caption']), title='Counting word occurrences')
for i, cap in enumerate(df_cap['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in cap]).lower()
    df_cap.loc[i,'caption'] = text
    pbar.update()
    counts.update(text.split())

Counting word occurrences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Preprocessing Captions

In [0]:
# build the word index
len_token = len(counts)
tokenizer = Tokenizer(num_words=len_token)
print(len_token)

5191


In [0]:
tokenizer.fit_on_texts(list(df_cap.caption.values)) #fit a list of captions to the tokenizer
#the tokenizer vectorizes a text corpus, by turning each text into either a sequence of integers

In [0]:
one_hot_res = tokenizer.texts_to_matrix(list(df_cap.caption.values),mode='binary')
sequences = tokenizer.texts_to_sequences(list(df_cap.caption.values))

In [0]:
# calculating max length
max_len = 50

In [0]:
X_seq = np.zeros((len(sequences),max_len))  #creating sequences of tokenized captions (6000 rows of 50 columns, padded with zeros when cpations are not of length 50)
for i in range(len(sequences)):
    n = len(sequences[i])
    if n==0:
        print(i)
    else:
        X_seq[i,-n:] = sequences[i]
X_seq.shape

(6000, 50)

In [0]:
df_seq = pd.DataFrame(X_seq)    #create dataframe with the sequences
df_seq = pd.concat([df_cap, df_seq], axis = 1)   #merging sequnces dataframe and the captions dataframe to I can line up video name to each sequence
df_seq = df_seq.drop("caption", axis=1)  #removing column with word version of caption so we have a dataframe where the integer sequences are algined with the primary key (video names)

In [0]:
print(df_seq)  #final dataframe for captions

##3. C3D

In [0]:
def read_C3D(fname):   #function to read c3d vectors into dataframe
    with open(fname) as f:
        for line in f:
            C3D =[float(item) for item in line.split()] # convert to float type, using default separator
    return C3D

def vname2ID(vnames):  #function to read video name and format correclty for dataframe, which will be used as primary key to merge sequences dataframe and c3d dataframe
    vid = [ os.path.splitext(vn)[0]+'.webm' for vn in vnames]
    return vid

In [0]:
Feat_path = './Features/'    #creates file path to my features directory in drive

vid = labels.video.values  #pull in video labels first

c3d_features = pd.DataFrame({'video': vid,
                   'C3D': [read_C3D(Feat_path+'C3D'+'/'+os.path.splitext(item)[0]+'.txt') for item in vid],
                       }) #pulls c3d vectors from c3d feature txt file.
#This extraction to a dataframe takes quite awhile to run (15/20 minutes). 
#It usually fails the first 2 or 3 times on my device but works eventually. I believe this is due to processing power

In [0]:
print(c3d_features)

Splitting C3D values out into individual columns

In [0]:
features1 = pd.DataFrame(c3d_features.C3D.apply(pd.Series))  #split out c3d column into individual rows
features2 = pd.concat([c3d_features, features1], axis = 1)  #merges features and features1 (splits out into columns)
features2 = features2.drop("C3D", axis=1)    #drops column where all c3d features are combined in 1 column

##4. Combining Dataframes 

In [0]:
df_x = pd.merge(df_seq, features2, on='video')  #merging captions sequences dataframe to ce3 features dataframe on the column video

In [0]:
df_x = df_x.drop('video', axis = 1)  #drop video column

In [0]:
print(df_x)

##5. Test Data

Pulling test data

In [0]:
# load the test captions txt file
test_cap_path = './Test/test-set-1_video-captions.txt'
df_test_cap=read_caps(test_cap_path)

In [0]:
print(df_test_cap)

In [0]:
# Load testing data - this should load in test set of videos (again this can take around 10 mins to run)
test_path = './Test/'
#test_vid = os.listdir(test_path+'C3D_test')

vid = df_test_cap.video.values  #pull in video labels first

Features_test = pd.DataFrame({'video': vid,
                   'C3D': [read_C3D(test_path+'C3D_test'+'/'+os.path.splitext(item)[0]+'.txt') for item in vid],
                       })

#Features_test = pd.DataFrame({'video': test_vid,
                   #'C3D': [read_C3D(test_path+'C3D_test'+'/'+item) for item in test_vid],
                    #   })
X_test = np.stack(Features_test['C3D'].values)

In [0]:
print(Features_test)

Pre-processing test caption data to mirror dev data

In [0]:
counts = Counter()
# setup prograss tracker
pbar = pyprind.ProgBar(len(df_test_cap['caption']), title='Counting word occurrences')
for i, cap in enumerate(df_test_cap['caption']):
    # replace punctuations with space
    # convert words to lower case 
    text = ''.join([c if c not in punctuation else ' ' for c in cap]).lower()
    df_test_cap.loc[i,'caption'] = text
    pbar.update()
    counts.update(text.split())

In [0]:
# build the word index
len_token = len(counts)
tokenizer = Tokenizer(num_words=len_token)
print(len_token)

In [0]:
tokenizer.fit_on_texts(list(df_test_cap.caption.values))

In [0]:
one_hot_res = tokenizer.texts_to_matrix(list(df_test_cap.caption.values),mode='binary')
test_sequences = tokenizer.texts_to_sequences(list(df_test_cap.caption.values))

In [0]:
X_test_seq = np.zeros((len(test_sequences),max_len))
for i in range(len(test_sequences)):
    n = len(test_sequences[i])
    if n==0:
        print(i)
    else:
        X_test_seq[i,-n:] = test_sequences[i]
X_test_seq.shape

In [0]:
df_test_seq = pd.DataFrame(X_test_seq)    #create dataframe with the sequences
df_test_seq = pd.concat([df_test_cap, df_test_seq], axis = 1)   #merging sequnces dataframe and the captions dataframe to I can line up video name to each sequence
df_test_seq = df_test_seq.drop("caption", axis=1)  #removing column with word version of caption so we have a dataframe where the integer sequences are algined with the primary key (video names)

In [0]:
print(df_test_seq)

Pre-processing test c3d data to mirror dev

In [0]:
Features_test1 = pd.DataFrame(Features_test.C3D.apply(pd.Series))  #split out c3d column into individual rows

In [0]:
print(Features_test1)

In [0]:
Features_test2 = pd.concat([Features_test, Features_test1], axis = 1)  #merges features and features1 (splits out into columns)
Features_test2 = Features_test2.drop("C3D", axis=1)    #drops column where all c3d features are combined in 1 column

In [0]:
print(Features_test2)

Merging test captions sequences and test c3d features

In [0]:
df_test_x = pd.merge(df_test_seq, Features_test2, on='video')
df_test_x = df_test_x.drop('video', axis = 1)

In [0]:
print(df_test_x)

##6. Training Model using Gradient Boosting Regressor

In [0]:
Y_st = labels['short-term_memorability'].values # st targets
Y_lt = labels['long-term_memorability'].values  # lt targets
X = df_x.values # sequences & C3D merged

In [0]:
X_train_st, X_test_st, Y_train_st, Y_test_st = train_test_split(X,Y_st, test_size=0.2, random_state=42) 
#splitting the short term dev data into a train and validate split of 80 to 20 with a random state for reproducability

X_train_lt, X_test_lt, Y_train_lt, Y_test_lt = train_test_split(X,Y_lt, test_size=0.2, random_state=42) 
# #splitting the long term dev data into a train and validate split of 80 to 20 with a random state for reproducability

In [0]:
#Just testing to see shape of data split for Short Term
print('X_train', X_train_st.shape)
print('X_test', X_test_st.shape)
print('Y_train', Y_train_st.shape)
print('Y_test', Y_test_st.shape)

In [0]:
#Just testing to see shape of data split for Long Term
print('X_train', X_train_lt.shape)
print('X_test', X_test_lt.shape)
print('Y_train', Y_train_lt.shape)
print('Y_test', Y_test_lt.shape)

In [0]:
#Model Params - 650 decision tree, 12 depth, learning rate 0.01
params = {'n_estimators':650, 'max_depth':12, 'min_samples_split':2, 'learning_rate':0.01, 'loss':'lad'}
clf = ensemble.GradientBoostingRegressor(**params)


In [54]:
#fit to short term training set
clf.fit(X_train_st, Y_train_st)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='lad', max_depth=12,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=650,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [0]:
#predict for stm for test set
print('Short Term:')
print(Get_score(clf.predict(X_test_st), Y_test_st))

In [56]:
#fit to training set
clf.fit(X_train_lt, Y_train_lt)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='lad', max_depth=12,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=650,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [0]:
#predict for ltm for test set
print('Long Term:')
print(Get_score(clf.predict(X_test_lt), Y_test_lt))

In [0]:
a = pd.DataFrame(clf.predict(X_test_st))
a.columns = ['x']
b = pd.DataFrame(clf.predict(X_test_lt))
b.columns = ['y']


ab = pd.concat([a, b], axis=1)

print(ab)

In [0]:
#Mapping my predictions (on test split of videos) against the trained set for short term

#Y_pred_train = model.predict(X_train)
Y_pred_train = clf.predict(X_train_st)
Y = labels['short-term_memorability'].values
Y_train = Y_train_st
#Y_pred_val = model.predict(X_val)
Y_pred_val = clf.predict(X_test_st)
X = df_x.values
Y_val = Y_test_st

if len(Y.shape) == 2:
    plt.figure()
    plt.scatter(Y_pred_train[:,0],Y_pred_train[:,1],marker='o',c='r',label='train')
    plt.scatter(Y_pred_val[:,0],Y_pred_val[:,1],marker='x',c='g',label='val')
    plt.scatter(Y[:,0],Y[:,1],marker='x',c='b',label='true',alpha=0.1)
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.legend()
    plt.xlabel('short-term')
    plt.ylabel('long-term')
    plt.title('Training results')
    plt.show()
    
    # plot testing results
    Y_pred = model.predict(X_test[:,:],batch_size=32)
    plt.figure()
    plt.scatter(Y_pred[:,0],Y_pred[:,1],marker='o',c='r')
    plt.xlabel('short-term')
    plt.ylabel('long-term')
    plt.show()
    
else:
    plt.figure()
    plt.scatter(Y_pred_train,Y_train,marker='o',c='r',label='train')
    plt.scatter(Y_pred_val,Y_val,marker='x',c='g',label='val')
    #plt.scatter(Y[:,0],Y[:,1],marker='x',c='b',label='true',alpha=0.1)
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.legend()
    plt.xlabel('true')
    plt.ylabel('pred')
    plt.title('Training results [short-term]')
    plt.show()

In [0]:
#Mapping my predictions (on test split of videos) against the trained set for long term

#Y_pred_train = model.predict(X_train)
Y_pred_train = clf.predict(X_train_lt)
Y = labels['long-term_memorability'].values
Y_train = Y_train_lt
#Y_pred_val = model.predict(X_val)
Y_pred_val = clf.predict(X_test_lt)
X = df_x.values
Y_val = Y_test_lt

if len(Y.shape) == 2:
    plt.figure()
    plt.scatter(Y_pred_train[:,0],Y_pred_train[:,1],marker='o',c='r',label='train')
    plt.scatter(Y_pred_val[:,0],Y_pred_val[:,1],marker='x',c='g',label='val')
    plt.scatter(Y[:,0],Y[:,1],marker='x',c='b',label='true',alpha=0.1)
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.legend()
    plt.xlabel('short-term')
    plt.ylabel('long-term')
    plt.title('Training results')
    plt.show()
    
    # plot testing results
    Y_pred = model.predict(X_test[:,:],batch_size=32)
    plt.figure()
    plt.scatter(Y_pred[:,0],Y_pred[:,1],marker='o',c='r')
    plt.xlabel('short-term')
    plt.ylabel('long-term')
    plt.show()
    
else:
    plt.figure()
    plt.scatter(Y_pred_train,Y_train,marker='o',c='r',label='train')
    plt.scatter(Y_pred_val,Y_val,marker='x',c='g',label='val')
    #plt.scatter(Y[:,0],Y[:,1],marker='x',c='b',label='true',alpha=0.1)
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.legend()
    plt.xlabel('true')
    plt.ylabel('pred')
    plt.title('Training results [Long-term]')
    plt.show()

##7. Predictions on Test Data

Retrain on all 6000 and then predict versus extra 2000 videos (test set)

In [0]:
X_full_train = df_x.values
Y_full_train_st = labels['short-term_memorability'].values    
Y_full_train_lt = labels['long-term_memorability'].values

In [0]:
print('X_train', X_full_train.shape)
print('Y_train lt', Y_full_train_lt.shape)
print('Y_train st', Y_full_train_st.shape)

In [0]:
#Model Params
params = {'n_estimators':650, 'max_depth':12, 'min_samples_split':2, 'learning_rate':0.01, 'loss':'lad'}
clf_final_st = ensemble.GradientBoostingRegressor(**params)

In [65]:
#fit to training set short term
clf_final_st.fit(X_full_train, Y_full_train_st)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='lad', max_depth=12,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=650,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [66]:
prediction_st = clf_final_st.predict(df_test_x)
print('long term:')
print(prediction_st)

long term:
[0.86469816 0.84926329 0.8906271  ... 0.89554188 0.83799726 0.88480207]


In [0]:
#Model Params
params = {'n_estimators':650, 'max_depth':12, 'min_samples_split':2, 'learning_rate':0.01, 'loss':'lad'}
clf_final_lt = ensemble.GradientBoostingRegressor(**params)

In [68]:
#fit to training set long term
clf_final_lt.fit(X_full_train, Y_full_train_lt)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='lad', max_depth=12,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=650,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [69]:
#predict test set long term
prediction_lt = clf_final_lt.predict(df_test_x)
print('short term:')
print(prediction_lt)

short term:
[0.81577341 0.71708777 0.79955709 ... 0.8310571  0.7486182  0.80393799]


##8. Preparing CSV and Exporting

In [0]:
#Creating data frames for both st and lt predictions
df_predictions_final_st = pd.DataFrame(prediction_st)
df_predictions_final_st.columns = ['short-term_memorability']
df_predictions_final_lt = pd.DataFrame(prediction_lt)
df_predictions_final_lt.columns = ['long-term_memorability']

In [0]:
print(df_predictions_final_st)

In [0]:
print(df_predictions_final_lt)

In [0]:
my_predictions_st = pd.concat([df_test_cap, df_predictions_final_st], axis = 1)  #merges predictions st to captions
my_predictions_st = my_predictions_st.drop("caption", axis=1)    #drop captions row

In [0]:
print(my_predictions_st)

In [0]:
my_predictions_lt = pd.concat([df_test_cap, df_predictions_final_lt], axis = 1)  #merges predictions st to captions
my_predictions_lt = my_predictions_lt.drop("caption", axis=1)    #drop captions row

In [0]:
print(my_predictions_lt)

In [0]:
df_predictions_final = pd.merge(my_predictions_st, my_predictions_lt, on='video')
df_predictions_final = df_predictions_final.drop("video", axis=1)

In [0]:
print(df_predictions_final)    #Here is the data frame containing my short term and long term prediction having dropped the video column

In [0]:
template_path = './' #set path to current directory
template=pd.read_csv(label_path+'Copy of ground_truth_template.csv')  #pulls in csv file from google drive and store in template
template1 = pd.DataFrame(template)  #create data frame

In [0]:
print(template1)

In [0]:
ground_truth_values = template1.drop('short-term_memorability', axis=1)   #drop short term memorability row
ground_truth_values = ground_truth_values.drop('long-term_memorability', axis=1)   #drop long term memorability row


ground_truth_values = pd.concat([ground_truth_values, df_predictions_final], axis = 1)   #merge with my final predictions

ground_truth_values = ground_truth_values[['video', 'short-term_memorability','nb_short-term_annotations','long-term_memorability', 'nb_long-term_annotations']]  #rearrange columns to right order

In [0]:
print(ground_truth_values)

In [0]:
ground_truth_values.to_csv('ground_truth_values.csv')
!cp ground_truth_values.csv csv/