# CS 598 DLH Final Project Experiment 2
by Regan Brown (rnbrown3, group 51)

This is NOT intended as a bonus "Descriptive notebook". This contains all the source code for Experiment 2 of my final project.

Please see the Experiment 1 notebook for most instructions and explanations. Assume steps and reasonings are the same unless otherwise specified.

In order to ensure the proper environment to run this code, please download the latest versions of all libraries/imports mentioned in the code blocks. In addition, you will need to install necessary CoreNLP packages and start up an instance of the Stanford Core NLP server, following the instructions at https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK

You will need to either download the STS benchmark dataset from https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark, or use the copies I have provided in my repo. You will need to replace the filepaths in the .read_csv statements below accordingly; I have commented these spots for extra clarity.

## Preprocessing
The first steps to take are loading in the STS benchmark dataset, then extracting the sentence pairs and scores/labels for each of the four subsets of data we are performing the experiment on. Then we initialize the Stanford parser.

In [1]:
#be sure to start up Stanford Parser server following steps here: https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
from nltk.parse import CoreNLPParser
from nltk.tree import Tree, ParentedTree
import time #for tracking time to run
import tracemalloc #for tracking memory usage
_START_RUNTIME = time.time()
tracemalloc.start()

#first, load in the STS training data
import pandas as pd
#REPLACE FILEPATH HERE: PATH TO TRAIN DATA
df = pd.read_csv(r'C:\Users\rbrow\Downloads\Stsbenchmark\Stsbenchmark\stsbenchmark\sts-train.csv', sep='\t', on_bad_lines='skip', header=None, names=["genre", "file", "year", 'idx', 'score', 'sentence1', 'sentence2'])
MSRvid = df[df['file'] == 'MSRvid'] #1000 rows, all from 2012
headlines = df[(df['file'] == 'headlines') & df['year'].str.startswith('2013')] #597 rows
images2014 = df[(df['file'] == 'images') & df['year'].str.startswith('2014')] #497 rows
images2015 = df[(df['file'] == 'images') & df['year'].str.startswith('2015')] #503 rows

#load in the STS test data
import csv
#REPLACE FILEPATH HERE: PATH TO TEST DATA
df = pd.read_csv(r'C:\Users\rbrow\Downloads\Stsbenchmark\Stsbenchmark\stsbenchmark\sts-test.csv', quoting=csv.QUOTE_NONE, on_bad_lines='skip', sep='\t', header=None, names=["genre", "file", "year", 'idx', 'score', 'sentence1', 'sentence2'])
MSRvid_test = df[df['file'] == 'MSRvid'] #250 rows, all from 2012
headlines_test = df[(df['file'] == 'headlines') & df['year'].str.startswith('2013')] #72 rows
images2014_test = df[(df['file'] == 'images') & df['year'].str.startswith('2014')] #135 rows
images2015_test = df[(df['file'] == 'images') & df['year'].str.startswith('2015')] #115 rows

#MSRvid variables
labels = MSRvid['score'].tolist()
sentence1 = MSRvid['sentence1'].tolist()
sentence2 = MSRvid['sentence2'].tolist()
labels_test = MSRvid_test['score'].tolist()
sentence1_test = MSRvid_test['sentence1'].tolist()
sentence2_test = MSRvid_test['sentence2'].tolist()

#headlines variables
labels_h = headlines['score'].tolist()
sentence1_h = headlines['sentence1'].tolist()
sentence2_h = headlines['sentence2'].tolist()
labels_test_h = headlines_test['score'].tolist()
sentence1_test_h = headlines_test['sentence1'].tolist()
sentence2_test_h = headlines_test['sentence2'].tolist()

#images2014 variables
labels_4 = images2014['score'].tolist()
sentence1_4 = images2014['sentence1'].tolist()
sentence2_4 = images2014['sentence2'].tolist()
labels_test_4 = images2014_test['score'].tolist()
sentence1_test_4 = images2014_test['sentence1'].tolist()
sentence2_test_4 = images2014_test['sentence2'].tolist()

#images2015 variables
labels_5 = images2015['score'].tolist()
sentence1_5 = images2015['sentence1'].tolist()
sentence2_5 = images2015['sentence2'].tolist()
labels_test_5 = images2015_test['score'].tolist()
sentence1_test_5 = images2015_test['sentence1'].tolist()
sentence2_test_5 = images2015_test['sentence2'].tolist()

#initialize the Stanford parser
parser = CoreNLPParser(url='http://localhost:9000')

In [2]:
#implementation of SPO algorithm as outlined in the paper's pseudocode (Algorithm 1)
#to help make sense of this code, please check the label definitions here: https://stackoverflow.com/questions/1833252/java-stanford-nlp-part-of-speech-labels
def spo(sentence):
    tree = parser.raw_parse(sentence)
    tree = next(tree) #need to pull the Tree out of the iter
    
    subject = ""
    predicate = ""
    obj = ""
    for t in tree[0]:
        if t.label() == 'NP': #identify subject
            for s in t.subtrees():
                for n in s.subtrees():
                    if n.label().startswith("NN"):
                        subject = n[0]
        if t.label() == 'VP': #identify predicate
            for p in t.subtrees():
                for m in p.subtrees():
                    if m.label().startswith("VB"):
                        predicate = m[0]
        if t.label() == 'VP': #identify object (code based on code found here: https://github.com/HassanElmadany/Extract-SVO/blob/master/Subject_Verb_Object_Extractor.py)
            for k in t.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
                if k.label() in ['NP', 'PP']:
                    for c in k.subtrees(lambda c: c.label().startswith('NN')):
                        obj = c[0]
                else:
                    for c in k.subtrees(lambda c: c.label().startswith('JJ')):
                        obj = c[0]
    return [subject, predicate, obj]

In [3]:
#parse first sentences in sentence pairs, for both train and test sets
#for MSRvid
sentence1_parsed = []
for s in sentence1:
    parsed = spo(s)
    sentence1_parsed.append(parsed)
sentence1_parsed_test = []
for s in sentence1_test:
    parsed = spo(s)
    sentence1_parsed_test.append(parsed)
    
#for headlines
sentence1_parsed_h = []
for s in sentence1_h:
    parsed = spo(s)
    sentence1_parsed_h.append(parsed)
sentence1_parsed_test_h = []
for s in sentence1_test_h:
    parsed = spo(s)
    sentence1_parsed_test_h.append(parsed)
    
#for images2014
sentence1_parsed_4 = []
for s in sentence1_4:
    parsed = spo(s)
    sentence1_parsed_4.append(parsed)
sentence1_parsed_test_4 = []
for s in sentence1_test_4:
    parsed = spo(s)
    sentence1_parsed_test_4.append(parsed)

#for images2015
sentence1_parsed_5 = []
for s in sentence1_5:
    parsed = spo(s)
    sentence1_parsed_5.append(parsed)
sentence1_parsed_test_5 = []
for s in sentence1_test_5:
    parsed = spo(s)
    sentence1_parsed_test_5.append(parsed)

In [4]:
#parse second sentences in sentence pairs, for both train and test sets
sentence2_parsed = []
for s in sentence2:
    parsed = spo(s)
    sentence2_parsed.append(parsed)
sentence2_parsed_test = []
for s in sentence2_test:
    parsed = spo(s)
    sentence2_parsed_test.append(parsed)
    
#for headlines
sentence2_parsed_h = []
for s in sentence2_h:
    parsed = spo(s)
    sentence2_parsed_h.append(parsed)
sentence2_parsed_test_h = []
for s in sentence2_test_h:
    parsed = spo(s)
    sentence2_parsed_test_h.append(parsed)
    
#for images2014
sentence2_parsed_4 = []
for s in sentence2_4:
    parsed = spo(s)
    sentence2_parsed_4.append(parsed)
sentence2_parsed_test_4 = []
for s in sentence2_test_4:
    parsed = spo(s)
    sentence2_parsed_test_4.append(parsed)

#for images2015
sentence2_parsed_5 = []
for s in sentence2_5:
    parsed = spo(s)
    sentence2_parsed_5.append(parsed)
sentence2_parsed_test_5 = []
for s in sentence2_test_5:
    parsed = spo(s)
    sentence2_parsed_test_5.append(parsed)

In [5]:
#sentences where SPO could not parse out any of the subject, predicate, or object are meaningless to us
#Since accurate comparisons cannot be made, remove any pairs affected by this
import pandas as pd
#MSRvid data
df1 = pd.DataFrame(sentence1_parsed, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned = df1.values.tolist()
sentence2_cleaned = df2.values.tolist()
labels = df3.values.tolist()
#now do the same thing for test data
df1 = pd.DataFrame(sentence1_parsed_test, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_test, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_test = df1.values.tolist()
sentence2_cleaned_test = df2.values.tolist()
labels_test = df3.values.tolist()

#headlines data
df1 = pd.DataFrame(sentence1_parsed_h, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_h, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_h, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_h = df1.values.tolist()
sentence2_cleaned_h = df2.values.tolist()
labels_h = df3.values.tolist()
#now do the same thing for test data
df1 = pd.DataFrame(sentence1_parsed_test_h, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_test_h, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test_h, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_test_h = df1.values.tolist()
sentence2_cleaned_test_h = df2.values.tolist()
labels_test_h = df3.values.tolist()

#images2014 data
df1 = pd.DataFrame(sentence1_parsed_4, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_4, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_4, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_4 = df1.values.tolist()
sentence2_cleaned_4 = df2.values.tolist()
labels_4 = df3.values.tolist()
#now do the same thing for test data
df1 = pd.DataFrame(sentence1_parsed_test_4, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_test_4, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test_4, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_test_4 = df1.values.tolist()
sentence2_cleaned_test_4 = df2.values.tolist()
labels_test_4 = df3.values.tolist()

#images2015 data
df1 = pd.DataFrame(sentence1_parsed_5, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_5, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_5, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_5 = df1.values.tolist()
sentence2_cleaned_5 = df2.values.tolist()
labels_5 = df3.values.tolist()
#now do the same thing for test data
df1 = pd.DataFrame(sentence1_parsed_test_5, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_parsed_test_5, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test_5, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
cleaned_df = combined_df[(combined_df.S1 != '') & (combined_df.P1 != '') & (combined_df.O1 != '')]
cleaned_df = cleaned_df[(cleaned_df.S2 != '') & (cleaned_df.P2 != '') & (cleaned_df.O2 != '')]
display(cleaned_df)
#now split these back out into separate lists; still need to process those via Word2Vec
df1 = cleaned_df.iloc[:,:3]
df2 = cleaned_df.iloc[:,3:6]
df3 = cleaned_df.iloc[:,6:]
sentence1_cleaned_test_5 = df1.values.tolist()
sentence2_cleaned_test_5 = df2.values.tolist()
labels_test_5 = df3.values.tolist()

Unnamed: 0,S1,P1,O1,S2,P2,O2,label
1,man,playing,flute,man,playing,flute,3.80
2,man,spreading,pizza,man,spreading,pizza,3.80
3,men,playing,chess,men,playing,chess,2.60
4,man,playing,cello,man,playing,cello,4.25
7,man,playing,piano,man,playing,guitar,1.60
...,...,...,...,...,...,...,...
994,man,playing,flute,boy,vacuuming,floor,0.00
995,woman,playing,guitar,woman,cuts,meat,0.75
996,men,playing,soccer,man,riding,motorcycle,0.00
997,woman,running,beach,dog,swimming,pool,0.00


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
0,girl,styling,hair,girl,is,hair,2.50
1,men,play,beach,boys,playing,beach,3.60
2,woman,measuring,woman,woman,measures,woman,5.00
3,man,cutting,cucumber,man,slicing,cucumber,4.20
4,man,playing,harp,man,playing,keyboard,1.50
...,...,...,...,...,...,...,...
244,swimmers,jump,water,Swimmers,racing,lake,1.75
245,man,spins,board,man,putting,chicken,0.00
246,man,jumps,curb,man,riding,skateboard,2.75
247,person,dices,pepper,cook,slicing,peppers,1.80


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
1,SC,dismisses,case,SC,dismisses,verdict,4.4
2,Explosion,hits,Syria,Explosion,hits,cities,2.6
4,Castro,celebrates,birthday,Castro,celebrates,birthday,4.2
11,Obama,backs,Sea,Obama,calm,Sea,2.4
12,China,kills,attack,Teenager,kills,attack,3.2
...,...,...,...,...,...,...,...
580,mayor,saves,fire,mayor,burning,house,4.2
585,Bombs,kill,wound,Thailand,kill,wound,3.0
586,lawmakers,take,sanctions,lawmakers,vote,president,0.2
587,Obama,need,weapons,gunman,die,hands,0.4


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
1,bomber,kills,church,bomber,kills,Pakistan,0.75
4,Berri,launch,government,Spain,changed,government,0.4
7,Nigeria,killed,crash,Nigeria,opens,crash,1.6
10,airport,evacuated,threat,airport,evacuated,threat,4.0
16,forces,kill,protests,forces,kill,Damascus-activists,1.2
20,party,submits,presidency,party,aims,presidency,3.6
21,Publisher,acquitted,case,PBS,publishes,impartiality,0.4
25,Strauss,retires,cricket,Strauss,retires,cricket,4.8
32,forces,shell,rebels,rebels,clash,province,2.6
37,Germany,dodges,quarter,Eurozone,avoids,Germany,2.0


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
1,eyes,standing,chair,cat,stands,floor,2.6
3,bus,driving,street,bus,driving,street,4.0
4,train,waiting,station,train,sits,station,4.8
6,computer,sitting,floor,computer,sitting,floor,3.8
8,people,eat,outside,people,sitting,table,2.8
...,...,...,...,...,...,...,...
489,bird,perched,hand,bird,perched,person,4.4
490,baby,holds,packet,highchair,holds,packet,4.0
494,cat,looking,camera,dog,looking,camera,0.8
495,bird,sitting,ground,bird,sitting,branch,2.5


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
0,boys,look,shed,area,looking,shed,2.80
2,man,sleeps,lap,chair,holding,baby,4.00
3,cats,looking,window,cat,looking,window,2.60
4,person,wearing,structure,boots,standing,motorcycle,1.20
7,helmet,sits,bicycle,girl,wearing,background,3.40
...,...,...,...,...,...,...,...
125,gentleman,looking,motorcycle,man,looking,motorcycle,4.00
127,airplane,parked,grass,shirt,tied,railing,0.00
130,people,walking,mushroom,people,walking,mushroom,4.00
133,cows,look,camera,sink,looking,camera,0.75


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
0,dogs,playing,ball,dogs,play,football,4.4
2,children,stand,fence,children,standing,fence,4.6
3,dog,emerges,mouth,dog,walking,mouth,4.8
6,dog,swims,pool,dog,swimming,pool,4.6
8,dog,runs,woods,dog,running,woods,4.0
...,...,...,...,...,...,...,...
497,person,squeezing,face,dog,running,area,0.0
498,Dog,running,mouth,dog,swims,mouth,2.0
499,girl,running,beach,girl,running,beach,4.2
500,player,throws,ball,player,holds,ball,1.6


Unnamed: 0,S1,P1,O1,S2,P2,O2,label
0,dogs,play,grass,dogs,playing,snow,2.8
1,building,painted,jack,building,painted,Jack,4.8
2,dog,swims,water,cape,running,snow,1.4
4,dog,runs,mouth,dogs,play,skyline,1.8
5,bird,flies,water,bird,flies,water,4.8
...,...,...,...,...,...,...,...
110,dog,runs,water,dog,runs,field,2.0
111,girl,running,path,girl,talking,phone,0.8
112,man,colored,houses,boat,passes,houses,5.0
113,sunglasses,holds,hands,top,blowing,bubble,1.0


In [6]:
#Word2Vec conversion. Dimensions and procedure match what's in the paper
#Point of ambiguity: we only ever have a single word for a subject, predicate, or object; but paper seems to suggest sometimes
#that there can be multi-word subjects/predicates/objects
import os
import numpy as np
RANDOM_SEED = 23432098
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)

import gensim
from gensim.models import Word2Vec
#MSRvid data
#build a Word2Vec model for train data and one for test data
train_sentences = sentence1_cleaned + sentence2_cleaned
test_sentences = sentence1_cleaned_test + sentence2_cleaned_test
w2v1 = Word2Vec(train_sentences, vector_size=50, workers=1, min_count=1)
w2v1_test = Word2Vec(test_sentences, vector_size=50, workers=1, min_count=1)
#then to get the sentences_final, pull out the .wv for each word in the sentence and transpose it
sentence1_final = []
sentence2_final = []
sentence1_final_test = []
sentence2_final_test = []
for s in sentence1_cleaned:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence1_final.append(words)
for s in sentence2_cleaned:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence2_final.append(words)
for s in sentence1_cleaned_test:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence1_final_test.append(words)
for s in sentence2_cleaned_test:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence2_final_test.append(words)
    
#headlines data
#build a Word2Vec model for train data and one for test data
train_sentences = sentence1_cleaned_h + sentence2_cleaned_h
test_sentences = sentence1_cleaned_test_h + sentence2_cleaned_test_h
w2v1 = Word2Vec(train_sentences, vector_size=50, workers=1, min_count=1)
w2v1_test = Word2Vec(test_sentences, vector_size=50, workers=1, min_count=1)
#then to get the sentences_final, pull out the .wv for each word in the sentence and transpose it
sentence1_final_h = []
sentence2_final_h = []
sentence1_final_test_h = []
sentence2_final_test_h = []
for s in sentence1_cleaned_h:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence1_final_h.append(words)
for s in sentence2_cleaned_h:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence2_final_h.append(words)
for s in sentence1_cleaned_test_h:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence1_final_test_h.append(words)
for s in sentence2_cleaned_test_h:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence2_final_test_h.append(words)
    
#images2014 data
#build a Word2Vec model for train data and one for test data
train_sentences = sentence1_cleaned_4 + sentence2_cleaned_4
test_sentences = sentence1_cleaned_test_4 + sentence2_cleaned_test_4
w2v1 = Word2Vec(train_sentences, vector_size=50, workers=1, min_count=1)
w2v1_test = Word2Vec(test_sentences, vector_size=50, workers=1, min_count=1)
#then to get the sentences_final, pull out the .wv for each word in the sentence and transpose it
sentence1_final_4 = []
sentence2_final_4 = []
sentence1_final_test_4 = []
sentence2_final_test_4 = []
for s in sentence1_cleaned_4:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence1_final_4.append(words)
for s in sentence2_cleaned_4:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence2_final_4.append(words)
for s in sentence1_cleaned_test_4:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence1_final_test_4.append(words)
for s in sentence2_cleaned_test_4:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence2_final_test_4.append(words)

#images2015 data
#build a Word2Vec model for train data and one for test data
train_sentences = sentence1_cleaned_5 + sentence2_cleaned_5
test_sentences = sentence1_cleaned_test_5 + sentence2_cleaned_test_5
w2v1 = Word2Vec(train_sentences, vector_size=50, workers=1, min_count=1)
w2v1_test = Word2Vec(test_sentences, vector_size=50, workers=1, min_count=1)
#then to get the sentences_final, pull out the .wv for each word in the sentence and transpose it
sentence1_final_5 = []
sentence2_final_5 = []
sentence1_final_test_5 = []
sentence2_final_test_5 = []
for s in sentence1_cleaned_5:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence1_final_5.append(words)
for s in sentence2_cleaned_5:
    words = []
    for w in s:
        mat = w2v1.wv[w]
        words.append(mat.transpose())
    sentence2_final_5.append(words)
for s in sentence1_cleaned_test_5:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence1_final_test_5.append(words)
for s in sentence2_cleaned_test_5:
    words = []
    for w in s:
        mat = w2v1_test.wv[w]
        words.append(mat.transpose())
    sentence2_final_test_5.append(words)

## Model Definition and Training

In [7]:
#define the CNN model
#I am using MaxPool2d as opposed to k-max pooling as we know the sentences should always be the same size
#I have left the print statements intact so you can see how the values become smaller/closer to zero
#just uncomment, then run to see
import torch
import torch.nn as nn
import torch.nn.functional as F
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(50, 17, kernel_size=(3,3), padding=1)
        self.conv2 = nn.Conv2d(17, 6, kernel_size=(3,3), padding=1)
        #self.pool1 = nn.MaxPool2d(3) #errors if set to 3 as the original paper uses for its pooling
        self.pool2 = nn.MaxPool2d(1)
        self.fc1 = nn.Linear(18, 1)

    def forward(self, x_prime):
        x_prime = F.relu(self.conv1(x_prime))
        #print("After first conv layer:")
        #print(x_prime)
        x_prime = F.relu(self.conv2(x_prime))
        #print("After second conv layer:")
        #print(x_prime)
        x_prime = self.pool2(x_prime)
        #print("After first pool:")
        #print(x_prime)
        x_prime = self.pool2(x_prime)
        #print("After second pool:")
        #print(x_prime)
        x_prime = x_prime.view(-1, 18)
        #print("View X prime:")
        #print(x_prime)
        x_prime = self.fc1(x_prime)
        #print("After FC layer:")
        #print(x_prime) #show what is being output from the model
        return x_prime

In [8]:
#define the data loaders
#to do this, construct the training data by binding together final sentence1 and sentence2 with their target score
#MSRvid data
df1 = pd.DataFrame(sentence1_final, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
train_data = combined_df.values.tolist()
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
#repeat for test data
df1 = pd.DataFrame(sentence1_final_test, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_test, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
test_data = combined_df.values.tolist()
val_loader = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=False)

#headlines data
df1 = pd.DataFrame(sentence1_final_h, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_h, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_h, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
train_data_h = combined_df.values.tolist()
train_loader_h = torch.utils.data.DataLoader(train_data_h, batch_size=64, shuffle=True)
#repeat for test data
df1 = pd.DataFrame(sentence1_final_test_h, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_test_h, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test_h, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
test_data_h = combined_df.values.tolist()
val_loader_h = torch.utils.data.DataLoader(test_data_h, batch_size=64, shuffle=False)

#images2014 data
df1 = pd.DataFrame(sentence1_final_4, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_4, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_4, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
train_data_4 = combined_df.values.tolist()
train_loader_4 = torch.utils.data.DataLoader(train_data_4, batch_size=64, shuffle=True)
#repeat for test data
df1 = pd.DataFrame(sentence1_final_test_4, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_test_4, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test_4, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
test_data_4 = combined_df.values.tolist()
val_loader_4 = torch.utils.data.DataLoader(test_data_4, batch_size=64, shuffle=False)

#images2015 data
df1 = pd.DataFrame(sentence1_final_5, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_5, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_5, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
train_data_5 = combined_df.values.tolist()
train_loader_5 = torch.utils.data.DataLoader(train_data_5, batch_size=64, shuffle=True)
#repeat for test data
df1 = pd.DataFrame(sentence1_final_test_5, columns = ["S1", "P1", "O1"])
df2 = pd.DataFrame(sentence2_final_test_5, columns = ["S2", "P2", "O2"])
df3 = pd.DataFrame(labels_test_5, columns = ["label"])
combined_df = df1.join(df2)
combined_df = combined_df.join(df3)
test_data_5 = combined_df.values.tolist()
val_loader_5 = torch.utils.data.DataLoader(test_data_5, batch_size=64, shuffle=False)

I couldn't tell from the original paper if they had trained the model on each dataset in isolation (i.e generating a new instance of the model for each dataset), or trained one model with all the datasets. I opted for the first approach. Note the only transformation I do on the Manhattan distance this time is scaling it by a factor of 100; the original paper notes that we don't need to do the same transform as in Experiment 1 as the score values are continuous instead of binary this time.

In [9]:
#now train the model
criterion = nn.MSELoss()
model = SimpleCNN()
model_h = SimpleCNN()
model_4 = SimpleCNN()
model_5 = SimpleCNN()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
n_epochs = 10
from scipy.spatial.distance import cityblock
import math
import random
from tqdm import tqdm
def train_model(model, train_dataloader, n_epoch=n_epochs, optimizer=optimizer, criterion=criterion):
    import torch.optim as optim
    model.train() # prep model for training
    for epoch in range(n_epoch):
        curr_epoch_loss = []
        for s1s, s1p, s1o, s2s, s2p, s2o, target in tqdm(train_dataloader):
            #first, process s1 and s2 through the model
            #ensure the batch size is accurate
            batch = s1s.shape[0]
            s1 = np.concatenate([s1s,s1p,s1o])
            s1 = np.reshape(s1, (batch,50,3,1))
            s1 = torch.tensor(s1) 
            s1_processed = model(s1)
            s2 = np.concatenate([s2s,s2p,s2o])
            s2 = np.reshape(s2,(batch,50,3,1))
            s2 = torch.tensor(s2)
            s2_processed = model(s2)
            #need to detach to perform manhattan distance calculation, otherwise throws error
            s1_detached = s1_processed.detach()
            s2_detached = s2_processed.detach()
            y_hats = torch.empty(target.shape[0])
            for i in range(target.shape[0]):
                s1_detached_i = torch.flatten(s1_detached)
                s2_detached_i = torch.flatten(s2_detached)
                #now calculate manhattan distance
                manhattan = cityblock(s1_detached_i, s2_detached_i)
                y_hat = torch.tensor(manhattan) * 100 #based on observed output values from model, need to multiply to scale values
                y_hats[i] = y_hat
            y_hats = y_hats.requires_grad_()
            target = target.float()
            loss = criterion(y_hats,target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            curr_epoch_loss.append(loss.cpu().data.numpy())
        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
    return model
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
model = train_model(model, train_loader)
model_h = train_model(model_h, train_loader_h)
model_4 = train_model(model_4, train_loader_4)
model_5 = train_model(model_5, train_loader_5)

  return default_collate([torch.as_tensor(b) for b in batch])
100%|██████████| 13/13 [00:00<00:00, 50.50it/s]


Epoch 0: curr_epoch_loss=3.930372953414917


100%|██████████| 13/13 [00:00<00:00, 52.99it/s]


Epoch 1: curr_epoch_loss=3.9497084617614746


100%|██████████| 13/13 [00:00<00:00, 51.33it/s]


Epoch 2: curr_epoch_loss=3.955994129180908


100%|██████████| 13/13 [00:00<00:00, 53.88it/s]


Epoch 3: curr_epoch_loss=3.8489224910736084


100%|██████████| 13/13 [00:00<00:00, 50.44it/s]


Epoch 4: curr_epoch_loss=3.942751407623291


100%|██████████| 13/13 [00:00<00:00, 51.44it/s]


Epoch 5: curr_epoch_loss=3.8198726177215576


100%|██████████| 13/13 [00:00<00:00, 54.97it/s]


Epoch 6: curr_epoch_loss=4.14233922958374


100%|██████████| 13/13 [00:00<00:00, 53.52it/s]


Epoch 7: curr_epoch_loss=3.9616148471832275


100%|██████████| 13/13 [00:00<00:00, 51.00it/s]


Epoch 8: curr_epoch_loss=3.8235697746276855


100%|██████████| 13/13 [00:00<00:00, 52.34it/s]


Epoch 9: curr_epoch_loss=3.887640953063965


100%|██████████| 4/4 [00:00<00:00, 54.29it/s]


Epoch 0: curr_epoch_loss=4.999321460723877


100%|██████████| 4/4 [00:00<00:00, 56.61it/s]


Epoch 1: curr_epoch_loss=5.482011795043945


100%|██████████| 4/4 [00:00<00:00, 56.89it/s]


Epoch 2: curr_epoch_loss=4.972720146179199


100%|██████████| 4/4 [00:00<00:00, 56.62it/s]


Epoch 3: curr_epoch_loss=5.366934299468994


100%|██████████| 4/4 [00:00<00:00, 57.44it/s]


Epoch 4: curr_epoch_loss=5.792056083679199


100%|██████████| 4/4 [00:00<00:00, 53.10it/s]


Epoch 5: curr_epoch_loss=5.736116409301758


100%|██████████| 4/4 [00:00<00:00, 52.89it/s]


Epoch 6: curr_epoch_loss=5.5301103591918945


100%|██████████| 4/4 [00:00<00:00, 53.82it/s]


Epoch 7: curr_epoch_loss=5.271693229675293


100%|██████████| 4/4 [00:00<00:00, 57.95it/s]


Epoch 8: curr_epoch_loss=5.0240888595581055


100%|██████████| 4/4 [00:00<00:00, 54.31it/s]


Epoch 9: curr_epoch_loss=5.235907554626465


100%|██████████| 4/4 [00:00<00:00, 49.77it/s]


Epoch 0: curr_epoch_loss=3.214266777038574


100%|██████████| 4/4 [00:00<00:00, 50.14it/s]

Epoch 1: curr_epoch_loss=3.0992441177368164



100%|██████████| 4/4 [00:00<00:00, 52.01it/s]


Epoch 2: curr_epoch_loss=3.105893611907959


100%|██████████| 4/4 [00:00<00:00, 49.63it/s]


Epoch 3: curr_epoch_loss=3.155287027359009


100%|██████████| 4/4 [00:00<00:00, 50.28it/s]


Epoch 4: curr_epoch_loss=2.953911781311035


100%|██████████| 4/4 [00:00<00:00, 51.97it/s]


Epoch 5: curr_epoch_loss=2.986299991607666


100%|██████████| 4/4 [00:00<00:00, 50.69it/s]


Epoch 6: curr_epoch_loss=3.0852112770080566


100%|██████████| 4/4 [00:00<00:00, 51.77it/s]


Epoch 7: curr_epoch_loss=3.2000341415405273


100%|██████████| 4/4 [00:00<00:00, 51.23it/s]


Epoch 8: curr_epoch_loss=2.9050850868225098


100%|██████████| 4/4 [00:00<00:00, 52.16it/s]


Epoch 9: curr_epoch_loss=3.403782367706299


100%|██████████| 6/6 [00:00<00:00, 52.42it/s]


Epoch 0: curr_epoch_loss=6.493733882904053


100%|██████████| 6/6 [00:00<00:00, 51.99it/s]


Epoch 1: curr_epoch_loss=6.498775005340576


100%|██████████| 6/6 [00:00<00:00, 52.62it/s]


Epoch 2: curr_epoch_loss=6.496604919433594


100%|██████████| 6/6 [00:00<00:00, 53.43it/s]


Epoch 3: curr_epoch_loss=6.262085437774658


100%|██████████| 6/6 [00:00<00:00, 51.38it/s]


Epoch 4: curr_epoch_loss=6.288381099700928


100%|██████████| 6/6 [00:00<00:00, 51.98it/s]


Epoch 5: curr_epoch_loss=6.417821407318115


100%|██████████| 6/6 [00:00<00:00, 49.89it/s]


Epoch 6: curr_epoch_loss=6.320668697357178


100%|██████████| 6/6 [00:00<00:00, 48.70it/s]


Epoch 7: curr_epoch_loss=6.413120269775391


100%|██████████| 6/6 [00:00<00:00, 52.19it/s]


Epoch 8: curr_epoch_loss=6.185629367828369


100%|██████████| 6/6 [00:00<00:00, 55.78it/s]

Epoch 9: curr_epoch_loss=6.422279357910156





## Evaluating the Model
The Pearson's correlation between the predicted and true values for each dataset is shown in the table below this cell. In the next cell, the cumulative runtime and memory usage is displayed.

In [10]:
#Evaluate the model on the test data
def eval_model(model, dataloader):
    model.eval()
    Y_pred = []
    Y_true = []
    for s1s, s1p, s1o, s2s, s2p, s2o, target in dataloader:
        
        batch = s1s.shape[0]
        s1 = np.concatenate([s1s,s1p,s1o])
        s1 = np.reshape(s1, (batch,50,3,1))
        s1 = torch.tensor(s1)
        s1_processed = model(s1)
        s2 = np.concatenate([s2s,s2p,s2o])
        s2 = np.reshape(s2,(batch,50,3,1))
        s2 = torch.tensor(s2)
        s2_processed = model(s2)
        s1_detached = s1_processed.detach()
        s2_detached = s2_processed.detach()
        y_hats = torch.empty(target.shape[0])
        for i in range(target.shape[0]):
            s1_detached_i = torch.flatten(s1_detached)
            s2_detached_i = torch.flatten(s2_detached)
            #now calculate manhattan distance
            manhattan = cityblock(s1_detached_i, s2_detached_i)
            y_hat = torch.tensor(manhattan) * 100
            y_hats[i] = y_hat
        Y_pred.append(y_hats)
        Y_true.append(target)
    Y_pred = np.concatenate(Y_pred, axis=0)
    Y_true = np.concatenate(Y_true, axis=0)
    return Y_pred, Y_true

#print metrics
y_pred, y_true = eval_model(model, val_loader)
y_pred_h, y_true_h = eval_model(model_h, val_loader_h)
y_pred_4, y_true_4 = eval_model(model_4, val_loader_4)
y_pred_5, y_true_5 = eval_model(model_5, val_loader_5)
print(y_pred_h) #printed so you can see constant values
#for Experiment 2, we want to find Pearson's correlation between truth and predicted
from scipy import stats
pMSRvid = stats.pearsonr(y_true, y_pred).statistic
pheadlines = stats.pearsonr(y_true_h, y_pred_h).statistic
pimages2014 = stats.pearsonr(y_true_4, y_pred_4).statistic
pimages2015 = stats.pearsonr(y_true_5, y_pred_5).statistic
headers = ["MSRvid", "headlines", "images2014", "images2015"]
stats = [pMSRvid, pheadlines, pimages2014, pimages2015]
print(pd.DataFrame(stats,headers))

[0.32342672 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672
 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672
 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672
 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672 0.32342672
 0.32342672]
                   0
MSRvid      0.176221
headlines        NaN
images2014  0.065386
images2015 -0.030861




In [11]:
print("Total running time = {:.2f} seconds".format(time.time() - _START_RUNTIME))
print("Current and Peak Memory Usage:")
print(tracemalloc.get_traced_memory())
tracemalloc.stop()

Total running time = 152.84 seconds
Current and Peak Memory Usage:
(59204048, 59572440)
