In [26]:
import torch
import pickle as pkl
import pandas as pd
import numpy as np

In [27]:
import os

# Remember the directory the kernel started in so this cell can be re-run
# safely: a bare os.chdir('..') climbs one level higher on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Work from the parent of the starting directory (the same target the
# original os.chdir('..') reaches on a first run).
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [28]:
# Report whether PyTorch can see a CUDA device (the recorded run printed True);
# the device selection in a later cell targets 'cuda:0'.
torch.cuda.is_available()

True

In [29]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding checkpoint to load.
model_name = 'bert-base-nli-mean-tokens'
# Notebook-wide encoder; get_embedding() and the scratch cells call model.encode().
model = SentenceTransformer(model_name)

In [30]:
# Prefer the first CUDA device, but fall back to the CPU so the notebook still
# runs on machines without a GPU (moving the model to 'cuda:0' would fail there).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [31]:
# Move the encoder's parameters onto `device`. Module.to() returns the module,
# so as the cell's last expression it also prints the model architecture.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [20]:
# Load the ARC article bodies (the preview shows 'Body ID' and 'articleBody').
# NOTE(review): absolute path — only resolves where /gpt_embeddings exists.
arc_body = pd.read_csv('/gpt_embeddings/pure_arc_body.csv')

In [21]:
arc_body

Unnamed: 0,Body ID,articleBody
0,90000,I'm curious when Democrats pull the lever for ...
1,90001,This is the worst article this newspaper has e...
2,90002,"'No spic Inglés, señor.' -Escúchame: No temas,..."
3,90003,Not a good idea because it would set a precede...
4,90004,I find it laughable that all the liberals argu...
...,...,...
4443,94443,"My girls' friend was texting through a movie, ..."
4444,94444,"If home captivity is the issue, then how to ex..."
4445,94445,It is easier for us at home and at our lake ca...
4446,94446,"Wonderful essay, thanks. The villain isn't ele..."


In [22]:
# Load the ARC headline/stance pairs (the preview shows 'Headline', 'Body ID'
# and 'Stance').
# NOTE(review): absolute path — only resolves where /gpt_embeddings exists.
arc_head = pd.read_csv('/gpt_embeddings/pure_arc_stance.csv')

In [23]:
arc_head

Unnamed: 0,Headline,Body ID,Stance
0,We don't need to hear from candidate's spouse,90628,agree
1,Comment sections have failed,91932,unrelated
2,Casinos are not too much of a gamble,92724,unrelated
3,New York's bike lanes are not working,93804,unrelated
4,Drone attacks make more harm,91445,unrelated
...,...,...,...
14228,3-D printing will change the world,90050,unrelated
14229,Mother Teresa should be canonized,91820,unrelated
14230,Comment sections have failed,93169,unrelated
14231,Colleges do not need humanities programs,93466,unrelated


In [16]:
import nltk
from nltk.tokenize import sent_tokenize

In [24]:
# Break every headline and every article body into a list of sentence strings.
# NOTE: sent_tokenize needs NLTK's "punkt" tokenizer data to be installed.
arc_head['sentences'] = arc_head['Headline'].map(sent_tokenize)
arc_body['sentences'] = arc_body['articleBody'].map(sent_tokenize)

In [None]:
def get_embedding(sent):
    """Encode `sent` with the notebook-level SentenceTransformer `model`.

    `sent` is passed straight to ``model.encode``. In this notebook it is one
    row's list of sentences (the output of ``sent_tokenize``), and whatever
    ``encode`` produces is returned unchanged.
    """
    encoded = model.encode(sent)
    return encoded

In [34]:
# One model.encode() call per article: each row's sentence list becomes that
# row's embedding array.
arc_body['embeddings'] = arc_body['sentences'].map(get_embedding)

In [35]:
# One model.encode() call per headline: each row's sentence list becomes that
# row's embedding array.
arc_head['embeddings'] = arc_head['sentences'].map(get_embedding)

In [49]:
# Attach each headline row to its article via 'Body ID' (default inner join).
# Columns present in both frames ('sentences', 'embeddings') get the
# '_head' / '_body' suffixes.
arc_train = pd.merge(
    arc_head,
    arc_body,
    on='Body ID',
    suffixes=('_head', '_body'),
)

In [50]:
# Convert every stored embedding array into a torch tensor, body column first
# and head column second (same order as before).
for _col in ('embeddings_body', 'embeddings_head'):
    arc_train[_col] = arc_train[_col].apply(torch.tensor)

In [67]:
# Integer class id for every stance label.
label_mapping = {'unrelated': 0, 'agree': 1, 'discuss': 2, 'disagree': 3}

# Series.map() yields NaN for any value missing from the dict, so running this
# cell a second time (when the column already holds 0-3) used to wipe every
# label. Encode only while un-encoded labels remain, and fail loudly on an
# unknown label instead of silently writing NaN.
_stance = arc_train['Stance']
if not _stance.isin(list(label_mapping.values())).all():
    _encoded = _stance.map(label_mapping)
    if _encoded.isna().any():
        _unknown = sorted(_stance[_encoded.isna()].astype(str).unique())
        raise ValueError(f'Unmapped Stance labels: {_unknown}')
    arc_train['Stance'] = _encoded

In [69]:
arc_train

Unnamed: 0,Headline,Body ID,Stance,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
0,We don't need to hear from candidate's spouse,90628,1,[We don't need to hear from candidate's spouse],"[[tensor(0.7675), tensor(0.9844), tensor(2.252...",Wives should never speak at conventions or on ...,[Wives should never speak at conventions or on...,"[[tensor(-0.0928), tensor(0.9080), tensor(1.39..."
1,Guns should be permitted on college campuses,90628,0,[Guns should be permitted on college campuses],"[[tensor(0.5609), tensor(0.5146), tensor(0.873...",Wives should never speak at conventions or on ...,[Wives should never speak at conventions or on...,"[[tensor(-0.0928), tensor(0.9080), tensor(1.39..."
2,Young people are being irresponsible by delayi...,90628,0,[Young people are being irresponsible by delay...,"[[tensor(0.5083), tensor(0.3331), tensor(1.156...",Wives should never speak at conventions or on ...,[Wives should never speak at conventions or on...,"[[tensor(-0.0928), tensor(0.9080), tensor(1.39..."
3,Guns should be permitted on college campuses,90628,0,[Guns should be permitted on college campuses],"[[tensor(0.5609), tensor(0.5146), tensor(0.873...",Wives should never speak at conventions or on ...,[Wives should never speak at conventions or on...,"[[tensor(-0.0928), tensor(0.9080), tensor(1.39..."
4,California doesn't need high-speed rail,90628,0,[California doesn't need high-speed rail],"[[tensor(0.9430), tensor(0.2814), tensor(0.311...",Wives should never speak at conventions or on ...,[Wives should never speak at conventions or on...,"[[tensor(-0.0928), tensor(0.9080), tensor(1.39..."
...,...,...,...,...,...,...,...,...
14228,Greece will destroy the Euro Zone,91602,0,[Greece will destroy the Euro Zone],"[[tensor(-0.2398), tensor(0.3448), tensor(0.36...",I disagree with coersion being the option. An ...,"[I disagree with coersion being the option., A...","[[tensor(0.7141), tensor(0.6541), tensor(1.680..."
14229,Prostituion is safer when it's legal,90826,3,[Prostituion is safer when it's legal],"[[tensor(-0.4055), tensor(-0.5117), tensor(0.7...",Prostitution is harmful and dehumanizing and e...,[Prostitution is harmful and dehumanizing and ...,"[[tensor(-0.2160), tensor(0.0737), tensor(-0.1..."
14230,Fees are necessary,92816,0,[Fees are necessary],"[[tensor(-0.1438), tensor(-0.5031), tensor(1.9...",The fact Bush got elected twice and Perry seve...,[The fact Bush got elected twice and Perry sev...,"[[tensor(-0.0203), tensor(0.3725), tensor(-0.2..."
14231,We don't need libraries,91291,3,[We don't need libraries],"[[tensor(0.9495), tensor(0.7203), tensor(1.357...",It will be a sad day for all of us when librar...,[It will be a sad day for all of us when libra...,"[[tensor(-0.5685), tensor(0.5428), tensor(1.05..."


In [43]:
from sklearn.model_selection import train_test_split

In [70]:
# Load the pickled FNC training frame (presumably the FNC counterpart of
# arc_train — it is concatenated with it in a later cell). The context manager
# closes the file handle that a bare open(...) argument leaves dangling.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('/gpt_embeddings/fnc_train.pkl', 'rb') as _fh:
    fnc_train = pkl.load(_fh)

In [71]:
# Stack the ARC rows on top of the FNC rows (row-wise concat is the default).
# Each frame keeps its own index labels, so labels may repeat across the two
# sources — TODO confirm nothing downstream relies on a unique index.
total_train = pd.concat((arc_train, fnc_train))

In [74]:
# Sanity check: the distinct Stance values of the combined frame. The recorded
# run shows only the integer ids 0-3, i.e. no NaN from the label encoding.
total_train.Stance.unique()

array([1, 0, 2, 3], dtype=int64)

In [75]:
# Hold out 20% of the rows as a validation set; random_state=42 makes the
# shuffle reproducible.
# NOTE: total_train is rebound to the 80% split, so re-running this cell alone
# splits the already-reduced frame again — re-run the concat cell first.
total_train, total_val = train_test_split(total_train, test_size=0.2, random_state=42)

In [76]:
# Persist both splits. `with` guarantees each file is flushed and closed,
# which the original open(...) arguments never did explicitly.
# NOTE(review): these paths are relative ('gpt_embeddings/...') while other
# cells in this notebook use '/gpt_embeddings/...' — confirm both resolve to
# the same directory.
for _frame, _path in (
    (total_train, 'gpt_embeddings/total_train.pkl'),
    (total_val, 'gpt_embeddings/total_val.pkl'),
):
    with open(_path, 'wb') as _fh:
        pkl.dump(_frame, _fh)

In [78]:
# Read the two splits back from disk into independent *_copy frames, closing
# each file as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('gpt_embeddings/total_train.pkl', 'rb') as _fh:
    total_train_copy = pkl.load(_fh)
with open('gpt_embeddings/total_val.pkl', 'rb') as _fh:
    total_val_copy = pkl.load(_fh)

In [79]:
total_train_copy

Unnamed: 0,Headline,Body ID,Stance,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
38384,Isis leader Abu Bakr al-Baghdadi's 'wife and s...,1913,0,[Isis leader Abu Bakr al-Baghdadi's 'wife and ...,"[[tensor(-0.2697), tensor(0.9552), tensor(-0.1...",A blast near the Nicaraguan capital city of Ma...,[A blast near the Nicaraguan capital city of M...,"[[tensor(-0.0075), tensor(0.4806), tensor(-0.3..."
20073,"Lady on FB: I'm 41, Intersex, and Fucked Micha...",879,0,"[Lady on FB: I'm 41, Intersex, and Fucked Mich...","[[tensor(-0.4410), tensor(0.7908), tensor(0.08...",Matt Taibbi is taking a “leave of absence” fro...,[Matt Taibbi is taking a “leave of absence” fr...,"[[tensor(0.0762), tensor(-0.0156), tensor(1.25..."
8404,"James Wright Foley, Kidnapped Journalist, Appa...",139,0,"[James Wright Foley, Kidnapped Journalist, App...","[[tensor(0.3436), tensor(0.1727), tensor(-0.29...",Judd Nelson rebuffs Internet rumors that he di...,[Judd Nelson rebuffs Internet rumors that he d...,"[[tensor(0.3818), tensor(0.8142), tensor(0.356..."
26909,Michael Brown shooting audio caught on tape?,195,0,[Michael Brown shooting audio caught on tape?],"[[tensor(0.5646), tensor(0.4658), tensor(-0.23...",A video posted by ISIL terrorists in Iraq purp...,[A video posted by ISIL terrorists in Iraq pur...,"[[tensor(0.4329), tensor(0.6699), tensor(-0.40..."
7671,Marijuana is a gateway drug,91046,0,[Marijuana is a gateway drug],"[[tensor(-0.0491), tensor(0.0707), tensor(0.99...",The whole world of nutrition is a mind field f...,[The whole world of nutrition is a mind field ...,"[[tensor(-0.1277), tensor(0.2603), tensor(0.08..."
...,...,...,...,...,...,...,...,...
48337,Seven teenage girls get pregnant on school trip,1804,1,[Seven teenage girls get pregnant on school trip],"[[tensor(-0.4397), tensor(0.0199), tensor(-0.0...","Seven girls from a Bosnian school, all aged be...","[Seven girls from a Bosnian school, all aged b...","[[tensor(-0.5624), tensor(0.6778), tensor(-0.4..."
23925,Macaulay Culkin Dead: Actor Once Again Victim ...,498,0,[Macaulay Culkin Dead: Actor Once Again Victim...,"[[tensor(0.3356), tensor(0.4754), tensor(1.345...",Twitter users are suggesting the phallic daubi...,[Twitter users are suggesting the phallic daub...,"[[tensor(-0.3486), tensor(1.1035), tensor(-0.6..."
860,Big banks are not out of control,92226,0,[Big banks are not out of control],"[[tensor(0.4441), tensor(-0.8802), tensor(0.44...",Thank you. I believe this is what is important...,"[Thank you., I believe this is what is importa...","[[tensor(-0.0789), tensor(0.0615), tensor(2.75..."
1562,Texas City plumber's truck ends up in Syrian war,2344,0,[Texas City plumber's truck ends up in Syrian ...,"[[tensor(0.1310), tensor(0.5537), tensor(-0.24...",North Korea may have a woman at the helm–Kim J...,[North Korea may have a woman at the helm–Kim ...,"[[tensor(0.1755), tensor(-0.2210), tensor(1.11..."


In [80]:
total_val_copy

Unnamed: 0,Headline,Body ID,Stance,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
47361,"Nun gives birth, intends to keep baby",391,0,"[Nun gives birth, intends to keep baby]","[[tensor(0.9288), tensor(0.3911), tensor(1.291...",The Department of Homeland Security flatly den...,[The Department of Homeland Security flatly de...,"[[tensor(-0.2243), tensor(0.7804), tensor(-0.1..."
21733,Kim Jong-un So Fat He Injured His Ankles,1103,0,[Kim Jong-un So Fat He Injured His Ankles],"[[tensor(0.0239), tensor(0.1279), tensor(-0.37...","When most parents think of a school trip, they...","[When most parents think of a school trip, the...","[[tensor(-0.5554), tensor(-0.5530), tensor(0.9..."
12631,Christian Bale in Talks to Play Steve Jobs in ...,2455,2,[Christian Bale in Talks to Play Steve Jobs in...,"[[tensor(0.4828), tensor(0.2569), tensor(0.634...","Cristian Bale is eyeing — or i-ing, if you pre...","[Cristian Bale is eyeing — or i-ing, if you pr...","[[tensor(-0.2053), tensor(1.2861), tensor(-0.2..."
43533,That Was Fast: Christian Bale Bails on Steve J...,1841,1,[That Was Fast: Christian Bale Bails on Steve ...,"[[tensor(-0.0392), tensor(0.6430), tensor(0.68...",After early discussions to play the Apple foun...,[After early discussions to play the Apple fou...,"[[tensor(0.0424), tensor(0.4565), tensor(0.255..."
48539,How Canadian Sergeant-at-Arms Kevin Vickers ki...,1472,2,[How Canadian Sergeant-at-Arms Kevin Vickers k...,"[[tensor(0.5736), tensor(0.6943), tensor(-0.68...","Kevin Vickers, sergeant-at-arms of the House o...","[Kevin Vickers, sergeant-at-arms of the House ...","[[tensor(0.0711), tensor(0.3848), tensor(0.545..."
...,...,...,...,...,...,...,...,...
3391,Abolish Birthright Citizenship,90226,0,[Abolish Birthright Citizenship],"[[tensor(0.0406), tensor(0.5931), tensor(0.190...","Nope. I, a male, taught at an all girls high s...","[Nope., I, a male, taught at an all girls high...","[[tensor(0.4658), tensor(0.2772), tensor(2.624..."
46992,SEND IN THE DRONES: Predator drones flown over...,2186,0,[SEND IN THE DRONES: Predator drones flown ove...,"[[tensor(0.1675), tensor(0.5462), tensor(0.105...",A hoax that went viral on the internet has lef...,[A hoax that went viral on the internet has le...,"[[tensor(0.2566), tensor(0.3845), tensor(0.585..."
9014,It should be illegal to declaw your cat,91909,0,[It should be illegal to declaw your cat],"[[tensor(0.2775), tensor(0.3884), tensor(1.088...",There is a great responsibility to parenting. ...,[There is a great responsibility to parenting....,"[[tensor(0.1783), tensor(-0.0731), tensor(1.25..."
19858,New Audio Recording From Ferguson May Capture ...,693,2,[New Audio Recording From Ferguson May Capture...,"[[tensor(0.6904), tensor(-0.2299), tensor(1.37...",(KTVI)- An audio recording that allegedly capt...,[(KTVI)- An audio recording that allegedly cap...,"[[tensor(0.3192), tensor(0.2311), tensor(-0.02..."


In [81]:
def trans(x):
    """Average a tensor over its first dimension.

    Calls ``torch.mean(x, dim=0)``: for a 2-D tensor the rows are averaged
    into a single 1-D vector. The notebook applies this to the per-row
    embedding tensors to get one vector per headline / body.
    """
    return torch.mean(x, dim=0)

In [82]:
# Collapse every embedding tensor to its mean over dim 0 via trans(), for both
# splits and both text columns (train head, train body, val head, val body).
# NOTE: overwrites the columns in place, so the cell must run once per load.
for _frame in (total_train_copy, total_val_copy):
    for _col in ('embeddings_head', 'embeddings_body'):
        _frame[_col] = _frame[_col].apply(trans)

In [83]:
total_train_copy

Unnamed: 0,Headline,Body ID,Stance,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
38384,Isis leader Abu Bakr al-Baghdadi's 'wife and s...,1913,0,[Isis leader Abu Bakr al-Baghdadi's 'wife and ...,"[tensor(-0.2697), tensor(0.9552), tensor(-0.13...",A blast near the Nicaraguan capital city of Ma...,[A blast near the Nicaraguan capital city of M...,"[tensor(0.1567), tensor(0.4139), tensor(0.0904..."
20073,"Lady on FB: I'm 41, Intersex, and Fucked Micha...",879,0,"[Lady on FB: I'm 41, Intersex, and Fucked Mich...","[tensor(-0.4410), tensor(0.7908), tensor(0.085...",Matt Taibbi is taking a “leave of absence” fro...,[Matt Taibbi is taking a “leave of absence” fr...,"[tensor(0.0924), tensor(0.3163), tensor(1.1158..."
8404,"James Wright Foley, Kidnapped Journalist, Appa...",139,0,"[James Wright Foley, Kidnapped Journalist, App...","[tensor(0.3436), tensor(0.1727), tensor(-0.294...",Judd Nelson rebuffs Internet rumors that he di...,[Judd Nelson rebuffs Internet rumors that he d...,"[tensor(0.1822), tensor(0.5470), tensor(0.3063..."
26909,Michael Brown shooting audio caught on tape?,195,0,[Michael Brown shooting audio caught on tape?],"[tensor(0.5646), tensor(0.4658), tensor(-0.232...",A video posted by ISIL terrorists in Iraq purp...,[A video posted by ISIL terrorists in Iraq pur...,"[tensor(0.3007), tensor(0.2842), tensor(0.4186..."
7671,Marijuana is a gateway drug,91046,0,[Marijuana is a gateway drug],"[tensor(-0.0491), tensor(0.0707), tensor(0.991...",The whole world of nutrition is a mind field f...,[The whole world of nutrition is a mind field ...,"[tensor(0.1469), tensor(0.5588), tensor(-0.003..."
...,...,...,...,...,...,...,...,...
48337,Seven teenage girls get pregnant on school trip,1804,1,[Seven teenage girls get pregnant on school trip],"[tensor(-0.4397), tensor(0.0199), tensor(-0.05...","Seven girls from a Bosnian school, all aged be...","[Seven girls from a Bosnian school, all aged b...","[tensor(-0.1658), tensor(0.4432), tensor(0.524..."
23925,Macaulay Culkin Dead: Actor Once Again Victim ...,498,0,[Macaulay Culkin Dead: Actor Once Again Victim...,"[tensor(0.3356), tensor(0.4754), tensor(1.3451...",Twitter users are suggesting the phallic daubi...,[Twitter users are suggesting the phallic daub...,"[tensor(0.0507), tensor(0.6100), tensor(-0.099..."
860,Big banks are not out of control,92226,0,[Big banks are not out of control],"[tensor(0.4441), tensor(-0.8802), tensor(0.442...",Thank you. I believe this is what is important...,"[Thank you., I believe this is what is importa...","[tensor(-0.2332), tensor(0.2352), tensor(1.288..."
1562,Texas City plumber's truck ends up in Syrian war,2344,0,[Texas City plumber's truck ends up in Syrian ...,"[tensor(0.1310), tensor(0.5537), tensor(-0.243...",North Korea may have a woman at the helm–Kim J...,[North Korea may have a woman at the helm–Kim ...,"[tensor(-0.2684), tensor(0.2057), tensor(0.678..."


In [84]:
total_val_copy

Unnamed: 0,Headline,Body ID,Stance,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
47361,"Nun gives birth, intends to keep baby",391,0,"[Nun gives birth, intends to keep baby]","[tensor(0.9288), tensor(0.3911), tensor(1.2913...",The Department of Homeland Security flatly den...,[The Department of Homeland Security flatly de...,"[tensor(0.0699), tensor(0.4565), tensor(0.6801..."
21733,Kim Jong-un So Fat He Injured His Ankles,1103,0,[Kim Jong-un So Fat He Injured His Ankles],"[tensor(0.0239), tensor(0.1279), tensor(-0.377...","When most parents think of a school trip, they...","[When most parents think of a school trip, the...","[tensor(-0.0544), tensor(0.2273), tensor(0.712..."
12631,Christian Bale in Talks to Play Steve Jobs in ...,2455,2,[Christian Bale in Talks to Play Steve Jobs in...,"[tensor(0.4828), tensor(0.2569), tensor(0.6341...","Cristian Bale is eyeing — or i-ing, if you pre...","[Cristian Bale is eyeing — or i-ing, if you pr...","[tensor(-0.0683), tensor(0.3371), tensor(0.664..."
43533,That Was Fast: Christian Bale Bails on Steve J...,1841,1,[That Was Fast: Christian Bale Bails on Steve ...,"[tensor(-0.0392), tensor(0.6430), tensor(0.682...",After early discussions to play the Apple foun...,[After early discussions to play the Apple fou...,"[tensor(0.0629), tensor(0.3318), tensor(0.6999..."
48539,How Canadian Sergeant-at-Arms Kevin Vickers ki...,1472,2,[How Canadian Sergeant-at-Arms Kevin Vickers k...,"[tensor(0.5736), tensor(0.6943), tensor(-0.687...","Kevin Vickers, sergeant-at-arms of the House o...","[Kevin Vickers, sergeant-at-arms of the House ...","[tensor(-0.1887), tensor(0.4966), tensor(0.432..."
...,...,...,...,...,...,...,...,...
3391,Abolish Birthright Citizenship,90226,0,[Abolish Birthright Citizenship],"[tensor(0.0406), tensor(0.5931), tensor(0.1904...","Nope. I, a male, taught at an all girls high s...","[Nope., I, a male, taught at an all girls high...","[tensor(-0.0292), tensor(0.2782), tensor(1.122..."
46992,SEND IN THE DRONES: Predator drones flown over...,2186,0,[SEND IN THE DRONES: Predator drones flown ove...,"[tensor(0.1675), tensor(0.5462), tensor(0.1052...",A hoax that went viral on the internet has lef...,[A hoax that went viral on the internet has le...,"[tensor(0.0620), tensor(0.3018), tensor(0.3126..."
9014,It should be illegal to declaw your cat,91909,0,[It should be illegal to declaw your cat],"[tensor(0.2775), tensor(0.3884), tensor(1.0888...",There is a great responsibility to parenting. ...,[There is a great responsibility to parenting....,"[tensor(-0.0828), tensor(0.2738), tensor(1.438..."
19858,New Audio Recording From Ferguson May Capture ...,693,2,[New Audio Recording From Ferguson May Capture...,"[tensor(0.6904), tensor(-0.2299), tensor(1.373...",(KTVI)- An audio recording that allegedly capt...,[(KTVI)- An audio recording that allegedly cap...,"[tensor(0.1169), tensor(0.2531), tensor(0.1378..."


In [85]:
# Save the mean-pooled splits; `with` closes (and flushes) each file, which
# the original open(...) arguments never did explicitly.
for _frame, _path in (
    (total_train_copy, '/gpt_embeddings/total_train_mean.pkl'),
    (total_val_copy, '/gpt_embeddings/total_val_mean.pkl'),
):
    with open(_path, 'wb') as _fh:
        pkl.dump(_frame, _fh)

In [None]:
# Peek at the headline embeddings of the in-memory training split. The mean
# pooling above was applied to the *_copy frames, not to this one.
# (No execution count: this cell was never run in the recorded session.)
total_train['embeddings_head']

In [89]:
# Load an exported test feature table from CSV for inspection.
# NOTE(review): in the preview the embedding columns hold printed-array text,
# and some values already contain numpy's ' ... ' elision (see row 3), so the
# stored vectors look truncated — confirm before using them numerically.
temp = pd.read_csv('/gpt_embeddings/test2.csv')

In [90]:
temp.head()

Unnamed: 0,Headline,Body ID,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28
0,Ferguson riots: Pregnant woman loses eye after...,2008,['Ferguson riots: Pregnant woman loses eye aft...,[[ 0.317956 0.59476596 -0.22569571 -0.28339...,A RESPECTED senior French police officer inves...,['A RESPECTED senior French police officer inv...,[[-1.15868144e-01 4.64112371e-01 -7.01989383e...,58,35,15,...,0,0,0,0,0,0,0,0,0.0,0.205
1,Apple Stores to install safes to secure gold A...,2008,['Apple Stores to install safes to secure gold...,[[-8.70428234e-02 1.49100876e+00 -7.10931659e...,A RESPECTED senior French police officer inves...,['A RESPECTED senior French police officer inv...,[[-1.15868144e-01 4.64112371e-01 -7.01989383e...,35,19,5,...,0,0,0,0,0,0,0,0,0.0,1.4171
2,Pregnant woman loses eye after police shoot be...,2008,['Pregnant woman loses eye after police shoot ...,[[-2.49820314e-02 2.49536976e-01 -1.13435455e...,A RESPECTED senior French police officer inves...,['A RESPECTED senior French police officer inv...,[[-1.15868144e-01 4.64112371e-01 -7.01989383e...,46,32,18,...,0,0,0,0,0,0,0,0,0.091268,0.4237
3,We just found out the #Ferguson Protester who ...,2008,['We just found out the #Ferguson Protester wh...,[[ 0.54422915 0.79883075 -0.63868016 ... 0.3...,A RESPECTED senior French police officer inves...,['A RESPECTED senior French police officer inv...,[[-1.15868144e-01 4.64112371e-01 -7.01989383e...,67,56,31,...,0,0,0,1,0,0,0,0,0.007743,0.9956
4,Police Chief In Charge of Paris Attacks Commit...,2008,['Police Chief In Charge of Paris Attacks Comm...,[[ 4.13983725e-02 2.24963576e-01 5.47654569e...,A RESPECTED senior French police officer inves...,['A RESPECTED senior French police officer inv...,[[-1.15868144e-01 4.64112371e-01 -7.01989383e...,40,34,22,...,0,0,0,0,0,0,0,0,0.130289,0.1882


In [47]:
# Load the pickled FNC validation frame, closing the file after reading.
# NOTE(review): the recorded run of this cell raised FileNotFoundError — the
# file was not present at this path; confirm the intended location.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('/gpt_embeddings/fnc_val.pkl', 'rb') as _fh:
    fnc_val = pkl.load(_fh)

FileNotFoundError: [Errno 2] No such file or directory: '/gpt_embeddings/fnc_val.pkl'

In [17]:
# Encode each body's sentence list into its embedding array.
# NOTE(review): body_train is not created in any cell shown here — it comes
# from kernel state outside this excerpt.
body_train['embeddings'] = body_train['sentences'].map(get_embedding)

In [18]:
# Encode each headline's sentence list into its embedding array.
# NOTE(review): head_train is not created in any cell shown here — it comes
# from kernel state outside this excerpt.
head_train['embeddings'] = head_train['sentences'].map(get_embedding)

In [23]:
# Save the embedded body / headline frames into the working directory; `with`
# closes each file instead of leaving the handle open.
# NOTE(review): the previews of body_train / head_train show Body IDs such as
# 0, 4, 5 and 712, 158 rather than the 9xxxx ids of the ARC CSVs — confirm the
# 'arc*' file names describe the right data.
for _frame, _path in ((body_train, 'arcbody.pkl'), (head_train, 'archead.pkl')):
    with open(_path, 'wb') as _fh:
        pkl.dump(_frame, _fh)

In [19]:
body_train.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,0,A small meteorite crashed into a wooded area i...,[A small meteorite crashed into a wooded area ...,"[[0.04417499, 0.2813075, 0.08948864, 0.4299080..."
1,4,Last week we hinted at what was to come as Ebo...,[Last week we hinted at what was to come as Eb...,"[[0.3306326, -0.3482146, 0.4529167, 0.09398288..."
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...,[(NEWSER) – Wonder how long a Quarter Pounder ...,"[[-0.18273714, 0.75774044, -0.06936457, -0.440..."
3,6,"Posting photos of a gun-toting child online, I...","[Posting photos of a gun-toting child online, ...","[[0.08154284, 0.5142628, 0.0099390745, 0.10628..."
4,7,At least 25 suspected Boko Haram insurgents we...,[At least 25 suspected Boko Haram insurgents w...,"[[-0.16386387, 0.12687834, -0.89488053, -0.150..."


In [20]:
head_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences,embeddings
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...,"[[0.11373772, 0.32614568, -0.36455876, 0.34818..."
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...,"[[0.0035811067, 0.66388166, -0.3805887, 0.0164..."
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ...","[[-0.17844312, 0.40808, 0.90333754, -0.0021085..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...,"[[-0.46256074, 0.61089754, 0.06796886, -0.2900..."
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...,"[[-0.43532988, 0.84769005, -0.26739606, -0.038..."


In [29]:
import torch.nn.functional as F


In [22]:
# First entry of `sent`, used as one side of the similarity check below.
# NOTE(review): `sent` is not defined in any cell shown here.
a = sent[0]

In [37]:
# Second entry of `sent`. By execution count (In [37]) this ran after the
# literal-string cell (In [23]) and just before s_ was encoded (In [38]), so
# this is the text behind the recorded similarity score.
s = sent[1]

In [23]:
# Hand-written probe sentence. It was executed earlier (In [23]) than the
# `s = sent[1]` cell (In [37]), so it was overwritten before s_ was computed.
s = "I'm a happy student studying in a university."

In [31]:
# Embed `a` with the sentence encoder and wrap the result in a torch tensor
# (torch.tensor copies the data it is given).
a_ = torch.tensor(model.encode(a))

In [38]:
# Embed `s` with the sentence encoder and wrap the result in a torch tensor
# (torch.tensor copies the data it is given).
s_ = torch.tensor(model.encode(s))

In [39]:
# Cosine similarity between the two embeddings. Indexing with None prepends a
# batch axis (equivalent to unsqueeze(0)); dim=1 is the function's default.
F.cosine_similarity(a_[None], s_[None], dim=1)

tensor([0.6480])

In [40]:
# Recompute the body embeddings: one model.encode() call per row's sentence
# list. (Repeats an earlier cell; any existing column is overwritten.)
body_train['embeddings'] = body_train['sentences'].map(get_embedding)

In [42]:
body_train.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,0,A small meteorite crashed into a wooded area i...,[A small meteorite crashed into a wooded area ...,"[[0.04417499, 0.2813075, 0.08948864, 0.4299080..."
1,4,Last week we hinted at what was to come as Ebo...,[Last week we hinted at what was to come as Eb...,"[[0.3306326, -0.3482146, 0.4529167, 0.09398288..."
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...,[(NEWSER) – Wonder how long a Quarter Pounder ...,"[[-0.18273714, 0.75774044, -0.06936457, -0.440..."
3,6,"Posting photos of a gun-toting child online, I...","[Posting photos of a gun-toting child online, ...","[[0.08154284, 0.5142628, 0.0099390745, 0.10628..."
4,7,At least 25 suspected Boko Haram insurgents we...,[At least 25 suspected Boko Haram insurgents w...,"[[-0.16386387, 0.12687834, -0.89488053, -0.150..."


In [45]:
# Encode each test body's sentence list into its embedding array.
# NOTE(review): body_test is not created in any cell shown here.
body_test['embeddings'] = body_test['sentences'].map(get_embedding)

In [46]:
body_test.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,1,Al-Sisi has denied Israeli reports stating tha...,[Al-Sisi has denied Israeli reports stating th...,"[[0.31010464, 0.20827565, 0.23084044, -0.47201..."
1,2,A bereaved Afghan mother took revenge on the T...,[A bereaved Afghan mother took revenge on the ...,"[[0.003925855, 0.06509211, -0.0727313, 0.12800..."
2,3,CNBC is reporting Tesla has chosen Nevada as t...,[CNBC is reporting Tesla has chosen Nevada as ...,"[[0.52243996, 1.0500185, 0.61583394, 0.4663079..."
3,12,A 4-inch version of the iPhone 6 is said to be...,[A 4-inch version of the iPhone 6 is said to b...,"[[-0.22461553, 0.310677, 1.6420063, -0.1729198..."
4,19,GR editor’s Note\r\n\r\nThere are no reports i...,[GR editor’s Note\r\n\r\nThere are no reports ...,"[[0.7256343, 0.71487296, 0.795531, -0.22592303..."


In [47]:
stance_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...


In [49]:
# Encode each training headline's sentence list into its embedding array.
# NOTE(review): stance_train is not created in any cell shown here.
stance_train['embeddings'] = stance_train['sentences'].map(get_embedding)

In [54]:
stance_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences,embeddings
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...,"[[0.11373772, 0.32614568, -0.36455876, 0.34818..."
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...,"[[0.0035811067, 0.66388166, -0.3805887, 0.0164..."
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ...","[[-0.17844312, 0.40808, 0.90333754, -0.0021085..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...,"[[-0.46256074, 0.61089754, 0.06796886, -0.2900..."
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...,"[[-0.43532988, 0.84769005, -0.26739606, -0.038..."


In [55]:
# Encode each test headline's sentence list into its embedding array.
# NOTE(review): stance_test is not created in any cell shown here.
stance_test['embeddings'] = stance_test['sentences'].map(get_embedding)

In [56]:
stance_test.head()

Unnamed: 0,Headline,Body ID,sentences,embeddings
0,Ferguson riots: Pregnant woman loses eye after...,2008,[Ferguson riots: Pregnant woman loses eye afte...,"[[0.317956, 0.59476596, -0.22569571, -0.283397..."
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,[Crazy Conservatives Are Sure a Gitmo Detainee...,"[[-0.14443071, 0.41816604, -0.11379044, 0.2560..."
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,[A Russian Guy Says His Justin Bieber Ringtone...,"[[-0.23880617, 0.9361699, -0.14230421, -0.1047..."
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,"[Zombie Cat: Buried Kitty Believed Dead, Meows...","[[0.031023404, 1.1899894, 0.22445874, -0.47767..."
4,Argentina's President Adopts Boy to End Werewo...,37,[Argentina's President Adopts Boy to End Werew...,"[[-0.18823671, 0.3684826, 0.056638524, 0.52585..."


In [58]:
# Persist the embedded body frames (test first, then train). `with` closes
# each file; plain 'wb' is enough because the handle is only written to, and
# it creates/truncates the file just as 'wb+' did.
for _frame, _path in (
    (body_test, '/gpt_embeddings/body_test.pkl'),
    (body_train, '/gpt_embeddings/body_train.pkl'),
):
    with open(_path, 'wb') as _fh:
        pkl.dump(_frame, _fh)

In [59]:
# Persist the embedded headline/stance frames (train first, then test). `with`
# closes each file; plain 'wb' is enough because the handle is only written
# to, and it creates/truncates the file just as 'wb+' did.
for _frame, _path in (
    (stance_train, '/gpt_embeddings/stance_train.pkl'),
    (stance_test, '/gpt_embeddings/stance_test.pkl'),
):
    with open(_path, 'wb') as _fh:
        pkl.dump(_frame, _fh)