In [1]:
import torch
import pickle as pkl
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
# Move the working directory up one level (presumably so relative paths such as
# 'fnc-1/...' used in later cells resolve — confirm).
# NOTE(review): not idempotent — every re-run of this cell moves up another level.
os.chdir('..')

In [3]:
torch.cuda.is_available()

True

In [4]:
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-embedding model handed to SentenceTransformer.
model_name = 'bert-base-nli-mean-tokens'
# Notebook-wide encoder instance; get_embedding() calls model.encode on it.
model = SentenceTransformer(model_name)

In [5]:
# Use the first CUDA GPU when one is usable; otherwise fall back to the CPU so the
# following `model.to(device)` does not fail on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [8]:
# Load the article bodies; later cells read its 'Body ID' and 'articleBody' columns.
body_test = pd.read_csv('fnc-1/competition_test_bodies.csv')

In [9]:
# Load the headline rows; later cells read its 'Headline', 'Stance' and 'Body ID' columns.
head_test = pd.read_csv('fnc-1/competition_test_stances.csv')

In [15]:
import nltk

In [21]:
from nltk.tokenize import sent_tokenize

In [22]:
# Split every article body into a list of sentences with NLTK's sent_tokenize.
# NOTE(review): sent_tokenize relies on NLTK tokenizer data being installed; no
# nltk.download(...) call is visible in this notebook — confirm the environment has it.
body_test['sentences'] = body_test['articleBody'].apply(sent_tokenize)

In [24]:
# Split every headline into a list of sentences with NLTK's sent_tokenize.
# NOTE(review): sent_tokenize relies on NLTK tokenizer data being installed; no
# nltk.download(...) call is visible in this notebook — confirm the environment has it.
head_test['sentences'] = head_test['Headline'].apply(sent_tokenize)

In [26]:
def get_embedding(sent):
    """Encode ``sent`` with the notebook-level ``model``.

    Args:
        sent: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(sent)`` returns.
    """
    encoded = model.encode(sent)
    return encoded

In [27]:
# Run get_embedding on each row's sentence list; every cell of the new 'embeddings'
# column holds the model.encode result for that row.
body_test['embeddings'] = body_test['sentences'].apply(get_embedding)
head_test['embeddings'] = head_test['sentences'].apply(get_embedding)

In [29]:
# Integer class id for each of the four stance labels.
label_mapping = dict(unrelated=0, agree=1, discuss=2, disagree=3)

# Replace the textual stance with its class id.
# NOTE: not idempotent — re-running on already-mapped values yields NaN,
# because Series.map returns NaN for keys missing from the mapping.
head_test['Stance'] = head_test['Stance'].map(label_mapping)

In [31]:
head_test

Unnamed: 0,Headline,Body ID,Stance,sentences,embeddings
0,Ferguson riots: Pregnant woman loses eye after...,2008,0,[Ferguson riots: Pregnant woman loses eye afte...,"[[0.317956, 0.59476596, -0.22569571, -0.283397..."
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,0,[Crazy Conservatives Are Sure a Gitmo Detainee...,"[[-0.14443071, 0.41816604, -0.11379044, 0.2560..."
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,0,[A Russian Guy Says His Justin Bieber Ringtone...,"[[-0.23880617, 0.9361699, -0.14230421, -0.1047..."
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,0,"[Zombie Cat: Buried Kitty Believed Dead, Meows...","[[0.031023404, 1.1899894, 0.22445874, -0.47767..."
4,Argentina's President Adopts Boy to End Werewo...,37,0,[Argentina's President Adopts Boy to End Werew...,"[[-0.18823671, 0.3684826, 0.056638524, 0.52585..."
...,...,...,...,...,...
25408,The success of the Affordable Care Act is a hu...,2582,1,[The success of the Affordable Care Act is a h...,"[[-0.25981155, 0.63267165, 0.41158876, 0.19699..."
25409,The success of the Affordable Care Act is a hu...,2583,2,[The success of the Affordable Care Act is a h...,"[[-0.25981155, 0.63267165, 0.41158876, 0.19699..."
25410,The success of the Affordable Care Act is a hu...,2584,3,[The success of the Affordable Care Act is a h...,"[[-0.25981155, 0.63267165, 0.41158876, 0.19699..."
25411,The success of the Affordable Care Act is a hu...,2585,3,[The success of the Affordable Care Act is a h...,"[[-0.25981155, 0.63267165, 0.41158876, 0.19699..."


In [32]:
body_test

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,1,Al-Sisi has denied Israeli reports stating tha...,[Al-Sisi has denied Israeli reports stating th...,"[[0.31010464, 0.20827565, 0.23084044, -0.47201..."
1,2,A bereaved Afghan mother took revenge on the T...,[A bereaved Afghan mother took revenge on the ...,"[[0.003925855, 0.06509211, -0.0727313, 0.12800..."
2,3,CNBC is reporting Tesla has chosen Nevada as t...,[CNBC is reporting Tesla has chosen Nevada as ...,"[[0.52243996, 1.0500185, 0.61583394, 0.4663079..."
3,12,A 4-inch version of the iPhone 6 is said to be...,[A 4-inch version of the iPhone 6 is said to b...,"[[-0.22461553, 0.310677, 1.6420063, -0.1729198..."
4,19,GR editor’s Note\r\n\r\nThere are no reports i...,[GR editor’s Note\r\n\r\nThere are no reports ...,"[[0.7256343, 0.71487296, 0.795531, -0.22592303..."
...,...,...,...,...
899,2582,"Congressional Republicans, evidently hoping th...","[Congressional Republicans, evidently hoping t...","[[-0.65867066, 0.6231302, 0.3155008, -0.165456..."
900,2583,Did Obamacare work?\r\n\r\nIt’s worth reflecti...,"[Did Obamacare work?, It’s worth reflecting up...","[[0.26334718, 0.042841468, 0.78453654, 0.15486..."
901,2584,Millions may lose coverage next year if Congre...,[Millions may lose coverage next year if Congr...,"[[-0.13511062, -0.14209259, 0.6387054, 0.07013..."
902,2585,"Come November, the grim trudge across the incr...","[Come November, the grim trudge across the inc...","[[-0.09636568, 0.32292098, 1.6037865, 0.865105..."


In [33]:
# Attach every headline row to its article body through the shared 'Body ID' key.
# Column names present in both frames receive the '_head' / '_body' suffixes.
fnc_comp_test = pd.merge(
    head_test,
    body_test,
    on='Body ID',
    suffixes=['_head', '_body'],
)

In [34]:
fnc_comp_test

Unnamed: 0,Headline,Body ID,Stance,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
0,Ferguson riots: Pregnant woman loses eye after...,2008,0,[Ferguson riots: Pregnant woman loses eye afte...,"[[0.317956, 0.59476596, -0.22569571, -0.283397...",A RESPECTED senior French police officer inves...,[A RESPECTED senior French police officer inve...,"[[-0.115868144, 0.46411237, -0.07019894, 0.228..."
1,Apple Stores to install safes to secure gold A...,2008,0,[Apple Stores to install safes to secure gold ...,"[[-0.08704282, 1.4910088, -0.71093166, -0.5618...",A RESPECTED senior French police officer inves...,[A RESPECTED senior French police officer inve...,"[[-0.115868144, 0.46411237, -0.07019894, 0.228..."
2,Pregnant woman loses eye after police shoot be...,2008,0,[Pregnant woman loses eye after police shoot b...,"[[-0.024982031, 0.24953698, -0.113435455, -0.1...",A RESPECTED senior French police officer inves...,[A RESPECTED senior French police officer inve...,"[[-0.115868144, 0.46411237, -0.07019894, 0.228..."
3,We just found out the #Ferguson Protester who ...,2008,0,[We just found out the #Ferguson Protester who...,"[[0.54422915, 0.79883075, -0.63868016, 0.16648...",A RESPECTED senior French police officer inves...,[A RESPECTED senior French police officer inve...,"[[-0.115868144, 0.46411237, -0.07019894, 0.228..."
4,Police Chief In Charge of Paris Attacks Commit...,2008,2,[Police Chief In Charge of Paris Attacks Commi...,"[[0.041398373, 0.22496358, 0.54765457, 0.03150...",A RESPECTED senior French police officer inves...,[A RESPECTED senior French police officer inve...,"[[-0.115868144, 0.46411237, -0.07019894, 0.228..."
...,...,...,...,...,...,...,...,...
25408,A Sign That Obamacare Exchanges Are Failing,2586,3,[A Sign That Obamacare Exchanges Are Failing],"[[0.02800487, 0.66478074, 1.0676717, -0.106268...",Remember how much Republicans wanted to repeal...,[Remember how much Republicans wanted to repea...,"[[-0.6466933, 0.53802973, -0.3956069, -0.20300..."
25409,Republicans call Obamacare a 'failure.' These ...,2586,1,"[Republicans call Obamacare a 'failure.', Thes...","[[-0.4023538, 1.036921, 0.12388966, -0.1142321...",Remember how much Republicans wanted to repeal...,[Remember how much Republicans wanted to repea...,"[[-0.6466933, 0.53802973, -0.3956069, -0.20300..."
25410,CBO’s Alternate Facts Show Obamacare is Unsust...,2586,3,[CBO’s Alternate Facts Show Obamacare is Unsus...,"[[0.1872419, 0.80292976, 0.6364949, 0.16346818...",Remember how much Republicans wanted to repeal...,[Remember how much Republicans wanted to repea...,"[[-0.6466933, 0.53802973, -0.3956069, -0.20300..."
25411,Why Obamacare failed,2586,3,[Why Obamacare failed],"[[-0.1592559, 0.5979577, 0.90847987, -0.152357...",Remember how much Republicans wanted to repeal...,[Remember how much Republicans wanted to repea...,"[[-0.6466933, 0.53802973, -0.3956069, -0.20300..."


In [39]:
# Wrap each row's embedding in a torch tensor (torch.tensor is applied per cell).
# NOTE(review): overwrites the columns in place, so the earlier displayed frame is stale.
fnc_comp_test['embeddings_body'] = fnc_comp_test['embeddings_body'].apply(torch.tensor)
fnc_comp_test['embeddings_head'] = fnc_comp_test['embeddings_head'].apply(torch.tensor)

In [38]:
fnc_comp_test.groupby('Stance').count()

Unnamed: 0_level_0,Headline,Body ID,sentences_head,embeddings_head,articleBody,sentences_body,embeddings_body
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,18349,18349,18349,18349,18349,18349,18349
1,1903,1903,1903,1903,1903,1903,1903
2,4464,4464,4464,4464,4464,4464,4464
3,697,697,697,697,697,697,697


In [41]:
def trans(x):
    """Return the mean of tensor ``x`` taken over dimension 0."""
    return torch.mean(x, dim=0)


# Work on a copy so `fnc_comp_test` keeps its original (unpooled) embedding columns.
fnc_comp_test_copy = fnc_comp_test.copy()

# Collapse each row's embedding tensor along dim 0, for bodies and headlines alike.
fnc_comp_test_copy['embeddings_body'] = fnc_comp_test_copy['embeddings_body'].apply(trans)
fnc_comp_test_copy['embeddings_head'] = fnc_comp_test_copy['embeddings_head'].apply(trans)

In [40]:
# Persist the merged frame to disk. The context manager guarantees the file is
# flushed and closed (the bare open(...) call left the handle dangling).
# NOTE(review): '/gpt_embeddings/...' is an absolute path at the filesystem root —
# confirm this is intended rather than a path relative to the working directory.
with open('/gpt_embeddings/fnc_comp_test.pkl', 'wb') as fout:
    pkl.dump(fnc_comp_test, fout)

In [43]:
# Persist the mean-pooled copy to disk. The context manager guarantees the file is
# flushed and closed (the bare open(...) call left the handle dangling).
# NOTE(review): '/gpt_embeddings/...' is an absolute path at the filesystem root —
# confirm this is intended rather than a path relative to the working directory.
with open('/gpt_embeddings/fnc_comp_test_mean.pkl', 'wb') as fout:
    pkl.dump(fnc_comp_test_copy, fout)

Unnamed: 0,Headline,Body ID,Stance
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated
...,...,...,...
25408,The success of the Affordable Care Act is a hu...,2582,agree
25409,The success of the Affordable Care Act is a hu...,2583,discuss
25410,The success of the Affordable Care Act is a hu...,2584,disagree
25411,The success of the Affordable Care Act is a hu...,2585,disagree


In [18]:
# Encode `sent` with the shared model.
# NOTE(review): no visible cell defines `sent`; this line depends on kernel state
# from a removed or out-of-order cell and will fail on a fresh Restart & Run All.
sentence_embeddings = model.encode(sent)

In [19]:
sentence_embeddings.shape

(16, 768)

In [9]:
# Load the four pickled frames; later cells read a 'sentences' column from each.
# Every file is opened in a context manager so its handle is closed right after
# unpickling (the bare open(...) calls left the handles dangling).
# SECURITY: pickle.load can execute arbitrary code — only load files from a trusted source.
# NOTE(review): this rebinds `body_test`, which an earlier cell also builds from a CSV.
with open('sentences/body_train.pkl', 'rb') as fin:
    body_train = pkl.load(fin)
with open('sentences/body_test.pkl', 'rb') as fin:
    body_test = pkl.load(fin)
with open('sentences/stance_train.pkl', 'rb') as fin:
    stance_train = pkl.load(fin)
with open('sentences/stance_test.pkl', 'rb') as fin:
    stance_test = pkl.load(fin)

In [20]:
def get_embedding(sent):
    """Return ``model.encode(sent)`` using the notebook-level ``model``.

    NOTE(review): an identical ``get_embedding`` is defined in an earlier cell of
    this notebook; this later definition silently shadows it.
    """
    vectors = model.encode(sent)
    return vectors

In [29]:
import torch.nn.functional as F

In [22]:
a = sent[0]

In [37]:
s = sent[1]

In [23]:
s = "I'm a happy student studying in a university."

In [31]:
a_ = torch.tensor(model.encode(a))

In [38]:
s_ = torch.tensor(model.encode(s))

In [39]:
# Cosine similarity between the two encoded inputs; unsqueeze(0) adds a leading
# dimension to each tensor before the comparison.
F.cosine_similarity(a_.unsqueeze(0),s_.unsqueeze(0))

tensor([0.6480])

In [40]:
# Encode each training body's sentence list; one model.encode result per row.
body_train['embeddings'] = body_train['sentences'].apply(get_embedding)

In [42]:
body_train.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,0,A small meteorite crashed into a wooded area i...,[A small meteorite crashed into a wooded area ...,"[[0.04417499, 0.2813075, 0.08948864, 0.4299080..."
1,4,Last week we hinted at what was to come as Ebo...,[Last week we hinted at what was to come as Eb...,"[[0.3306326, -0.3482146, 0.4529167, 0.09398288..."
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...,[(NEWSER) – Wonder how long a Quarter Pounder ...,"[[-0.18273714, 0.75774044, -0.06936457, -0.440..."
3,6,"Posting photos of a gun-toting child online, I...","[Posting photos of a gun-toting child online, ...","[[0.08154284, 0.5142628, 0.0099390745, 0.10628..."
4,7,At least 25 suspected Boko Haram insurgents we...,[At least 25 suspected Boko Haram insurgents w...,"[[-0.16386387, 0.12687834, -0.89488053, -0.150..."


In [45]:
# Encode each test body's sentence list; one model.encode result per row.
body_test['embeddings'] = body_test['sentences'].apply(get_embedding)

In [46]:
body_test.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,1,Al-Sisi has denied Israeli reports stating tha...,[Al-Sisi has denied Israeli reports stating th...,"[[0.31010464, 0.20827565, 0.23084044, -0.47201..."
1,2,A bereaved Afghan mother took revenge on the T...,[A bereaved Afghan mother took revenge on the ...,"[[0.003925855, 0.06509211, -0.0727313, 0.12800..."
2,3,CNBC is reporting Tesla has chosen Nevada as t...,[CNBC is reporting Tesla has chosen Nevada as ...,"[[0.52243996, 1.0500185, 0.61583394, 0.4663079..."
3,12,A 4-inch version of the iPhone 6 is said to be...,[A 4-inch version of the iPhone 6 is said to b...,"[[-0.22461553, 0.310677, 1.6420063, -0.1729198..."
4,19,GR editor’s Note\r\n\r\nThere are no reports i...,[GR editor’s Note\r\n\r\nThere are no reports ...,"[[0.7256343, 0.71487296, 0.795531, -0.22592303..."


In [47]:
stance_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...


In [49]:
# Encode each training headline's sentence list; one model.encode result per row.
stance_train['embeddings'] = stance_train['sentences'].apply(get_embedding)

In [54]:
stance_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences,embeddings
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...,"[[0.11373772, 0.32614568, -0.36455876, 0.34818..."
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...,"[[0.0035811067, 0.66388166, -0.3805887, 0.0164..."
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ...","[[-0.17844312, 0.40808, 0.90333754, -0.0021085..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...,"[[-0.46256074, 0.61089754, 0.06796886, -0.2900..."
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...,"[[-0.43532988, 0.84769005, -0.26739606, -0.038..."


In [55]:
# Encode each test headline's sentence list; one model.encode result per row.
stance_test['embeddings'] = stance_test['sentences'].apply(get_embedding)

In [56]:
stance_test.head()

Unnamed: 0,Headline,Body ID,sentences,embeddings
0,Ferguson riots: Pregnant woman loses eye after...,2008,[Ferguson riots: Pregnant woman loses eye afte...,"[[0.317956, 0.59476596, -0.22569571, -0.283397..."
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,[Crazy Conservatives Are Sure a Gitmo Detainee...,"[[-0.14443071, 0.41816604, -0.11379044, 0.2560..."
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,[A Russian Guy Says His Justin Bieber Ringtone...,"[[-0.23880617, 0.9361699, -0.14230421, -0.1047..."
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,"[Zombie Cat: Buried Kitty Believed Dead, Meows...","[[0.031023404, 1.1899894, 0.22445874, -0.47767..."
4,Argentina's President Adopts Boy to End Werewo...,37,[Argentina's President Adopts Boy to End Werew...,"[[-0.18823671, 0.3684826, 0.056638524, 0.52585..."


In [58]:
# Persist the body frames (now carrying an 'embeddings' column) to disk.
# Context managers close each file deterministically; plain 'wb' is sufficient
# because the files are only written, never read back through the same handle.
# NOTE(review): '/gpt_embeddings/...' is an absolute path at the filesystem root —
# confirm this is intended rather than a path relative to the working directory.
with open('/gpt_embeddings/body_test.pkl', 'wb') as fout:
    pkl.dump(body_test, fout)
with open('/gpt_embeddings/body_train.pkl', 'wb') as fout:
    pkl.dump(body_train, fout)

In [59]:
# Persist the stance frames (now carrying an 'embeddings' column) to disk.
# Context managers close each file deterministically; plain 'wb' is sufficient
# because the files are only written, never read back through the same handle.
# NOTE(review): '/gpt_embeddings/...' is an absolute path at the filesystem root —
# confirm this is intended rather than a path relative to the working directory.
with open('/gpt_embeddings/stance_train.pkl', 'wb') as fout:
    pkl.dump(stance_train, fout)
with open('/gpt_embeddings/stance_test.pkl', 'wb') as fout:
    pkl.dump(stance_test, fout)