In [125]:
import torch
import pickle as pkl
import pandas as pd
import numpy as np

In [126]:
import os
# Step the working directory up one level. The path is relative to the current
# directory, so re-running this cell moves up again (it is not idempotent).
os.chdir('..')

In [127]:
# Check whether PyTorch can see a CUDA device before the model is moved onto one.
torch.cuda.is_available()

True

In [128]:
from sentence_transformers import SentenceTransformer

# Name of the SBERT checkpoint and the local directory it is loaded from.
model_name = 'all-mpnet-base-v2'
model_path = '/sbert_embeddings/all-mpnet-base-v2'
# Build the sentence encoder from `model_path`.
model = SentenceTransformer(model_path)

In [129]:
# Prefer the first CUDA device, but fall back to the CPU when no GPU is
# available so the notebook still runs (more slowly) on CPU-only machines.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [130]:
# Move the encoder onto the selected device; as the last expression of the
# cell, the returned model is also displayed.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [131]:
# Throwaway sentence used to smoke-test the encoder.
sent = "Hello world!"

In [132]:
# Encode the single test sentence.
sentence_embeddings = model.encode(sent)

In [133]:
# Inspect the embedding's shape (the recorded output shows (768,)).
sentence_embeddings.shape

(768,)

### Process the FNC-1 training data

In [134]:
# Load the pickled FNC headline/stance and body frames. The context managers
# close each file as soon as it has been read instead of leaking the handle.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/gpt_embeddings/stance_train.pkl', 'rb') as stance_file:
    head_train = pkl.load(stance_file)

with open('/gpt_embeddings/body_train.pkl', 'rb') as body_file:
    body_train = pkl.load(body_file)

In [135]:
# Keep only the columns used downstream. The explicit .copy() makes the result
# an independent frame, so adding the 'embeddings' column later does not
# trigger pandas' SettingWithCopyWarning.
head_train = head_train[['Headline', 'Body ID', 'Stance', 'sentences']].copy()

In [136]:
# Keep only the columns used downstream. The explicit .copy() makes the result
# an independent frame, so adding the 'embeddings' column later does not
# trigger pandas' SettingWithCopyWarning.
body_train = body_train[['Body ID', 'articleBody', 'sentences']].copy()

In [137]:
def get_embedding(sent):
    """Encode ``sent`` with the notebook-level ``model`` and return the result.

    ``sent`` is forwarded unchanged to ``model.encode``; in this notebook it is
    either a single string or a list of sentences. Whatever ``model.encode``
    returns is handed back as-is.
    """
    encoded = model.encode(sent)
    return encoded

In [138]:
# Encode every body's sentence list; each cell of the new 'embeddings' column
# holds the encoder output for that row's 'sentences' value.
body_train['embeddings'] = body_train['sentences'].apply(get_embedding)

In [139]:
# Preview the first rows, including the new 'embeddings' column.
body_train.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,0,A small meteorite crashed into a wooded area i...,[A small meteorite crashed into a wooded area ...,"[[0.029839752, -0.030218968, 0.003450515, 0.02..."
1,4,Last week we hinted at what was to come as Ebo...,[Last week we hinted at what was to come as Eb...,"[[0.011723922, 0.073492385, -0.02383645, -0.08..."
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...,[(NEWSER) – Wonder how long a Quarter Pounder ...,"[[-0.020041967, 0.102185205, 0.0074368543, 0.0..."
3,6,"Posting photos of a gun-toting child online, I...","[Posting photos of a gun-toting child online, ...","[[0.025258487, -0.0028769402, 0.028513137, -0...."
4,7,At least 25 suspected Boko Haram insurgents we...,[At least 25 suspected Boko Haram insurgents w...,"[[0.01621298, -0.0558657, -0.001334133, -0.015..."


In [140]:
# Encode every headline's sentence list; each cell of the new 'embeddings'
# column holds the encoder output for that row's 'sentences' value.
head_train['embeddings'] = head_train['sentences'].apply(get_embedding)

In [141]:
# Preview the first rows, including the new 'embeddings' column.
head_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences,embeddings
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...,"[[0.009808279, 0.023274213, 0.02566386, 0.0317..."
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...,"[[-0.0656245, 0.042460594, -0.022362517, -0.02..."
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ...","[[-0.010717051, -0.0053509274, -0.0064408863, ..."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...,"[[-0.03046375, 0.029996512, 0.019490862, -0.08..."
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...,"[[0.01663362, -0.04078897, 0.009623327, 0.0032..."


In [142]:
# Attach each headline row to its article body via the shared 'Body ID' key.
# Column names present in both frames get the '_head' / '_body' suffixes.
fnc_train = pd.merge(
    head_train,
    body_train,
    on='Body ID',
    suffixes=['_head', '_body'],
)

In [143]:
# Integer class ids follow the position of each stance label in this list.
label_mapping = {
    stance: class_id
    for class_id, stance in enumerate(['unrelated', 'agree', 'discuss', 'disagree'])
}

# Replace the string labels with their ids. Values that are not keys of the
# mapping become NaN, so running this cell a second time would wipe the column.
fnc_train['Stance'] = fnc_train['Stance'].map(label_mapping)

In [144]:
# Turn the encoder outputs stored in both embedding columns into torch tensors.
for column in ('embeddings_head', 'embeddings_body'):
    fnc_train[column] = fnc_train[column].apply(torch.tensor)

Encoding of the FNC training data is done; split it into training and validation sets and save them.

In [145]:
# Snapshot of the full (not yet split) frame; the mean-pooled variant is built
# from it further down.
fnc_train_copy = fnc_train.copy()

In [146]:
# Second snapshot of the full (not yet split) frame; it is concatenated with
# the ARC data later in the notebook.
fnc_train_copy_ = fnc_train.copy()

In [147]:
# Seed NumPy's global RNG. The train_test_split calls in this notebook also
# pass random_state=42 explicitly.
np.random.seed(42)

In [148]:
from sklearn.model_selection import train_test_split
# 80/20 train/validation split with a fixed seed. This rebinds `fnc_train` to
# the training part only, so the cell should not be re-run on its own output.
fnc_train, fnc_val = train_test_split(fnc_train, test_size=0.2, random_state=42)

In [149]:
# Persist both splits. The context managers close (and therefore flush) each
# file as soon as the dump finishes instead of leaving the handle open.
with open('/sbert_embeddings/fnc_train.pkl', 'wb') as train_file:
    pkl.dump(fnc_train, train_file)
with open('/sbert_embeddings/fnc_val.pkl', 'wb') as val_file:
    pkl.dump(fnc_val, val_file)

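Build the mean-pooled version of the FNC training data: average each headline's and each body's embeddings into a single vector, then split and save.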

In [150]:
# Average every stored tensor over its first dimension, for the headline and
# the body column alike.
for column in ('embeddings_head', 'embeddings_body'):
    fnc_train_copy[column] = fnc_train_copy[column].apply(
        lambda stacked: torch.mean(stacked, dim=0)
    )

In [151]:
from sklearn.model_selection import train_test_split
# Split the mean-pooled frame with the same test_size and random_state as the
# unpooled frame. This rebinds `fnc_train_copy` to the training part only.
# NOTE(review): the identical seed should give the same row partition as the
# unpooled split — confirm.
fnc_train_copy, fnc_val_copy= train_test_split(fnc_train_copy, test_size=0.2, random_state=42)

In [152]:
# Persist the mean-pooled splits. The context managers close (and therefore
# flush) each file as soon as the dump finishes.
with open('/sbert_embeddings/fnc_train_mean.pkl', 'wb') as train_file:
    pkl.dump(fnc_train_copy, train_file)
with open('/sbert_embeddings/fnc_val_mean.pkl', 'wb') as val_file:
    pkl.dump(fnc_val_copy, val_file)

### Process the ARC data

In [71]:
# Load the ARC article bodies and the ARC headline/stance rows from CSV.
arc_body = pd.read_csv('/gpt_embeddings/pure_arc_body.csv')
arc_head = pd.read_csv('/gpt_embeddings/pure_arc_stance.csv')

In [76]:
import nltk
from nltk.tokenize import sent_tokenize

In [79]:
# Split each article body and each headline into a list of sentences with NLTK.
arc_body['sentences'] = arc_body['articleBody'].apply(sent_tokenize)
arc_head['sentences'] = arc_head['Headline'].apply(sent_tokenize)

In [83]:
# Encode every ARC body's sentence list with the shared helper.
arc_body['embeddings'] = arc_body['sentences'].apply(get_embedding)

In [84]:
# Encode every ARC headline's sentence list with the shared helper.
arc_head['embeddings'] = arc_head['sentences'].apply(get_embedding)

In [85]:
# Attach each ARC headline row to its article body via the shared 'Body ID'
# key. Column names present in both frames get the '_head' / '_body' suffixes.
arc_train = pd.merge(
    arc_head,
    arc_body,
    on='Body ID',
    suffixes=['_head', '_body'],
)

In [86]:
# Integer class ids follow the position of each stance label in this list.
label_mapping = {
    stance: class_id
    for class_id, stance in enumerate(['unrelated', 'agree', 'discuss', 'disagree'])
}

# Replace the string labels with their ids. Values that are not keys of the
# mapping become NaN, so running this cell a second time would wipe the column.
arc_train['Stance'] = arc_train['Stance'].map(label_mapping)

In [98]:
# Turn the encoder outputs stored in both embedding columns into torch tensors.
for column in ('embeddings_head', 'embeddings_body'):
    arc_train[column] = arc_train[column].apply(torch.tensor)

In [99]:
# Stack the full FNC frame on top of the ARC frame. Each input keeps its own
# row labels, so without ignore_index=True the combined frame would contain
# duplicate index labels; renumbering gives every row a unique label.
total_train = pd.concat([fnc_train_copy_, arc_train], axis=0, ignore_index=True)

Encoding is done; split the combined data into training and validation sets and save them.

In [100]:
# Snapshot of the full combined frame, taken before it is split; the
# mean-pooled variant is built from it further down.
total_train_copy = total_train.copy()

In [101]:
# 80/20 train/validation split with a fixed seed. This rebinds `total_train`
# to the training part only, so the cell should not be re-run on its own output.
total_train, total_val = train_test_split(total_train, test_size = 0.2, random_state=42)

In [104]:
# Persist both splits. The context managers close (and therefore flush) each
# file as soon as the dump finishes instead of leaving the handle open.
with open('/sbert_embeddings/total_train.pkl', 'wb') as train_file:
    pkl.dump(total_train, train_file)
with open('/sbert_embeddings/total_val.pkl', 'wb') as val_file:
    pkl.dump(total_val, val_file)

Build the mean-pooled version of the combined data, then split and save it.

In [105]:
# Average every stored tensor over its first dimension, for the headline and
# the body column alike.
for column in ('embeddings_head', 'embeddings_body'):
    total_train_copy[column] = total_train_copy[column].apply(
        lambda stacked: torch.mean(stacked, dim=0)
    )

In [106]:
# Split the mean-pooled frame with the same test_size and random_state as the
# unpooled frame. This rebinds `total_train_copy` to the training part only.
total_train_copy, total_val_copy = train_test_split(total_train_copy, test_size=0.2, random_state=42)

In [107]:
# Persist the mean-pooled splits. The context managers close (and therefore
# flush) each file as soon as the dump finishes.
with open('/sbert_embeddings/total_train_mean.pkl', 'wb') as train_file:
    pkl.dump(total_train_copy, train_file)
with open('/sbert_embeddings/total_val_mean.pkl', 'wb') as val_file:
    pkl.dump(total_val_copy, val_file)

### Process the FNC-1 competition test data

In [111]:
# Load the competition test bodies and stances from CSV.
# NOTE(review): these are absolute Windows paths, unlike the POSIX-style
# '/gpt_embeddings/...' paths used for the training data — confirm they resolve
# on the machine running this notebook.
body_test = pd.read_csv(r'D:\NLP_PJ\fnc-1\competition_test_bodies.csv')
head_test = pd.read_csv(r'D:\NLP_PJ\fnc-1\competition_test_stances.csv')

In [113]:
# Split each test article body into a list of sentences with NLTK.
body_test['sentences'] = body_test['articleBody'].apply(sent_tokenize)

In [114]:
# Split each test headline into a list of sentences with NLTK.
head_test['sentences'] = head_test['Headline'].apply(sent_tokenize)

In [115]:
def get_embedding(sent):
    """Encode ``sent`` with the notebook-level ``model`` and return the result.

    This repeats the helper defined earlier in the notebook with the same
    behaviour: ``sent`` is forwarded unchanged to ``model.encode`` and the
    encoder's return value is handed back as-is.
    """
    encoded = model.encode(sent)
    return encoded

In [116]:
# Encode the sentence lists of every test body and every test headline.
body_test['embeddings'] = body_test['sentences'].apply(get_embedding)
head_test['embeddings'] = head_test['sentences'].apply(get_embedding)

In [117]:
# Integer class ids follow the position of each stance label in this list.
label_mapping = {
    stance: class_id
    for class_id, stance in enumerate(['unrelated', 'agree', 'discuss', 'disagree'])
}

# Replace the string labels with their ids. Values that are not keys of the
# mapping become NaN, so running this cell a second time would wipe the column.
head_test['Stance'] = head_test['Stance'].map(label_mapping)

In [118]:
# Attach each test headline row to its article body via the shared 'Body ID'
# key. Column names present in both frames get the '_head' / '_body' suffixes.
fnc_comp_test = pd.merge(
    head_test,
    body_test,
    on='Body ID',
    suffixes=['_head', '_body'],
)

In [119]:
# Turn the encoder outputs stored in both embedding columns into torch tensors.
for column in ('embeddings_body', 'embeddings_head'):
    fnc_comp_test[column] = fnc_comp_test[column].apply(torch.tensor)

Encoding is done; save the test set.

In [121]:
# Persist the encoded test frame. The context manager closes (and therefore
# flushes) the file as soon as the dump finishes.
with open('/sbert_embeddings/fnc_comp_test.pkl', 'wb') as test_file:
    pkl.dump(fnc_comp_test, test_file)

In [122]:
# Mean-pooled copy of the test frame: every headline/body tensor is averaged
# over its first dimension.
fnc_comp_test_copy = fnc_comp_test.copy()


def trans(x):
    """Average tensor ``x`` along dimension 0."""
    return torch.mean(x, dim=0)


for column in ('embeddings_head', 'embeddings_body'):
    fnc_comp_test_copy[column] = fnc_comp_test_copy[column].apply(trans)

In [123]:
# Persist the mean-pooled test frame. The context manager closes (and
# therefore flushes) the file as soon as the dump finishes.
with open('/sbert_embeddings/fnc_comp_test_mean.pkl', 'wb') as test_file:
    pkl.dump(fnc_comp_test_copy, test_file)