In [1]:
import transformers
import torch
import pickle as pkl
import pandas
from transformers import GPT2Tokenizer, GPT2Model
import os
os.chdir('..')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda:0')

In [3]:
body_train = pkl.load(open('sentences/body_train.pkl','rb'))
body_test = pkl.load(open('sentences/body_test.pkl','rb'))
stance_train = pkl.load(open('sentences/stance_train.pkl','rb'))
stance_test = pkl.load(open('sentences/stance_test.pkl','rb'))

In [4]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir='.cache')

In [5]:
gpt_model = GPT2Model.from_pretrained('gpt2', cache_dir='.cache')

In [6]:
gpt_model = gpt_model.to(device)

In [7]:
gpt_model.eval()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwis

In [9]:
def get_embedding(sent):
    input_ids = gpt_tokenizer.encode(sent, return_tensors='pt')
    input_ids = input_ids.to(device)
    with torch.no_grad():
        outputs = gpt_model(input_ids)
        embeddings = outputs.last_hidden_state.detach().cpu()
    return embeddings

In [14]:
sent = body_train['sentences'][0]

In [37]:
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

In [44]:
sent[0]

"A small meteorite crashed into a wooded area in Nicaragua's capital of Managua overnight, the government said Sunday."

In [72]:
sent[0]

"A small meteorite crashed into a wooded area in Nicaragua's capital of Managua overnight, the government said Sunday."

In [73]:
sent[1]

"Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city's airport, the Associated Press reports."

In [76]:
import torch.nn.functional as F

In [94]:
F.cosine_similarity(b.unsqueeze(0), c.unsqueeze(0))

tensor([0.9962], device='cuda:0')

In [92]:
s = "I'm a happy student studying in a university."

In [77]:
sentence = sent[0]
tokens = gpt_tokenizer.encode(sentence, add_special_tokens=True)

input_ids = torch.tensor([tokens]).to(device)
with torch.no_grad():
    outputs = gpt_model(input_ids)
    last_hidden_state = outputs.last_hidden_state
a = last_hidden_state.squeeze().mean(axis=0)

In [84]:
sentence = sent[1]
tokens = gpt_tokenizer.encode(sentence, add_special_tokens=True)

input_ids = torch.tensor([tokens]).to(device)
with torch.no_grad():
    outputs = gpt_model(input_ids)
    last_hidden_state = outputs.last_hidden_state
b = last_hidden_state.squeeze().mean(axis=0)

In [93]:
sentence = s
tokens = gpt_tokenizer.encode(sentence, add_special_tokens=True)

input_ids = torch.tensor([tokens]).to(device)
with torch.no_grad():
    outputs = gpt_model(input_ids)
    last_hidden_state = outputs.last_hidden_state
c = last_hidden_state.squeeze().mean(axis=0)

In [10]:
body_train['embeddings'] = body_train['sentences'].apply(get_embedding)

In [12]:
body_train['embeddings'][0]

tensor([[[-0.0667,  0.0881, -0.3085,  ...,  0.0307,  0.0512, -0.0019],
         [ 0.0354,  0.0094,  0.0664,  ...,  0.0637, -0.1114, -0.0031],
         [ 0.0202,  0.0210,  0.0979,  ...,  0.0937, -0.1037,  0.0063],
         ...,
         [ 0.0208,  0.0419,  0.1367,  ...,  0.0850, -0.1359, -0.0636],
         [ 0.0212,  0.0393,  0.1369,  ...,  0.0841, -0.1380, -0.0664],
         [ 0.0215,  0.0364,  0.1364,  ...,  0.0831, -0.1398, -0.0694]]])

In [13]:
body_train['embeddings'][1]

tensor([[[-0.0667,  0.0881, -0.3085,  ...,  0.0307,  0.0512, -0.0019],
         [ 0.0354,  0.0094,  0.0664,  ...,  0.0637, -0.1114, -0.0031],
         [ 0.0202,  0.0210,  0.0979,  ...,  0.0937, -0.1037,  0.0063],
         [ 0.0209,  0.0301,  0.1139,  ...,  0.0987, -0.1048, -0.0023],
         [ 0.0213,  0.0368,  0.1231,  ...,  0.1002, -0.1074, -0.0116]]])

In [11]:
body_train.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,0,A small meteorite crashed into a wooded area i...,[A small meteorite crashed into a wooded area ...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
1,4,Last week we hinted at what was to come as Ebo...,[Last week we hinted at what was to come as Eb...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...,[(NEWSER) – Wonder how long a Quarter Pounder ...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
3,6,"Posting photos of a gun-toting child online, I...","[Posting photos of a gun-toting child online, ...","[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
4,7,At least 25 suspected Boko Haram insurgents we...,[At least 25 suspected Boko Haram insurgents w...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."


In [22]:
body_test['embeddings'] = body_test['sentences'].apply(get_embedding)

In [23]:
body_test.head()

Unnamed: 0,Body ID,articleBody,sentences,embeddings
0,1,Al-Sisi has denied Israeli reports stating tha...,[Al-Sisi has denied Israeli reports stating th...,"[[tensor(-0.0667), tensor(0.0881), tensor(-0.3..."
1,2,A bereaved Afghan mother took revenge on the T...,[A bereaved Afghan mother took revenge on the ...,"[[tensor(-0.0667), tensor(0.0881), tensor(-0.3..."
2,3,CNBC is reporting Tesla has chosen Nevada as t...,[CNBC is reporting Tesla has chosen Nevada as ...,"[[tensor(-0.0667), tensor(0.0881), tensor(-0.3..."
3,12,A 4-inch version of the iPhone 6 is said to be...,[A 4-inch version of the iPhone 6 is said to b...,"[[tensor(-0.0667), tensor(0.0881), tensor(-0.3..."
4,19,GR editor’s Note\r\n\r\nThere are no reports i...,[GR editor’s Note\r\n\r\nThere are no reports ...,"[[tensor(-0.0667), tensor(0.0881), tensor(-0.3..."


In [None]:
stance_train.head()

In [25]:
def get_head_embedding(sent):
    input_ids = gpt_tokenizer.encode(sent, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = gpt_model(input_ids)
        embeddings = outputs.last_hidden_state
    return embeddings.detach().cpu()

In [27]:
stance_train['embeddings'] = stance_train['sentences'].apply(get_head_embedding)

In [28]:
stance_train.head()

Unnamed: 0,Headline,Body ID,Stance,sentences,embeddings
0,Police find mass graves with at least '15 bodi...,712,unrelated,[Police find mass graves with at least '15 bod...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,[Hundreds of Palestinians flee floods in Gaza ...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,"[Christian Bale passes on role of Steve Jobs, ...","[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,[HBO and Apple in Talks for $15/Month Apple TV...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."
4,Spider burrowed through tourist's stomach and ...,1923,disagree,[Spider burrowed through tourist's stomach and...,"[[[tensor(-0.0667), tensor(0.0881), tensor(-0...."


In [29]:
stance_test['embeddings'] = stance_test['Headline'].apply(get_head_embedding)

In [30]:
stance_test.head()

Unnamed: 0,Headline,Body ID,sentences,embeddings
0,Ferguson riots: Pregnant woman loses eye after...,2008,[Ferguson riots: Pregnant woman loses eye afte...,"[[[tensor(-0.0748), tensor(-0.1390), tensor(-0..."
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,[Crazy Conservatives Are Sure a Gitmo Detainee...,"[[[tensor(-0.1905), tensor(-0.0578), tensor(-0..."
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,[A Russian Guy Says His Justin Bieber Ringtone...,"[[[tensor(-0.1422), tensor(-0.0903), tensor(-0..."
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,"[Zombie Cat: Buried Kitty Believed Dead, Meows...","[[[tensor(-0.2125), tensor(-0.0637), tensor(-0..."
4,Argentina's President Adopts Boy to End Werewo...,37,[Argentina's President Adopts Boy to End Werew...,"[[[tensor(-0.0847), tensor(-0.1937), tensor(-0..."


In [31]:
pkl.dump(body_test, open('gpt_embeddings/body_test.pkl', 'wb+'))
pkl.dump(body_train, open('gpt_embeddings/body_train.pkl', 'wb+'))

In [32]:
pkl.dump(stance_train, open('gpt_embeddings/stance_train.pkl', 'wb+'))
pkl.dump(stance_test, open('gpt_embeddings/stance_test.pkl', 'wb+'))