In [1]:
import pandas as pd
import numpy as np

import os.path as op

import transformers as tn

import torch as tt

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [2]:
gpt2_tokenizer = tn.GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = tn.GPT2Model.from_pretrained("gpt2")

In [3]:
def embed(name, text):    
    encoding = tt.tensor(gpt2_tokenizer.encode(name, add_special_tokens=True)).unsqueeze(0)

    out_name = gpt2_model(encoding)

    encoding = tt.tensor(gpt2_tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)

    out_text = gpt2_model(encoding)

    return tt.cat([out_name[0][0, -1], out_text[0][0, -1]], dim=0)

In [4]:
DATA_PROCESSED = '../../data/processed'

In [5]:
data = pd.read_csv(op.join(DATA_PROCESSED, 'gofundme_projects.csv')).dropna()

In [6]:
prompt = "Fundraising for an individual with cancer? "

In [7]:
ids = pd.read_csv("ids.csv")

In [8]:
import torch.nn as nn
import torch.nn.functional as func

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.dense1 = nn.Linear(2*768, 128)
        self.dense2 = nn.Linear(128, 32)
        self.drop3 = nn.Dropout(0.1)
        self.dense3 = nn.Linear(32, 2)

    def forward(self, x):
        x = self.dense1(x)
        x = self.dense2(func.relu(x))
        x = self.drop3(x)
        x = self.dense3(func.relu(x))
        return x
    
model = Net()
model.load_state_dict(tt.load("det_infer.pt"))
model.eval()

Net(
  (dense1): Linear(in_features=1536, out_features=128, bias=True)
  (dense2): Linear(in_features=128, out_features=32, bias=True)
  (drop3): Dropout(p=0.1, inplace=False)
  (dense3): Linear(in_features=32, out_features=2, bias=True)
)

In [9]:
c = int(input("N? "))
for i, project in enumerate(data.sample(frac=1).itertuples()):
    _id = project.id
    if _id in set(ids["id"].tolist()):
        continue
    
    print(project.name)
    print()
    print(project.text[:1000])
    
    sent_emb = embed(project.name, project.text[:250])

    ts = tt.softmax(model(sent_emb), dim=0).detach()
    
    print(dict(zip(['Cancer', '*'], ts.numpy().round(3))))

    detection = input(prompt)
    if detection != "y":
        # a non-cancer-related campaign
        ids = ids.append({"label": 1, "id": _id}, ignore_index=True)
    else:
        # cancer-related campaign
        ids = ids.append({"label": 0, "id": _id}, ignore_index=True)
    
    if i == c-1:
        break
        
    print()

N? 5
Help Rich Dunning Beat Cancer!

Rich has just been diagnosed with esophageal cancer and already has many insurance co-pays racking up.  He will be going through surgery and treatment to beat this cancer!  Let's help relieve the extra stress of all the expenses that come along with a cancer diagnosis so Rich can focus on getting better!
{'Cancer': 0.0, '*': 1.0}
Fundraising for an individual with cancer? y

Syd's Lymphoma Donations

One of my friends Sydney Slie was diagnosed with Lymphoma. She has been on my volleyball team for almost 5 years now and is the setter for our club team Nebraska Impact and for her high school Bellevue East. She is 16 years old and is going in to get a spot removed from her neck. Please donate, any amount of money will help and this will all go towards her treatment, after care, and other needs to help her and her family get through this. There will be other fundraising opportunities that I will post on here as they come up. Thank you everyone.
{'Cancer

In [44]:
len(ids)

206

In [45]:
ids["label"].mean()

0.08737864077669903

In [46]:
ids.to_csv("ids.csv", index=False)

# Model

In [14]:
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [15]:
class Data(Dataset):
    def __init__(self):
        self.data = ids
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, ix):
        return {
            "target": self.data.loc[ix, "label"],
            "embedding": self.data.loc[ix, "embedding"]
        }

In [16]:
d = Data()
loader = DataLoader(d, shuffle=True, batch_size=32)

In [18]:
net = Net()

In [20]:
crit = nn.CrossEntropyLoss()
opt = optim.Adam(net.parameters(), lr=0.01)

In [21]:
print("Begin Training")
for epoch in range(1,5+1):
    loss_running = 0.0
    print(f"Epoch {epoch}")
    for i, batch in enumerate(loader):
        targets, embeddings = batch
        
        opt.zero_grad()
        
        outs = net(batch[embeddings])
                
        loss = crit(tt.sigmoid(outs), batch[targets])
        loss.backward()
        opt.step()
        
        loss_running += loss.item()
        
        if i % 2 == 1:
            print(f"B:{i} L:{loss_running / 2}")
            loss_running = 0.0
            
print("Finished Training")

Begin Training
Epoch 1
B:1 L:0.5519922971725464
Epoch 2


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.