In [1]:
# !wget https://www.dropbox.com/sh/3kezt1nx628rly2/AAD3rbbhB8Bwa_30dQ5ugnz2a?dl=0 -O upvote_model.zip
# !unzip -o upvote_model.zip -d upvote_model

In [2]:
!wget https://www.dropbox.com/s/xq4vosn9xyn1dy1/grouped_data.pickle?dl=0 -O grouped_data.pickle

--2021-05-02 22:39:54--  https://www.dropbox.com/s/xq4vosn9xyn1dy1/grouped_data.pickle?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/xq4vosn9xyn1dy1/grouped_data.pickle [following]
--2021-05-02 22:39:54--  https://www.dropbox.com/s/raw/xq4vosn9xyn1dy1/grouped_data.pickle
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc604ad9ce26028a159e738ee33c.dl.dropboxusercontent.com/cd/0/inline/BNwW1-eq0y1uDmxYcgVOmM9ROsEhkwOXcidiQMEg_Oj2bsqRdhEj7XY3den6l57Lqri8iajA06Ku5cXFjiTKizsG8XFS0ZYZlbhcf9qtlY9vktlJLFNuPhx0fYpYsyLzREFfd4kM9MHcQ4ui9TE8k-Cw/file# [following]
--2021-05-02 22:39:54--  https://uc604ad9ce26028a159e738ee33c.dl.dropboxusercontent.com/cd/0/inline/BNwW1-eq0y1uDmxYcgVOmM9ROsEhkwOXcidiQMEg_Oj2bsqRdhEj7XY3den6l57L

In [3]:
import pickle
import sklearn
import torch
!pip install transformers
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.notebook import tqdm
from scipy import stats
import collections
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# load sample data
# format { post_id: [post, [comment1, comment2, ... ] }

data_file_path = 'grouped_data.pickle'

def load_data(file_path):
    posts_comments = []
    upvotes = []
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
        for post_id, val in data.items():
            post = val[0]
            comms = val[1]
            for com in comms:
                try:
                    if len(post['title']) < 500 and len(com['body']) < 500 and com['body'] != '[deleted]' and com['body'] != '[removed]':
                        posts_comments.append((post['title'], com['body']))
                        upvotes.append(com['score'])
                except:
                    pass
                
    return posts_comments, upvotes
    
posts_comms, upvotes = load_data(data_file_path)

In [5]:
upvotes = np.array(upvotes)
q1 = np.quantile(upvotes, 0.6)


classes = []
for label in upvotes:
  if label < q1:
    classes.append(0)
  else:
    classes.append(1)

collections.Counter(classes)

Counter({0: 681223, 1: 722131})

In [6]:
np.quantile(upvotes, 0.25)

1.0

In [7]:
train_texts, test_texts, train_labels, test_labels = train_test_split(posts_comms, classes, test_size=.1, shuffle=True)

In [8]:
upvote_predictor = BertForSequenceClassification.from_pretrained('/content/gdrive/My Drive/upvote_model_normalized', num_labels=2)
upvote_predictor_tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-cased', use_cache=True)

In [9]:
def tokenize_responses(questions, responses):
    encodings = upvote_predictor_tokenizer(text=questions, text_pair=responses, truncation=True, padding=True)
    items = {key: torch.tensor(val) for key, val in encodings.items()}
    return items

In [10]:
questions = [item[0] for item in test_texts[100:200]]
responses = [item[1] for item in test_texts[100:200]]

In [11]:
items = tokenize_responses(questions, responses)
outs = upvote_predictor.forward(input_ids=items['input_ids'], attention_mask=items['attention_mask'], token_type_ids=items['token_type_ids'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
probs = torch.softmax(outs.logits, 1)
categories = torch.argmax(probs, dim=1)

In [13]:
sklearn.metrics.f1_score(classes[100:200], categories, average='weighted')

0.4814420062695924

In [14]:
sklearn.metrics.confusion_matrix(classes[100:200], categories)

array([[ 5, 36],
       [ 9, 50]])

In [15]:
i = 65
print(questions[i] + '\n' + '='*50 + '\n' + responses[i] + '\n%d, %d' % (classes[i+100], categories[i]))

What did or are you having for breakfast today?
A free muffin from school. Tasted ok.
0, 0


In [16]:
upvotes[i]

3

In [17]:
np.quantile(upvotes, 0.6)


2.0

In [18]:
probs.argmax(dim=0)

tensor([49, 85])

In [19]:
test_labels[100:200]

[0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1]