#Bert Explorations

Loading a subset of the data

In [74]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [75]:
import os

In [76]:
path = "/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/"

In [77]:
# work with a subset of the data
train_data_subset = [os.path.join(path, "pos/train_subject127.xml"),
              os.path.join(path, "pos/train_subject337.xml"),
              os.path.join(path, "neg/train_subject8735.xml"), 
              os.path.join(path, "neg/train_subject8803.xml"),
              os.path.join(path, "neg/train_subject8834.xml")]
val_data_subset = [os.path.join(path, "pos/train_subject127.xml"),
              os.path.join(path, "pos/train_subject888.xml"),
              os.path.join(path, "neg/train_subject7679.xml"), 
              os.path.join(path, "neg/train_subject7720.xml")]              
test_data_subset = [os.path.join(path, "pos/test_subject1488.xml"),
              os.path.join(path, "neg/test_subject50.xml"),
              os.path.join(path, "neg/test_subject96.xml")]

In [78]:
# parse data (function in data.py)
from dataclasses import dataclass
from datetime import datetime
import dateutil.parser
from pathlib import Path
from typing import List, Optional
from xml.etree import ElementTree as ET


@dataclass
class Post:
    title: str
    date: datetime
    text: str


@dataclass
class Subject:
    id: str
    posts: List[Post]
    label: Optional[bool]

    def __str__(self) -> str:
        return f"<Subject '{self.id}' ({self.label})>"

    def __hash__(self) -> int:
        return hash(self.id)


def parse_subject(filename: str) -> Subject:
    individual = ET.parse(filename)
    subject_id = individual.findtext("ID")
    posts = []
    for writing in individual.iterfind("WRITING"):
        title = writing.findtext("TITLE").strip()
        date = dateutil.parser.isoparse(writing.findtext("DATE").strip())
        text = writing.findtext("TEXT").strip()
        posts.append(Post(title, date, text))
    posts.sort(key=lambda post: post.date)
    label = Path(filename).parent.name
    assert label in ["neg", "pos"]
    return Subject(subject_id, posts, label == "pos")

In [58]:
# take a look at id, label and texts from one parsed subject
example_subject = parse_subject(train_data_subset[0])
print(example_subject.id)
print(example_subject.label) # True = pos = depressed

# get texts from this one subject
example_texts = []
for p in example_subject.posts:
  example_texts.append(p.text)
print(example_texts[0:10])

train_subject127
True
['A lot of the time I have trouble communicating why I feel so unhappy because there is no one core reason. I hate it when people say "You\'re not making any sense." after they ask me repeatedly to explain how I feel even after I tell them "I don\'t know" ...', "I try to count back from 10 but it's still really hard... what do you do to calm down? I get really angry like a spike of emotion, then I get angry and sad at myself for being so petty", "I barely stop myself, but whenever something annoys me I think of the most hurtful comment that I could possibly say to them, and for that split second I feel that I am completely justified to say that. But even when someone is saying nothing I feel like they are silently judging me somehow. I know it's crazy but nothing helps me from thinking that the world is against me or something (not meant to sound as melodramatic but I can't think of a better way to explain)", "I'm really glad I'm not alone, I even get self-conscio

Analysen zur Postlänge --> Motivation für Modelle und max input size:

In [91]:
# explore post length (max, min, mean length, outliers etc.)
post_lengths = []

for s in train_data_subset:
  s_parsed = parse_subject(s)
  for p in s_parsed.posts:
    post_lengths.append(len(p.text.split()))

post_lengths.sort()


def get_metrics(lengths_list, percent_top, percent_bottom):
    """Input: sorted list of string lengths.
    Getting some metrics after cutting off some percentage of longest and shortest posts."""
    if percent_bottom == 0 and percent_top == 0:
        post_lengths = lengths_list
        length = len(post_lengths)
        max_len = max(post_lengths)
        min_len = min(post_lengths)
        mean_len = sum(post_lengths)/length
    else:
        post_lengths = lengths_list[int(len(lengths_list) * percent_bottom): int(len(lengths_list) * (1 - percent_top))]
        length = len(post_lengths)
        max_len = max(post_lengths)
        min_len = min(post_lengths)
        mean_len = sum(post_lengths) / length

    return length, max_len, min_len, mean_len

print("All data:")
metrics = get_metrics(post_lengths, 0, 0) # TODO: change to see how much the metrics change if we cut off parts of the data
print(f"No. of posts:\t {metrics[0]} \nMax. Length:\t {metrics[1]} "
      f"\nMin. Length:\t {metrics[2]} \nMean. Length:\t {metrics[3]} ")
print("After cutting off 5% at either end")
metrics = get_metrics(post_lengths, 0.05, 0.05) 
print(f"No. of posts:\t {metrics[0]} \nMax. Length:\t {metrics[1]} "
      f"\nMin. Length:\t {metrics[2]} \nMean. Length:\t {metrics[3]} ")
print("After cutting off 5% at the bottom and 10% at the top:")
metrics = get_metrics(post_lengths, 0.05, 0.1) # 
print(f"No. of posts:\t {metrics[0]} \nMax. Length:\t {metrics[1]} "
      f"\nMin. Length:\t {metrics[2]} \nMean. Length:\t {metrics[3]} ")

All data:
No. of posts:	 1693 
Max. Length:	 506 
Min. Length:	 0 
Mean. Length:	 24.189013585351447 
After cutting off 5% at either end
No. of posts:	 1524 
Max. Length:	 86 
Min. Length:	 0 
Mean. Length:	 18.187007874015748 
After cutting off 5% at the bottom and 10% at the top:
No. of posts:	 1439 
Max. Length:	 86 
Min. Length:	 2 
Mean. Length:	 19.2043085476025 


The max post length here is 506 tokens which fits in Bert (512). 
I calculated these numbers also for the entire dataset: 


Metrics for the entire data:
* No. of posts:	 1076582 
* Max. Length:	 8167 
* Min. Length:	 0 
* Mean. Length:	 28.099796392657503 


After cutting off 2% off the data:
* No. of posts:	 1033519 
* Max. Length:	 183 
* Min. Length:	 0 
* Mean. Length:	 20.145145856051027 
-----------------------------
How many of the strings have the length 0? (in percent):
26.878491373625046

After cutting all the 0s and 5% at the top:
No. of posts:	 733383 
* Max. Length:	 104 
* Min. Length:	 1 
* Mean. Length:	 22.455915940238594 

After cutting all the 0s and 15% at the top:
No. of posts:	 625725 
* Max. Length:	 45 
* Min. Length:	 1 
* Mean. Length:	 14.844070478245236 

-----------------------------
**Motivation BERT:** Wenn wir alle leeren Posts und die top 5% der längsten Posts weglassen, gibt es eine durchschnittliche Postlänge von 22 tokens, also kriegt man in BERT 23 posts rein (512 tokens max) --> bringt was für die Entscheidung nach 5 Posts (ERDE5). 

**Motivation Longformer:** Für ERDE50 (History von 50 Posts). Da kriegt man 180 posts der Länge 22 tokens rein.  
Es stellt sich die Frage: lohnt es sich, beim Transformer die Länge auf 4046 zu setzen? Ja, weil wenn wir 5% oben weg klippen, dann haben wir eine maximale Post Länge von 104 tokens -> 50x 104 = 5200, also brauchen wir die maximale sequence length, wenn wir eine history von 50 posts rein laden wollen. 

##1. Preparation of training data
###Conceptual Idea: 

* during testing, the model will see the current post and the history in the following form: 
    * [[CLS] Post18, Post17, Post16, ..., Post4, [SEP]] 
    * where Post 18 is the current, and Post 4 is the oldest one that fits in the model gets truncated
* we want to exploit the max. sequence length of the model for this (512/ 4096) 
* But during testing, the model will also see history in the following form: 
  * [[CLS], Post4, Post3, Post2, Post1, [PAD], [PAD], [PAD], [SEP]]
  * Which is important for ERDE5! So we will also create data in this form so that the model learns to predict if there is not a lot of history. 

Thus, we want to give the model different amounts of history during training, e.g.: 
* current post + as much history as possible history as possible (max seq.length)
* current post + 49 posts as history, if that fits in the model, else: as much history as fits in the model. Padding in the beginning, if necessary. 
* (current post + 40 posts as history, if that fits in the model, else: as much history as fits in the model). Padding in the beginning, if necessary. 
* (current post + 30 posts as history, if that fits in the model, else: as much history as fits in the model). Padding in the beginning, if necessary. 
* (current post + 20 posts as history, if that fits in the model, else: as much history as fits in the model). Padding in the beginning, if necessary. 
* (current post + 10 posts as history, if that fits in the model, else: as much history as fits in the model). Padding in the beginning, if necessary. 
* current post + 4 posts as history, if that fits in the model, else: as much history as fits in the model. Padding in the beginning, if necessary. 
* current post + 3 posts as history, if that fits in the model, else: as much history as fits in the model. Padding in the beginning, if necessary. 
* current post + 2 posts as history, if that fits in the model, else: as much history as fits in the model. Padding in the beginning, if necessary. 
* current post + 1 posts as history, if that fits in the model, else: as much history as fits in the model. Padding in the beginning, if necessary. 
* current post only

Questions: 
* do the intermediate steps make sense or should we maybe just use 49 and 4 posts as history?


###Implementation:

In [105]:
dummy_posts = ["Post 1.", "Post 2.", "Post 3?", "Post 4.", "Post 5!", "Post 6.", "Post 7.", "Post 8!"]

In [173]:
def merge_posts(posts, number, overlap, max_len):
  """
  Takes a list of strings (list of all posts by one subject) and merges strings 
  in the list according to the specifications from the parameters. The strings are
  merged in reverse order so that the oldest post is to the right and the newest 
  post is to the left. 
  :param posts: a list of strings (posts by one subject)
  :param number: the number of strings that should get merged into one string, 
  must be > 0 (e.g. number = 2 will always merge two strings together)
  :param overlap: 0 if no overlap, 1 if 1 string overlap etc.
  :param max_len: maximal input length for model (e.g. 512 or 4096) 
  """
  
  merged_posts = []
  step = number-overlap
  for i in range(0, len(posts)-1, step):
    # put the number of required sentences in a list
    count = 0 # repeat while loop as many times as the number of sentences we want to concatinate
    step2 = 0 # counter so it knows which sentence to pick next 
    merged_sentence = [] # list for required sentences that need to be merged together
    
    while count < number: # for as many times as the number of sentences we want to concatinate
      try:
        sentence = posts[i+step2]  
        count += 1 # make one more iteration if the number of required sentence hasn't been reached yet
        step2 += 1 # take one sentence to the right next time

        merged_sentence.append(sentence)
      except IndexError:
        break
    
    # nur sätze nehmen, bei denen es aufgeht (=duplikate vermeiden) und die ins modell passen
    if len(merged_sentence) == number:  
      merged_sentence.reverse() # newest post on the left (will be truncated on the right)
      merged_sent_str = ' '.join(merged_sentence)
      if len(merged_sent_str.split()) <= max_len:
        merged_posts.append(merged_sent_str)
  
  

  return merged_posts

print("Initial list of strings:\t",dummy_posts)
print("Merging 2 sentences, no overlap:",merge_posts(dummy_posts, 2, 0, 512))
print("Merging 2 sentences, overlap 1:\t",merge_posts(dummy_posts, 2, 1, 512))
print("Merging 3 sentences, no overlap:",merge_posts(dummy_posts, 3, 0, 512))
print("Merging 3 sentences, overlap 2:\t",merge_posts(dummy_posts, 3, 2, 512))
print("Merging 4 sentences, overlap 2:\t",merge_posts(dummy_posts, 4, 2, 8))


Initial list of strings:	 ['Post 1.', 'Post 2.', 'Post 3?', 'Post 4.', 'Post 5!', 'Post 6.', 'Post 7.', 'Post 8!']
Merging 2 sentences, no overlap: ['Post 2. Post 1.', 'Post 4. Post 3?', 'Post 6. Post 5!', 'Post 8! Post 7.']
Merging 2 sentences, overlap 1:	 ['Post 2. Post 1.', 'Post 3? Post 2.', 'Post 4. Post 3?', 'Post 5! Post 4.', 'Post 6. Post 5!', 'Post 7. Post 6.', 'Post 8! Post 7.']
Merging 3 sentences, no overlap: ['Post 3? Post 2. Post 1.', 'Post 6. Post 5! Post 4.']
Merging 3 sentences, overlap 2:	 ['Post 3? Post 2. Post 1.', 'Post 4. Post 3? Post 2.', 'Post 5! Post 4. Post 3?', 'Post 6. Post 5! Post 4.', 'Post 7. Post 6. Post 5!', 'Post 8! Post 7. Post 6.']
Merging 4 sentences, overlap 2:	 ['Post 4. Post 3? Post 2. Post 1.', 'Post 6. Post 5! Post 4. Post 3?', 'Post 8! Post 7. Post 6. Post 5!']


The function above merges a requested number of strings into a single strings. We now need to augment the data by specifying, which numbers we require:


In [177]:
numbers_to_concatinate = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] # which numbers of posts to concatinate

def data_augmentation(posts, numbers_concat, overlap, max_len):
  """
  Function to augment the training and validation data. 
  Takes a list of strings and returns concatinations of 2 posts, 3 posts, etc.
  The newest post is always at the beginning of the string, the oldest at the end. 
  :param posts: a list of strings (posts by one subject)
  :param numbers_concat: list of integers that determines how many strings should be concatinated. 
  :param overlap: 0 if no overlap, 1 if 1 string overlap etc.
  :param max_len: maximal input length for model (e.g. 512 or 4096)
  """
  
  augmented_data = []

  # current post only (no history)
  for post in posts:
    augmented_data.append(post)

  # current post + n posts of history 
  for n in numbers_concat:
    for s in merge_posts(posts, n, 0, 512):  # TODO: try out if it works better with an overlap (e.g. overlap 10% of n --> more data)
  
        augmented_data.append(s)
    

  return augmented_data

for i in data_augmentation(dummy_posts, numbers_to_concatinate,0, 512):
  print(i)


Post 1.
Post 2.
Post 3?
Post 4.
Post 5!
Post 6.
Post 7.
Post 8!
Post 2. Post 1.
Post 4. Post 3?
Post 6. Post 5!
Post 8! Post 7.
Post 3? Post 2. Post 1.
Post 6. Post 5! Post 4.
Post 4. Post 3? Post 2. Post 1.
Post 8! Post 7. Post 6. Post 5!
Post 5! Post 4. Post 3? Post 2. Post 1.
['Post 1.', 'Post 2.', 'Post 3?', 'Post 4.', 'Post 5!', 'Post 6.', 'Post 7.', 'Post 8!', 'Post 2. Post 1.', 'Post 4. Post 3?', 'Post 6. Post 5!', 'Post 8! Post 7.', 'Post 3? Post 2. Post 1.', 'Post 6. Post 5! Post 4.', 'Post 4. Post 3? Post 2. Post 1.', 'Post 8! Post 7. Post 6. Post 5!', 'Post 5! Post 4. Post 3? Post 2. Post 1.']


Using the two functions above to prepare the actual data for the model: 
For each subject, we have a label and a list of texts. The model expects a list of texts and a list of labels of the same length as the list of texts. For a subject, we need to: 
* apply the data augmentation function to the list of texts
* create a list of labels which corresponds to the length of the list of text

Then, we put all data in one big list for all subjects. 

In [79]:
print(train_data_subset)
print(val_data_subset)

['/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/pos/train_subject127.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/pos/train_subject337.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/neg/train_subject8735.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/neg/train_subject8803.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/neg/train_subject8834.xml']
['/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/pos/train_subject127.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/pos/train_subject888.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/data/training_t2/TRAINING_DATA/2017_cases/neg/train_subject7679.xml', '/content/drive/MyDrive/NLPML2_Bert_Explorations/d

In [178]:
numbers_to_concatinate = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100] # which numbers of posts to concatinate

def prepare_subject_data(filename, numb_conc, overlap, max_len):
  """Takes a filename for a subject and returns two lists: 
  - list of labels of the same length as the list of augmented posts data
  - augmented data: list of merged posts
  :param filename: xml file for a subject
  :param numb_conc: list with numbers that determine how many posts of a subject should be concatinated.
  :param overlap: 0 if no overlap, 1 if 1 string overlap etc.
  :param max_len: maximal input length for model (e.g. 512 or 4096) 
  """
  # mapping label
  labels = {True:1, False:0}

  # parse subject
  subject = parse_subject(filename)
  subject_id = subject.id
  subject_texts = [p.text for p in subject.posts]

  if subject.label == True:
    subject_label = 1
  elif subject.label == False: 
    subject_label = 0
  
  # augment text
  augmented_texts = data_augmentation(subject_texts, numb_conc, overlap, max_len)

  # get list with labels which is as long as augmented text list
  labels = [subject_label] * len(augmented_texts)

  return labels, augmented_texts

prepare_subject_data(train_data_subset[0], numbers_to_concatinate, 0, 512)

([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 ['A lot of the time I have trouble communicating why I feel so unhappy because there is no one core reason. I hate it when people say "You\'re not making any sense." after they ask me repeatedly to explain how I feel even after I tell them "I don\'t know" ...',
  "I try to count back from 10 but it's still really hard... what do you do to calm down? I get really angry like a spike of emotion, then I get angry and sad at myself for being so petty",
  "I barely stop myself, but whenever something annoys me I think of the most hurtful comment that I could possibly say to them, and for that split second I feel that I am completely justified to say that. But even when someone is saying nothing I feel like they are silently judging me somehow. I know it's crazy but nothing helps me from thinking that the world is against me or something (not meant to sound as me

In [187]:
# create those lists for each dataset (e.g. train data) -> this we can then load into the Dataset class
def prepare_dataset(dataset, numb_conc, overlap, max_len):
  """Takes a list of file names (e.g. all file names from test set) and returns 
  a list of labels and a list of strings that can be fed into the Dataloader class.
  :param dataset: list of xml file names
  :param overlap: 0 if no overlap, 1 if 1 string overlap etc.
  :param max_len: maximal input length for model (e.g. 512 or 4096) 
  """
  all_labels = []
  all_texts = []
  for subject in dataset:
    info = prepare_subject_data(subject, numb_conc, overlap, max_len)
    for i in info[0]:
      all_labels.append(i)
    for i in info[1]:
      all_texts.append(i)

  return all_labels, all_texts

prepare_dataset(train_data_subset, numbers_to_concatinate, 0, 512) # results in 


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


This results in 4018 data points for our train subset with a max seq len of 512 and no overlap. Play around with that to augment the dataset differently

Now we also need to prepare a reversed list with the test data. 

In [198]:
def prepare_test_dataset(dataset, numb_conc, overlap, max_len):
  """Takes a list of file names (all file names from test set) and returns 
  a list concatinated sentences to test the model with.
  :param dataset: list of xml file names
  :param overlap: 0 if no overlap, 1 if 1 string overlap etc.
  :param max_len: maximal input length for model (e.g. 512 or 4096) 
  """
  #all_labels = []
  all_texts = []
  for subject in dataset:
    for i in prepare_subject_data(subject, numb_conc, overlap, max_len)[1]:
      all_texts.append(i)
    
  return all_texts

prepare_test_dataset(test_data_subset, [5], 0, 512)

["Sports hold more meaning than... I don't even have an example what girls talk about because all I hear is blah blah blah. \n\nI'm a female btw.",
 "Checked briefly his history and did a search that didn't show up with anything. Wasn't until now that I checked the universal block page and even though i scrolled down alphabetically, I did not scroll down ALL the way and see his name. Also now found more dirt on him. Filed a claim with Paypal and hope to get my money back.",
 'whoa, that one is firey! Thanks for the knowledge!',
 "I haven't had this account for 2 months, yet I've gotten 2 months of Reddit Gold so far. I see no perks about it. If there was a better tradeoff, I probably would do it. It'd just me a symbol for our connection, this time and place, who I am and who you are.",
 "Yes and no. Maybe I should say Art AS therapy in this case. I was thinking more out loud and probably should have tweeked my wording before posting. My idea is simply to make art, do artistic things, g

In [199]:
# apply the functions to prepare train, val and test data for the DataLoader Class
prepared_train_data = prepare_dataset(train_data_subset, numbers_to_concatinate, 0, 512)
prepared_val_data = prepare_dataset(val_data_subset, numbers_to_concatinate, 0, 512)
prepared_test_data = prepare_test_dataset(test_data_subset, [5], 0, 512)

## Dataset Class

In [203]:
!pip install datasets transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 27.7 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 47.4 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 25.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 58.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 35.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-

In [204]:
import transformers

In [205]:
import torch
import numpy as np
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#labels = {'business':0,
          #'entertainment':1,
          #'sport':2,
          #'tech':3,
          #'politics':4
          #}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, dataset):

        self.labels = dataset[0]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in dataset[1]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Model Building

In [206]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

## Training Loop

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, prepared_train_data, prepared_val_data, LR, EPOCHS)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 67%|██████▋   | 1366/2041 [4:57:14<2:33:55, 13.68s/it]

## Evaluate Model

In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, prepared_test_data)