In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import os
import sys

# The data files are read through paths relative to the parent directory
# ("datasets/..."). Only move up when that folder is not visible yet, so that
# re-running this cell does not climb one directory further every time.
if not os.path.isdir("datasets"):
    os.chdir("..")

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import torch

import flippers

In [4]:
# Seed every RNG the notebook draws from: NumPy backs `DataFrame.sample` and the
# undersampling step, torch backs the random initialisation of the newly created
# classifier head.
np.random.seed(1)
torch.manual_seed(1)

In [5]:
# Load the three splits in one pass; the order of the paths matches the order of the targets.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/informative_youtube/train.csv",
        "datasets/informative_youtube/dev.csv",
        "datasets/informative_youtube/test.csv",
    )
)

In [6]:
# (rows, columns) of the train, dev and test splits.
train.shape, dev.shape, test.shape

((164950, 16), (20619, 16), (20619, 16))

In [7]:
# Peek at five random rows, restricted to the columns the labeling functions rely on.
train[["title", "channelTitle", "tags"]].sample(5)

Unnamed: 0,title,channelTitle,tags
57427,Inter 4-0 Genoa | Inter kick off title defence...,Serie A,Ronaldo|Serie A|Dybala|highlights|Juventus|AC ...
132056,I Built Minecraft's Most HIDDEN Base,Wenzo,minecraft smp|dream smp|smp|minecraft|demisesm...
91229,I WASN'T READY! Obi-Wan Kenobi Episode 6 BREAK...,Star Wars Comics,darth vader|vader vs|star wars comics|star war...
18120,CASH NASTY AND JULIAN NEWMAN HEATED 2V2! | Cas...,Creator League,[None]
3924,I do not recommend: Sonic Frontiers (Review),Skill Up,skill up|skill|up|gameplay|games|guide|sonic f...


# Creating labeling functions

In [8]:
from flippers.lfs import LFS

# Collection the labeling functions below are registered on via `@lfs.add(...)`;
# it is later used to build the label matrix and to read the polarities.
lfs = LFS()

# Class ids passed to `@lfs.add(...)` as the vote of each labeling function.
OTHER = 0
INFORMATIVE = 1


# Clickbait titles
@lfs.add(OTHER)
def contains_more_than_2_caps_lock_words(df):
    """Flag titles that contain a run of two or more consecutive capital letters.

    Args:
        df: DataFrame with a string ``title`` column.

    Returns:
        Boolean Series aligned with ``df``; a missing title yields ``False``.

    NOTE(review): despite the name, a single upper-case run (e.g. "SNL") is
    enough to fire -- confirm whether a word-count threshold was intended.
    """
    # na=False: a missing title carries no evidence, so it must not put NaN into the vote.
    return df["title"].str.contains(r"[A-Z]{2,}", na=False)


@lfs.add(OTHER)
def contains_more_than_2_marks(df):
    """Flag titles that contain at least one ``!`` or ``?``.

    Args:
        df: DataFrame with a string ``title`` column.

    Returns:
        Boolean Series aligned with ``df``; a missing title yields ``False``.

    NOTE(review): the name suggests "more than 2" marks, but one mark is
    enough to fire -- confirm which behaviour is intended.
    """
    # na=False: a missing title carries no evidence, so it must not put NaN into the vote.
    return df["title"].str.contains(r"!|\?", na=False)


@lfs.add(OTHER)
def contains_parentesis(df):
    """Flag titles that contain an opening parenthesis ``(``.

    Args:
        df: DataFrame with a string ``title`` column.

    Returns:
        Boolean Series aligned with ``df``; a missing title yields ``False``.
    """
    # na=False: a missing title carries no evidence, so it must not put NaN into the vote.
    return df["title"].str.contains(r"\(", na=False)


# Looking at tags (will not be used in discriminative model)
# Keywords are matched as lower-case substrings of the `tags` column.
non_informative_tags = [
    # gaming
    "funny", "gaming", "game", "minecraft", "fortnite",
    "console", "ps4", "xbox", "nintendo",
    # film and animation
    "movie", "netflix", "cartoon", "anime",
    # music
    "music", "song",
    # everything else
    "food", "vlog", "short", "unboxing", "tik", "review", "stream",
    "season", "top", "news", "trailer", "politics", "ball",
]


@lfs.add(INFORMATIVE)
def doesnt_contain_non_informative_tags(df):
    """Flag rows whose tags contain none of the ``non_informative_tags`` keywords.

    Keywords are matched as case-insensitive substrings of the whole ``tags``
    string, so e.g. "ball" also matches "football".

    Args:
        df: DataFrame with a string ``tags`` column.

    Returns:
        Boolean Series aligned with ``df``; missing tags yield ``False``.

    NOTE(review): a row whose tags are the literal "[None]" also fires, since
    that string matches no keyword. It looks like a "no tags" placeholder --
    confirm whether such rows should abstain instead.
    """
    # na=True before the inversion: missing tags end up as False (no vote) instead
    # of leaving NaN in an object Series, on which `~` raises a TypeError.
    has_non_informative_tag = (
        df["tags"].str.lower().str.contains("|".join(non_informative_tags), na=True)
    )
    return ~has_non_informative_tag


# Keywords are matched as lower-case substrings of the `tags` column.
informative_tags = [
    "science", "technology", "education", "history", "philosophy",
    "psychology", "economics", "math", "mathematics", "physics",
    "chemistry", "biology", "medicine", "health", "engineering",
    "computer science", "programming",
]


@lfs.add(INFORMATIVE)
def contains_informative_tags(df):
    """Flag rows whose tags contain at least one ``informative_tags`` keyword.

    Keywords are matched as case-insensitive substrings of the whole ``tags``
    string.

    Args:
        df: DataFrame with a string ``tags`` column.

    Returns:
        Boolean Series aligned with ``df``; missing tags yield ``False``.
    """
    # na=False: missing tags carry no evidence, so they must not put NaN into the vote.
    return df["tags"].str.lower().str.contains("|".join(informative_tags), na=False)

In [9]:
# Apply the registered labeling functions to the training frame; the displayed
# result has one column per labeling function.
L_train = lfs.create_matrix(train)
L_train

Unnamed: 0,contains_more_than_2_caps_lock_words,contains_more_than_2_marks,contains_parentesis,doesnt_contain_non_informative_tags,contains_informative_tags
0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
164945,1.0,1.0,0.0,0.0,0.0
164946,1.0,0.0,0.0,0.0,0.0
164947,1.0,0.0,1.0,0.0,0.0
164948,1.0,1.0,0.0,1.0,0.0


In [10]:
# Per-labeling-function diagnostics (polarity, coverage, overlaps, matches, conflicts).
flippers.summary(L_train, lfs.polarities)

Unnamed: 0,polarity,coverage,confidence,overlaps,matches,conflicts
contains_more_than_2_caps_lock_words,0,0.448,1.0,0.309,0.197,0.206
contains_more_than_2_marks,0,0.194,1.0,0.163,0.127,0.092
contains_parentesis,0,0.208,1.0,0.155,0.106,0.102
doesnt_contain_non_informative_tags,1,0.457,1.0,0.294,0.018,0.285
contains_informative_tags,1,0.038,1.0,0.027,0.018,0.018


# Create label model

In [11]:
from flippers.models import SnorkelModel

# NOTE(review): the positional `2` and `[0.9, 0.1]` look like the number of
# classes and a class prior for (OTHER, INFORMATIVE) -- confirm against the
# SnorkelModel signature and consider passing them by keyword.
label_model = SnorkelModel(lfs.polarities, 2, [0.9, 0.1])
label_model.fit(L_train, epochs=100)

In [12]:
# Keep column 1 of the predicted probabilities -- presumably the probability of
# class id 1 (INFORMATIVE); it is thresholded as such further down.
train["y_pred_snorkel"] = label_model.predict_proba(L_train)[:, 1]

In [13]:
# Spot-check random titles that the label model scores above 0.8.
train.query("y_pred_snorkel > 0.8")[["title", "tags", "y_pred_snorkel"]].sample(n=20)

Unnamed: 0,title,tags,y_pred_snorkel
123322,Mental Health Doesn't Discriminate feat. Lil W...,Emmanuel Acho|Lil Wayne|Tha Carter|Uncomfortab...,0.964274
43310,💖 The Tale of Tiffany 💖,cgpgrey|education|hello internet,0.964274
41008,How I Almost Ruined MrBeast Squid Game,laser cutter|william osman|crappy science|pete...,0.964274
3456,Celebrating Black Creativity with Guest Artist...,black|black history|creativity|art|illustrate|...,0.964274
132190,"Exploding Weed Seeds (28,546 fps Slow Motion)-...",Smarter|Every|Day|Science|Physics|Destin|Sandl...,0.875755
96509,let's discuss: the obsession with marilyn monroe,marilyn monroe|kim kardashian|met gala|ana de ...,0.964274
17664,Feeding My Venus Flytrap Candy Instead Of Flies,Science|venus flytrap|candy|the action lab,0.964274
155429,Why Progress Bars Don't Move Smoothly ▓▓▓░░░░░░,tom scott|tomscott|the basics|computer science,0.964274
28166,Surfside Condo Collapse: What We Know So Far,surfside condo|champlain towers|Florida|collap...,0.964274
16721,I Watched Ancient Apocalypse So You Don't Have...,archaeology|anthropology|documentary|history|l...,0.875755


In [14]:
# Spot-check random titles that the label model scores below 0.2.
train.query("y_pred_snorkel < 0.2")[["title", "tags", "y_pred_snorkel"]].sample(n=20)

Unnamed: 0,title,tags,y_pred_snorkel
101987,South Park Season 26 Teaser,cartman|south park funniest moments|south park...,1.7237e-07
131272,I WENT ON 24 DATES IN 24 HOURS!,BRENT rivera|mr. beast|Ryan trahan|faze rug|Pi...,5.793655e-09
102972,Minecraft Live 2022: Vote for the Rascal!,minecraft live 2022|mob vote|minecraft mob vot...,4.759664e-08
130806,'All net': Barack Obama hits silky three-point...,barack obama|barack obama three pointer|barack...,1.7237e-07
96237,Challenging Bob Does Sports To A Match | Bryso...,Challenging Bob Does Sports To A Match | Bryso...,1.7237e-07
130858,Chad on Mars - SNL,snl|saturday night live|season 46|snl 46|snl h...,2.098158e-08
34857,"*NEW* SEASON 4 UPDATE! (NEW DLC WEAPONS, BATTL...",black ops cold war season 4|black ops cold war...,1.512987e-09
164299,"100 Days, But It's ONE SINGLE CHUNK",minecraft|minecraft hardcore|100 days|minecraf...,2.098158e-08
58366,Run BTS! 2022 Special Episode - 'RUN BTS TV' O...,방탄소년단|BTS|BANGTAN|알엠|RM|슈가|SUGA|제이홉|jhope|지민|정...,0.09855184
125250,When Your Plastic Bottle Finally Degrades,ryan george|theryangeorge|funny|sketch|skit|fi...,1.7237e-07


# Train a transformer discriminative model

In [15]:
# !pip install transformers==4.28.0
# !pip install --upgrade accelerate evaluate

In [16]:
# Only keep the titles that are likely to be labeled correctly, i.e. those the
# label model scores above 0.8 or below 0.2.
transformer_train = train[
    (train["y_pred_snorkel"] > 0.8) | (train["y_pred_snorkel"] < 0.2)
]
sentences = transformer_train["title"].tolist()
# After the filter above, "score > 0.8" separates positives (1) from negatives (0).
labels = transformer_train["y_pred_snorkel"].gt(0.8).astype(int).tolist()

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the weakly labelled titles for validation, using a fixed seed
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [18]:
from datasets import Dataset, DatasetDict

# Wrap each (texts, labels) pair in a `Dataset` with "text" and "labels" columns.
train_dataset, val_dataset = (
    Dataset.from_dict({"text": texts, "labels": targets})
    for texts, targets in (
        (train_sentences, train_labels),
        (val_sentences, val_labels),
    )
)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Undersampling: keep every positive example and at most `rebalance_ratio`
# randomly chosen negatives per positive.

rebalance_ratio = 4
train_positives = sum(train_labels)

positive_indices = [i for i, label in enumerate(train_labels) if label == 1]
negative_indices = [i for i, label in enumerate(train_labels) if label == 0]

random_negative_indices = np.random.choice(
    negative_indices,
    min(rebalance_ratio * train_positives, len(negative_indices)),
    replace=False,
)

# astype(int): concatenating with an empty positive list would otherwise yield
# float indices.
under_sample_indices = np.concatenate(
    [positive_indices, random_negative_indices]
).astype(int)

# Rebuild the dataset from the untouched Python lists instead of selecting from
# `train_dataset` itself. The indices refer to `train_sentences`/`train_labels`,
# so this cell stays valid when it is re-run after `train_dataset` has already
# been reduced.
train_dataset = Dataset.from_dict(
    {
        "text": [train_sentences[i] for i in under_sample_indices],
        "labels": [train_labels[i] for i in under_sample_indices],
    }
)

In [20]:
# Evaluate on a fixed random subset of the validation split: up to 300 rows,
# capped at the split size so the selection cannot run past the end.
small_dev_dataset = val_dataset.shuffle(seed=42).select(
    range(min(300, len(val_dataset)))
)
train_dataset = train_dataset.shuffle(seed=42)

In [None]:
# Set DistilBERT tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both splits batch-wise with the same function.
tokenized_train, tokenized_eval = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, small_dev_dataset)
)

In [None]:
# Use data_collator to convert our samples to PyTorch tensors and batch them with the correct amount of padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification

# Same checkpoint as the tokenizer; num_labels=2 matches the two classes
# (OTHER = 0, INFORMATIVE = 1).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [25]:
# Setup evaluation
import evaluate

metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the loaded F1 metric.

    The predicted class of each row is the argmax over the last logits axis.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

In [26]:
import torch
from torch import nn
from transformers import TrainingArguments, Trainer

# Weight the positive class by the same factor that was used when undersampling
# the negatives. Fall back to the CPU when no GPU is present instead of
# hard-coding "cuda".
class_weights = (
    torch.tensor([1, rebalance_ratio])
    .float()
    .to("cuda" if torch.cuda.is_available() else "cpu")
)

# Create a custom loss function to balance loss
loss_function = nn.CrossEntropyLoss(weight=class_weights)


class CustomTrainer(Trainer):
    """Trainer whose loss is the class-weighted cross-entropy defined above."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its "labels" entry is removed before the
                forward pass, so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra keyword arguments passed by the caller; accepted
                and ignored so the override does not break when the Trainer
                supplies additional arguments.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Keep the class weights on the same device as the logits, whichever
        # device the model ended up on.
        loss = loss_function.to(logits.device)(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [27]:
# Create a model finetuning trainer

# Used only as the local output directory (push_to_hub is disabled).
# NOTE(review): the name mentions "3000-samples", but the training-set size is
# determined by the undersampling step -- consider renaming.
repo_name = "finetuning-informative-model-3000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
)

# CustomTrainer swaps in the class-weighted loss; `tokenized_eval` is only the
# small validation subset.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [28]:
# Train the model on `tokenized_train` (the same `model` object is reused by the
# inference pipeline below)
trainer.train()

  0%|          | 0/670 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 75%|███████▍  | 501/670 [00:54<00:18,  9.23it/s]

{'loss': 0.3082, 'learning_rate': 2.537313432835821e-06, 'epoch': 1.49}


100%|██████████| 670/670 [01:15<00:00,  8.88it/s]

{'train_runtime': 75.4632, 'train_samples_per_second': 141.765, 'train_steps_per_second': 8.879, 'train_loss': 0.27421511180365266, 'epoch': 2.0}





TrainOutput(global_step=670, training_loss=0.27421511180365266, metrics={'train_runtime': 75.4632, 'train_samples_per_second': 141.765, 'train_steps_per_second': 8.879, 'train_loss': 0.27421511180365266, 'epoch': 2.0})

In [29]:
# Compute the evaluation metrics on `tokenized_eval`, the validation subset passed as eval_dataset
trainer.evaluate()

100%|██████████| 19/19 [00:00<00:00, 30.57it/s]


{'eval_loss': 0.16583208739757538,
 'eval_f1': 0.4,
 'eval_runtime': 0.6735,
 'eval_samples_per_second': 445.403,
 'eval_steps_per_second': 28.209,
 'epoch': 2.0}

In [30]:
from transformers import TextClassificationPipeline

# return_all_scores=True makes the pipeline return the score of every label for
# each input. Use the first GPU when one is available and fall back to the CPU
# (device=-1) otherwise, instead of hard-coding device 0.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1,
)



In [31]:
# Spot check on a hand-written title; the output lists the score of each label.
pipe("Solving the Riemann Hypothesis")

[[{'label': 'LABEL_0', 'score': 0.01876850239932537},
  {'label': 'LABEL_1', 'score': 0.9812315106391907}]]

In [32]:
# Spot check on a hand-written gaming title.
pipe("Super Smash Bros. Ultimate - The Fastest Way to Unlock All Characters")

[[{'label': 'LABEL_0', 'score': 0.98139888048172},
  {'label': 'LABEL_1', 'score': 0.01860111765563488}]]

In [33]:
# Spot check on a hand-written how-to title.
pipe("How to make a YouTube video")

[[{'label': 'LABEL_0', 'score': 0.02162959612905979},
  {'label': 'LABEL_1', 'score': 0.9783704876899719}]]

In [34]:
# Score a random 1000-row subset of the dev split.
# NOTE: this overwrites `dev` with the subset and draws from the global NumPy
# RNG, so the rows picked depend on the random draws made in earlier cells.
dev = dev.sample(1000)
y_pred_dev = pipe(dev["title"].tolist())

In [35]:
# Keep, for every title, the score of the second label entry (LABEL_1 in the
# pipeline outputs shown above).
y_pred_dev = [label_scores[1]["score"] for label_scores in y_pred_dev]

In [36]:
# Attach the scores to the sampled dev rows; they are in the same order as the
# titles that were passed to the pipeline.
dev["y_pred"] = y_pred_dev

In [37]:
# Random dev titles the classifier scores above 0.5.
dev.query("y_pred > 0.5")[["title", "y_pred"]].sample(n=15)

Unnamed: 0,title,y_pred
2893,I Transformed My Washing Machine into a Fish Tank,0.982048
4226,Why Phineas and Ferb Brigade Shouldn’t Work,0.661624
11848,Bad IRISH ACCENTS That Even I Can't Understand,0.917264
19846,Historically Accurate Frozen,0.984362
17958,Eating the Worst Rated Food with Cold Ones,0.807342
17928,Why Samsung is about to take over.,0.852245
9619,how they wrote songs in the 50s,0.503052
5098,The coolest Internet Hack ever?,0.95631
12974,This gas on Venus could be a new sign of life,0.983318
1747,The Aston Martin Lagonda Taraf Is the World's ...,0.501427


In [38]:
# Random dev titles the classifier scores below 0.5.
dev.query("y_pred < 0.5")[["title", "y_pred"]].sample(n=15)

Unnamed: 0,title,y_pred
5861,RC Helicopter Battle (Behind The Scenes),0.017589
19539,PFAS: Last Week Tonight with John Oliver (HBO),0.017774
9003,Scoring 1 AMAZING Goal With Every World Cup Ba...,0.084454
20200,The Death of Late Night TV,0.38928
5107,Let’s Speedrun Dumb Ways To Die 2,0.019334
971,BABY TRIES SOLID FOOD FOR THE FIRST TIME! BABY...,0.011797
15981,"Ken Block’s 1,400hp AWD Ford Mustang Hoonicorn...",0.079533
313,"Minecraft, But You Can Invent Anything...",0.015935
17693,Deontay Wilder vs. Robert Helenius | Preview H...,0.014548
14750,GloRilla Glows Up In Every Way With Performanc...,0.01982
