In [None]:
import numpy as np
import pandas as pd
from fastai.text.all import *
import re

# Import the data and clean it

In [None]:
# Competition CSVs are read from the Kaggle input directory for this dataset.
dir_path = "/kaggle/input/nlp-getting-started/"
train_df, test_df = (
    pd.read_csv(dir_path + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Display the training frame exactly as it was read from train.csv.
train_df

In [None]:
# Only the tweet text and its label are used from here on.
train_df = train_df.drop(labels=["id", "keyword", "location"], axis=1)

In [None]:
# Count the rows per value of the "target" column to check class balance.
train_df["target"].value_counts()

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www.`` link deleted.

    A link is matched up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from both splits so train and test share the same preprocessing.
for _frame in (train_df, test_df):
    _frame["text"] = _frame["text"].apply(remove_URL)

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is kept. A ``<`` without a later ``>`` on the same line is
    left untouched.
    """
    return re.sub(r'<.*?>', r'', text)

# Strip HTML tags from both splits so train and test share the same preprocessing.
for _frame in (train_df, test_df):
    _frame["text"] = _frame["text"].apply(remove_html)

In [None]:
# Compiled once at definition time instead of on every call.
# The original character class contained the range U+24C2..U+1F251, which
# spans most of the Basic Multilingual Plane and therefore also deleted
# ordinary letters (CJK, Hangul, ...). The ranges below are limited to
# symbol/emoji blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors attached to emoji
    "\U000024C2"             # circled latin capital letter M
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only code points inside the symbol/emoji blocks listed in
    ``_EMOJI_PATTERN`` are deleted; letters of any script are preserved.

    Args:
        text: the string to clean.

    Returns:
        A new string with every matched run of emoji characters removed.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Strip emoji from both splits so train and test share the same preprocessing.
for _frame in (train_df, test_df):
    _frame["text"] = _frame["text"].apply(remove_emoji)

In [None]:
# Display train_df again after the URL, HTML and emoji clean-up steps.
train_df

In [None]:
# Histogram of whitespace-separated word counts per tweet.
train_df["text"].str.split().str.len().plot.hist();

# Get tokens for the transformer

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the tokenizer for the "roberta-large" checkpoint, the same checkpoint
# name the classifier is loaded from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

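From the histogram above, we can see that the longest tweet has 30 words, so I set `max_length` to 30. Note that `max_length` counts subword tokens (including the special start and end tokens), not words, so any tweet that tokenizes to more than 30 tokens is truncated.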

In [None]:
# Tokenize every training tweet to a fixed width of 30 token ids
# (shorter tweets are padded, longer ones truncated) and keep only the ids.
train_encoding = tokenizer(
    train_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=30,
    return_tensors="pt",
)
train_tensor = train_encoding["input_ids"]

# Preparing datasets and dataloaders

In [None]:
class TweetDataset:
    """Dataset over a subset of rows, pairing each input row with its label.

    Args:
        tensors: object indexable as ``tensors[ids, :]`` holding one input row
            per example.
        targ: pandas Series of labels, selected with ``targ[ids]``.
        ids: row indices that belong to this split.
    """

    def __init__(self, tensors, targ, ids):
        # Keep only this split's rows; the labels are re-indexed from 0 so
        # that position ``i`` in ``text`` lines up with label ``i`` in ``targ``.
        self.text = tensors[ids, :]
        split_labels = targ[ids]
        self.targ = split_labels.reset_index(drop=True)

    def __len__(self):
        """Number of examples in this split."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(input_row, label)`` for position ``idx``; the label is wrapped with ``tensor``."""
        return self.text[idx], tensor(self.targ[idx])

In [None]:
# Seed the splitter so the train/validation partition is identical on every
# run; without a seed each kernel restart evaluates on different rows.
train_ids, valid_ids = RandomSplitter(seed=42)(train_df)


target = train_df["target"]

train_ds = TweetDataset(train_tensor, target, train_ids)
valid_ds = TweetDataset(train_tensor, target, valid_ids)

# fastai's DataLoader does not shuffle unless asked to, so request it
# explicitly for training; validation order is left untouched.
train_dl = DataLoader(train_ds, bs=64, shuffle=True)
valid_dl = DataLoader(valid_ds, bs=512)
dls = DataLoaders(train_dl, valid_dl).to("cuda")

# Get the model

In [None]:
bert = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=2).train().to("cuda")

class BertClassifier(Module):
    """Adapter that lets fastai train a Hugging Face sequence classifier.

    The wrapped model is called with the token ids and the adapter returns
    only the ``logits`` field of its output.

    Args:
        bert: a Hugging Face sequence-classification model.
    """
    def __init__(self, bert):
        self.bert = bert
        # Padding id used to rebuild the attention mask from the ids alone;
        # may be None if the model config does not define one.
        self.pad_token_id = getattr(bert.config, "pad_token_id", None)

    def forward(self, x):
        """Return classification logits for a batch of token ids ``x``."""
        if self.pad_token_id is None:
            return self.bert(x).logits
        # Inputs are padded to a fixed length, so mask out the padding
        # positions; otherwise the model attends to pad tokens.
        attention_mask = (x != self.pad_token_id).long()
        return self.bert(input_ids=x, attention_mask=attention_mask).logits

model = BertClassifier(bert)

# Start training

In [None]:
# Pass the loss explicitly: TweetDataset defines no `loss_func` attribute for
# fastai to infer one from. A plain nn.CrossEntropyLoss is used so that
# get_preds keeps returning raw logits, which the later cells pass through
# F.softmax themselves.
learn = Learner(dls, model, loss_func=nn.CrossEntropyLoss(),
                metrics=[accuracy, F1Score()]).to_fp16()
learn.lr_find()

In [None]:
# Fine-tune for 3 epochs with the one-cycle schedule, peaking at lr 1e-5.
learn.fit_one_cycle(3, lr_max=1e-5)

# Find the best threshold for f1 score

In [None]:
from sklearn.metrics import f1_score

preds, targs = learn.get_preds()

# Probability of the positive class; computed once because it does not
# depend on the threshold being tried.
positive_probs = F.softmax(preds, dim=1)[:, 1]

# `min_threshold` holds the threshold with the best F1 seen so far. The name
# is kept because the submission cell reads it.
min_threshold = None
max_f1 = -float("inf")
thresholds = np.linspace(0.3, 0.7, 50)
for threshold in thresholds:
    f1 = f1_score(targs, positive_probs > threshold)
    if f1 > max_f1:
        min_threshold = threshold
        # Update the running best; previously a different variable was
        # assigned here, so the comparison above was always true and the
        # last threshold tried was selected regardless of its score.
        max_f1 = f1
    print(f"threshold:{threshold:.4f} - f1:{f1:.4f}")

print(f"best threshold:{min_threshold:.4f} - f1:{max_f1:.4f}")

# Make prediction on the test set and submit the prediction

In [None]:
# Tokenize the test tweets with the same settings as the training tweets
# (fixed width of 30 token ids) and keep only the ids.
test_encoding = tokenizer(
    test_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=30,
    return_tensors="pt",
)
test_tensor = test_encoding["input_ids"]

In [None]:
class TestDS:
    """Inference-only dataset: yields each input row with a constant dummy label.

    Args:
        tensors: indexable collection holding one input row per example.
    """

    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        """Number of examples."""
        return len(self.tensors)

    def __getitem__(self, idx):
        """Return ``(input_row, tensor(0))``; the 0 is a placeholder label."""
        return self.tensors[idx], tensor(0)

# Put the inference batches on the same device as the model ("cuda", as used
# for the training loaders and the model) instead of relying on the Learner
# to move them.
test_dl = DataLoader(TestDS(test_tensor), bs=128, device="cuda")

In [None]:
# Run the model over the test loader; the first element of the returned
# tuple holds the model outputs (it is read as test_preds[0] when building
# the submission).
test_preds = learn.get_preds(dl=test_dl)

In [None]:
# Turn positive-class probabilities into 0/1 labels using the threshold
# selected on the validation set.
test_positive_probs = F.softmax(test_preds[0], dim=1)[:, 1]
prediction = (test_positive_probs > min_threshold).int()

# The sample submission supplies the id column (it was previously read twice).
sub = pd.read_csv(dir_path + "sample_submission.csv")
assert len(sub) == len(prediction), "prediction count does not match submission rows"
# Hand pandas a NumPy array rather than a torch tensor.
sub["target"] = prediction.cpu().numpy()
sub.to_csv("submission.csv", index=False)