In [None]:
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.text.all import *
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset

In [None]:
# Competition input directory. The trailing slash matters: file names are
# appended to it by plain string concatenation.
path = "/kaggle/input/quora-insincere-questions-classification/"
train_df, test_df = (pd.read_csv(path + fname) for fname in ("train.csv", "test.csv"))

In [None]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

In [None]:
# Fraction of training rows carrying each target value.
train_df["target"].value_counts().div(len(train_df))

In [None]:
# Every question from both splits, stacked row-wise (train first, then test).
all_text = pd.concat([frame["question_text"] for frame in (train_df, test_df)], axis=0)

In [None]:
def sample_text(n=10):
    """Print `n` randomly drawn questions from `all_text` on one line.

    The questions are joined with " | " so that several can be skimmed at once.
    Nothing is returned.
    """
    print(" | ".join(all_text.sample(n)))

In [None]:
# Eyeball a random handful of questions (default sample size of 10).
sample_text()

In [None]:
# Histogram of whitespace-separated word counts per training question.
# The trailing semicolon suppresses the Axes repr in the cell output.
(
    train_df["question_text"]
    .map(lambda question: len(question.split()))
    .plot.hist()
);

In [None]:
# Tokenizer matching the 'bert-base-cased' checkpoint that is loaded for the model below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
class QuestionDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, target)`` pairs for question classification.

    Questions are tokenized lazily, one at a time, when an item is requested.
    Every example is padded or truncated to exactly ``max_length`` token ids.

    Args:
        X: pandas Series of question strings.
        y: pandas Series of targets, row-aligned with ``X``.
        tokenizer: callable accepting ``(text, padding=, truncation=, max_length=,
            return_tensors=)`` and returning a mapping with an ``"input_ids"`` entry
            whose first row is the encoded text.
        max_length: number of token ids per example. Defaults to 30, the value
            that was previously hard-coded.
    """

    def __init__(self, X, y, tokenizer, max_length=30):
        # Discard the incoming index so rows are addressable by position 0..len-1.
        self.text = X.reset_index(drop=True)
        self.targets = y.reset_index(drop=True)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(input_ids, target)`` for the example at position ``idx``.

        Positional access (``.iloc``) is used on purpose: an out-of-range index
        raises ``IndexError``, which is what Python's sequence-iteration protocol
        expects, rather than the ``KeyError`` a label lookup would raise.
        """
        text = self.text.iloc[idx]
        targ = self.targets.iloc[idx]

        encoded = self.tok(text,
                           padding='max_length',
                           truncation=True,
                           max_length=self.max_length,
                           return_tensors="pt")
        # The tokenizer returns a batch of one; take its only row.
        return encoded["input_ids"][0], tensor(targ)

In [None]:
# Hold out 1% of the labelled data for validation, stratified on the target so
# both splits keep the same class proportions. A fixed random_state makes the
# split (and therefore the validation scores) reproducible across kernel restarts.
df = train_df
X_train, X_valid, y_train, y_valid = train_test_split(
    df["question_text"], df["target"],
    stratify=df["target"],
    test_size=0.01,
    random_state=42,
)

train_ds = QuestionDataset(X_train, y_train, tokenizer)
valid_ds = QuestionDataset(X_valid, y_valid, tokenizer)

# shuffle=True so each epoch visits the training examples in a new order;
# the validation loader keeps a fixed order.
train_dl = DataLoader(train_ds, bs=256, shuffle=True)
valid_dl = DataLoader(valid_ds, bs=512)
dls = DataLoaders(train_dl, valid_dl).to("cuda")

In [None]:
# Pretrained BERT with a sequence-classification head, switched to training mode.
bert = AutoModelForSequenceClassification.from_pretrained('bert-base-cased')
bert.train()

# Replacement head: 768 -> 1024 -> 2 with ReLU and 50% dropout in between.
classifier = nn.Sequential(
    nn.Linear(768, 1024), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(1024, 2),
)

# Swap the stock classification head for the custom one.
bert.classifier = classifier

class BertClassifier(Module):
    """Adapter exposing a HF sequence-classification model as ``input_ids -> logits``.

    The datasets in this notebook pad every example to a fixed length and yield
    only the token ids, so the attention mask is rebuilt here from the padding
    id. Without it the model would attend to padding positions as if they were
    real tokens.

    Args:
        bert: model whose call returns an object with a ``.logits`` attribute
            and which exposes ``config.pad_token_id``.
    """

    def __init__(self, bert):
        self.bert = bert

    def forward(self, x):
        """Return the logits for a batch of padded token ids ``x``."""
        pad_id = getattr(self.bert.config, "pad_token_id", None)
        # 1 for real tokens, 0 for padding; None falls back to the model default.
        attention_mask = None if pad_id is None else (x != pad_id).long()
        x = self.bert(x, attention_mask=attention_mask)
        return x.logits

# Wrap the HF model so that calling it yields plain logits, then move it to the GPU.
model = BertClassifier(bert).to("cuda")

In [None]:
# Number of training rows per target value (0 and 1), and their total.
n_0, n_1 = ((train_df["target"] == label).sum() for label in (0, 1))
n = n_0 + n_1

In [None]:
# Per-class loss weights n / (n + n_c): the class with fewer rows gets the larger weight.
class_weights = tensor([n / (n + n_c) for n_c in (n_0, n_1)]).to('cuda')

learn = Learner(
    dls,
    model,
    loss_func=nn.CrossEntropyLoss(weight=class_weights),
    metrics=[accuracy, F1Score()],
)
# Train in mixed precision.
learn = learn.to_fp16()

# Learning-rate range test; kept as the last expression so its output is displayed.
learn.lr_find()

In [None]:
# Fine-tune for 2 epochs on a one-cycle schedule peaking at a learning rate of 5e-5.
learn.fit_one_cycle(2, lr_max=5e-5)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Model outputs and true targets; with no `dl` argument the Learner picks the data.
# NOTE(review): presumably this is the validation split — confirm.
# Softmax is applied to `preds` in the threshold sweep that follows.
preds, targs = learn.get_preds()

In [None]:
# Sweep 50 decision thresholds in [0.3, 0.7] and print the F1 score at each.
thresholds = np.linspace(0.3, 0.7, 50)

# The positive-class probability does not depend on the threshold: compute it once.
positive_probs = F.softmax(preds, dim=1)[:, 1]

for threshold in thresholds:
    f1 = f1_score(targs, positive_probs > threshold)
    print(f"threshold:{threshold:.4f} - f1:{f1:.4f}")

In [None]:
# Tokenize the whole test set in one call, using the same padding/truncation
# settings (30 ids per question) as the training dataset; keep only the ids.
test_tensor = tokenizer(
    test_df["question_text"].tolist(),
    max_length=30,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)["input_ids"]

In [None]:
class TestDS:
    """Dataset over pre-tokenized inputs that pairs each row with a dummy label.

    Items have the same ``(input, target)`` layout as the training dataset; the
    target is always ``tensor(0)`` because no labels exist for these rows.
    """

    def __init__(self, tensors):
        # Indexable collection of encoded examples (one example per row).
        self.tensors = tensors

    def __len__(self):
        """Number of examples."""
        return len(self.tensors)

    def __getitem__(self, idx):
        """Return ``(tensors[idx], tensor(0))``."""
        placeholder = tensor(0)
        return self.tensors[idx], placeholder

# Batch the pre-tokenized test ids, 128 rows at a time, for inference.
test_dl = DataLoader(TestDS(test_tensor), bs=128)

In [None]:
# Run inference over the test loader. Element 0 of the result holds the model
# outputs; softmax is applied to it when the submission is built.
test_preds = learn.get_preds(dl=test_dl)

In [None]:
# Label a question as positive when its class-1 probability exceeds the threshold.
# NOTE(review): 0.48 is presumably taken from the validation threshold sweep — confirm.
prediction = (F.softmax(test_preds[0], dim=1)[:, 1] > 0.48).int()

sub = pd.read_csv(path + "sample_submission.csv")
# Assumes the rows of sample_submission.csv are in the same order as test.csv — TODO confirm.
if len(sub) != len(prediction):
    raise ValueError(
        f"submission has {len(sub)} rows but {len(prediction)} predictions were produced"
    )

# Hand pandas a NumPy array rather than a torch tensor so the column is a plain
# integer column and the CSV contains bare 0/1 values.
sub["prediction"] = prediction.cpu().numpy()
sub.to_csv("submission.csv", index=False)