In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath /kaggle/input, then print each one
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Render full cell text (no column-width truncation) and show at most 10 columns per frame
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 10)

# Quora insincere questions classification

## Objective

* Predict whether a question asked on Quora is sincere or not
* Binary classification

In [None]:
# Read the competition's train and test CSV files into DataFrames (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/quora-insincere-questions-classification/train.csv",
        "../input/quora-insincere-questions-classification/test.csv",
    )
)

In [None]:
# Positional offsets, within the target == 1 subset, of examples chosen by hand
hand_picked_positives = [0, 8, 12, 16, 41]
is_positive = train_data['target'].eq(1)
# Copy so that columns added later do not touch train_data
positive_data = train_data[is_positive].iloc[hand_picked_positives].copy()
positive_data

In [None]:
# Positional offsets, within the target == 0 subset, of examples chosen by hand
hand_picked_negatives = [2, 7, 11, 17, 28]
is_negative = train_data['target'].eq(0)
# Copy so that columns added later do not touch train_data
negative_data = train_data[is_negative].iloc[hand_picked_negatives].copy()
negative_data

In [None]:
# Gather the numbers first, then report split sizes and the per-class row counts
train_size, test_size = len(train_data), len(test_data)
label_counts = train_data['target'].value_counts()
print(f'Size of training set: {train_size}')
print(f'Size of testing set: {test_size}')
print('Distribution of labels in training set:')
print(label_counts)

In [None]:
# Keep only the last 2500 rows of each class (target == 1 first, then target == 0)
# so the working training set is small and balanced.
class_subsets = [
    train_data.loc[train_data['target'] == label].tail(2500)
    for label in (1, 0)
]
train_data = pd.concat(class_subsets, axis=0)
train_data['target'].value_counts()

In [None]:
# The code below is largely adapted from:
# https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python (thank you!)

# Maximum sequence length (in tokens) for each question passed to the tokenizer
max_length = 128
# Pretrained checkpoint to fine-tune: base uncased BERT.
# Other text-classification models: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Fetch the fast tokenizer that belongs to the checkpoint named by `model_name`
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the questions for validation.
# `random_state` makes the shuffle repeatable across kernel restarts, and
# `stratify` keeps the class ratio the same in the train and validation splits.
question_texts = train_data['question_text'].apply(str).tolist()
question_labels = train_data['target'].apply(int).tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    question_texts,
    question_labels,
    train_size=0.8,
    random_state=42,
    stratify=question_labels,
)

In [None]:
# Both splits are tokenized with identical settings: truncate to `max_length`
# tokens and pad the sequences within each call to a common length.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence of values,
    and `labels` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor, then add the label
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Pair each split's encodings with its labels as a torch Dataset
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Load the pretrained checkpoint with a 2-label classification head and place it
# on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a CPU-only session instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Return the accuracy of a set of predictions as ``{'accuracy': value}``.

    `pred` must expose `label_ids` (the true labels) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory (checkpoints are written here)
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=50,                # log every 50 steps
    evaluation_strategy="steps",     # evaluate every `logging_steps` steps
    # Save a checkpoint on the same cadence as evaluation. The library default of
    # save_steps=500 exceeds the length of this run (roughly 250 steps for 4000
    # training questions at batch size 16 on one device), so no checkpoint would
    # ever be written and `load_best_model_at_end` would have nothing to reload.
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,              # cap disk usage; the best checkpoint is still retained
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes (default metric is loss)
    # but you can specify the `metric_for_best_model` argument to switch to accuracy or another metric
)

In [None]:
# Wire together the model, the training arguments defined above, the training and
# validation datasets, and the callback that computes the metrics of interest.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on `train_dataset` using the settings in `training_args`
trainer.train()

In [None]:
# Evaluate the model as it stands after training on the validation split
# (`valid_dataset`); the result includes the metrics from `compute_metrics`.
trainer.evaluate()

In [None]:
def get_prediction_proba(text):
    """Return the class probabilities the model assigns to `text`.

    Args:
        text: a question string (or a list of strings) to classify.

    Returns:
        A tensor with one row per input text and one column per class,
        obtained by applying softmax to the model's logits.
    """
    # Tokenize with the same `max_length` used for the training data, and move the
    # tensors to whichever device the model lives on instead of assuming CUDA.
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    ).to(model.device)
    # Inference only: disable dropout and skip building the autograd graph
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs[0] holds the logits; softmax over the class axis yields probabilities
    return outputs[0].softmax(1)

def get_prediction(text):
    """Return the index of the most probable class for a single `text` as a Python int."""
    class_probabilities = get_prediction_proba(text)
    return class_probabilities.argmax().item()

In [None]:
# Quick check: print the class probabilities for a single made-up question
print(get_prediction_proba("Is this sincere question?"))

In [None]:
# Attach the model's predicted class to every hand-picked example
positive_data['pred'] = positive_data['question_text'].apply(get_prediction)
negative_data['pred'] = negative_data['question_text'].apply(get_prediction)

In [None]:
# Display the hand-picked target == 1 rows, including their `pred` column
positive_data

In [None]:
# Display the hand-picked target == 0 rows, including their `pred` column
negative_data