In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# (The stock snippet that lists all files under the input directory is not part of this cell; only its `import os` remains below.)

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from torch import nn
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Read the two competition CSVs: the training split and the test split.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Wrap both pandas frames as Hugging Face `Dataset` objects (train first, then test).
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))

In [None]:
# Hold out 20% of the training rows for evaluation. The fixed seed makes the
# shuffle, and therefore the held-out rows, identical on every run.
# NOTE: this rebinds `train_dataset` to the split result, whose "train" and
# "test" entries are read further down.
train_dataset = train_dataset.train_test_split(test_size=0.20, seed=42)

In [None]:
# Display the split object as the cell output to sanity-check the result.
train_dataset

In [None]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [None]:
def tokenize_function(examples, max_length=None):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry holding the strings to encode.
        max_length: Length every sequence is padded/truncated to. The default
            `None` is forwarded unchanged, which leaves the choice of limit to
            the tokenizer (the behaviour before this parameter existed). Pass
            a smaller value to shrink the padded tensors.

    Returns:
        Whatever `tokenizer` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Apply the tokenizer in batches: first to the train/eval split, then to the test set.
tokenized_train = train_dataset.map(function=tokenize_function, batched=True)
tokenized_test = test_dataset.map(function=tokenize_function, batched=True)

In [None]:
# Sequence-classification model built on "bert-base-uncased" with a 2-label head.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=2,
)

In [None]:
# Drop the raw input columns, expose "target" under the name "labels", and
# have the datasets hand back torch tensors.
tokenized_train = (
    tokenized_train
    .remove_columns(["text", "keyword", "location", "id"])
    .rename_column("target", "labels")
)
tokenized_train.set_format(type="torch")

In [None]:
# Pull the "train" and "test" parts out of the tokenized split.
full_train_dataset, full_eval_dataset = tokenized_train["train"], tokenized_train["test"]

In [None]:
# Same column clean-up for the test set (nothing is renamed here), then
# switch its output format to torch tensors.
tokenized_test = tokenized_test.remove_columns(
    column_names=["text", "keyword", "location", "id"],
)
tokenized_test.set_format(type="torch")

In [None]:
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases, so use PyTorch's own implementation.
from torch.optim import AdamW

# eps and weight_decay are spelled out to match what the transformers class
# defaulted to; torch.optim.AdamW's own defaults for these two differ.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from torch.utils.data import DataLoader

# Training batches are drawn in a shuffled order.
train_dataloader = DataLoader(full_train_dataset, shuffle=True, batch_size=8)
# Evaluation only aggregates a metric over all batches, so shuffling adds
# nothing; keep the order fixed, as the test loader does.
eval_dataloader = DataLoader(full_eval_dataset, shuffle=False, batch_size=8)

In [None]:
from transformers import get_scheduler

num_epochs = 4
# The scheduler is stepped once per training batch, so the schedule must span
# every batch of every epoch.
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
from tqdm.auto import tqdm

# One tick per optimizer step, across all epochs.
progress_bar = tqdm(total=num_training_steps)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch includes "labels", and the loss is read
        # straight off the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
# `datasets.load_metric` is deprecated and absent from recent `datasets`
# releases; accuracy is simple enough to tally directly with torch.
num_correct = 0
num_examples = 0

model.eval()
for batch in tqdm(eval_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last axis.
    predictions = torch.argmax(outputs.logits, dim=-1)
    num_correct += (predictions == batch["labels"]).sum().item()
    num_examples += batch["labels"].numel()

# Cell output keeps the shape the metric object reported: a dict with an
# "accuracy" entry. An empty evaluation set yields NaN instead of dividing by zero.
{"accuracy": num_correct / num_examples if num_examples else float("nan")}

In [None]:
# Keep the original row order (shuffle=False) so predictions line up with the
# test ids. Batches of 8, the size the other loaders use, avoid running one
# forward pass per row; the prediction loop flattens batches of any size.
test_dataloader = DataLoader(tokenized_test, shuffle=False, batch_size=8)

In [None]:
# Accumulator for the predicted class of every test row, filled by the inference loop.
all_preds = []

In [None]:
# Start from an empty list inside this cell, so re-running it cannot append a
# second copy of the predictions to what an earlier run left behind.
all_preds = []

model.eval()
for batch in tqdm(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class per row, moved to the CPU and converted to plain Python ints.
    predictions = torch.argmax(logits, dim=-1).cpu().tolist()
    # extend() appends in place rather than rebuilding the whole list each batch.
    all_preds.extend(predictions)

In [None]:
# Pair each test id with its predicted target, then renumber the rows from 0.
ids = test_df["id"]
submission_df = pd.DataFrame(data={"id": ids, "target": all_preds}).reset_index(drop=True)

In [None]:
# Write the submission file to the working directory, without the index column.
submission_df.to_csv(path_or_buf="submission.csv", index=False)