In [None]:
# Set WANDB_DISABLED=True in the kernel's environment before anything else runs.
# NOTE(review): presumably this keeps the Trainer below from reporting to Weights & Biases — confirm.
%env WANDB_DISABLED=True

In [None]:
import re
import string
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, default_data_collator, Trainer, set_seed

In [None]:
# Fix the random seed (via transformers.set_seed) before the model is loaded or trained.
set_seed(42)

In [None]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint.
# No num_labels is passed, so the model uses whatever the checkpoint config defaults to.
checkpoint = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Read the training split and preview its first rows.
train_csv = '../input/nlp-getting-started/train.csv'
train = pd.read_csv(train_csv)
train.head()

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link deleted.

    A link runs from its prefix up to the next whitespace character; the
    whitespace around it is left in place.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    return re.sub(link_pattern, '', text)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class lists the emoji-bearing ranges explicitly. An earlier
    version used the single span U+24C2..U+1F251, which also covers CJK
    ideographs, Hangul, kana and many other ordinary letters, so it silently
    deleted non-emoji text. Only the symbol ranges inside that span are kept.

    Args:
        text: the string to clean.

    Returns:
        A new string with all matched characters deleted; everything else,
        including surrounding whitespace, is unchanged.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (regional indicator pairs)
        '\U00002702-\U000027B0'  # dingbats
        '\U000024C2'             # circled M
        '\U00002600-\U000026FF'  # miscellaneous symbols (sun, cloud, ...)
        '\U00002B05-\U00002B55'  # arrows, squares, star, heavy circle
        '\U00003030\U0000303D'   # wavy dash, part alternation mark
        '\U00003297\U00003299'   # circled ideographs used as emoji
        '\U0000FE0F'             # variation selector-16 (emoji presentation)
        '\U0001F004\U0001F0CF'   # mahjong red dragon, playing-card joker
        '\U0001F170-\U0001F251'  # enclosed alphanumeric / ideographic supplement
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    """Return ``text`` with HTML tags and character entities deleted.

    Matches ``<...>`` tags (non-greedy, within a single line) and entities of
    the form ``&name;``, ``&#123;`` or ``&#x1f;`` (lower-case only).
    """
    tag_or_entity = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', text)


def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Only ASCII punctuation is affected; other characters pass through as-is.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

In [None]:
# Run the raw tweet text through each cleaning step in order, then lower-case it.
cleaned = train['text']
for step in (remove_URL, remove_emoji, remove_html, remove_punct):
    cleaned = cleaned.apply(step)
train['text_clean'] = cleaned.apply(lambda x: x.lower())
train.head()

In [None]:
max_length = 64
# Encode every cleaned text as a fixed-length list of token ids.
# truncation=True is needed alongside padding="max_length": padding alone only
# lengthens short inputs, so a text that tokenizes to more than max_length ids
# would keep its full length and the rows could not be stacked into one batch.
# NOTE(review): only input_ids is kept, so no attention_mask reaches the model —
# confirm that letting the model attend to padding positions is intended.
train['input_ids'] = train['text_clean'].apply(
    lambda x: tokenizer(
        x,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )['input_ids']
)
# Rename the target column to 'labels' for the training step below.
train.rename(columns={'target': 'labels'}, inplace=True)
train.head()

In [None]:
# Narrow the frame to the token ids and labels; every other column is dropped.
train = train.loc[:, ['input_ids', 'labels']]
train.head()

In [None]:
# Hold out the last 1% of rows (in file order, no shuffling) for validation.
n_valid = int(len(train) * 0.01)
split_at = len(train) - n_valid
# Explicit positional bounds instead of negative slices: when n_valid rounds
# down to 0, `train[:-0]` would be empty and `train[-0:]` the whole frame,
# i.e. the two splits would be swapped.
train_df = train.iloc[:split_at].reset_index(drop=True)
valid_df = train.iloc[split_at:].reset_index(drop=True)

In [None]:
# Wrap both pandas splits as `datasets.Dataset` objects.
train_ds, valid_ds = map(Dataset.from_pandas, (train_df, valid_df))

In [None]:
# Display the training dataset's summary as a quick sanity check.
train_ds

In [None]:
batch_size = 16

# Training configuration. Checkpoints and evaluation both happen once per epoch.
# With 8 accumulation steps, each optimizer step covers 16 * 8 = 128 examples per device.
args = TrainingArguments(
    output_dir='nlp-getting-started',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

# The datasets are already padded to a fixed length, so the default collator is used as-is.
data_collator = default_data_collator
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
)

In [None]:
# Fine-tune the model with the configuration above; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Read the test split and preview its first rows.
test_csv = '../input/nlp-getting-started/test.csv'
test = pd.read_csv(test_csv)
test.head()

In [None]:
# Apply the same cleaning steps, in the same order, that were used for the training text.
cleaned = test['text']
for step in (remove_URL, remove_emoji, remove_html, remove_punct):
    cleaned = cleaned.apply(step)
test['text_clean'] = cleaned.apply(lambda x: x.lower())
test.head()

In [None]:
# Encode every cleaned test text as a fixed-length list of token ids.
# truncation=True is needed alongside padding="max_length": padding alone only
# lengthens short inputs, so a text that tokenizes to more than max_length ids
# would keep its full length and the rows could not be stacked into one batch.
test['input_ids'] = test['text_clean'].apply(
    lambda x: tokenizer(
        x,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )['input_ids']
)
test.head()

In [None]:
# Narrow the frame to the token ids; every other column is dropped.
test = test.loc[:, ['input_ids']]
test.head()

In [None]:
# Wrap the test frame as a `datasets.Dataset` and display its summary.
test_ds = Dataset.from_pandas(test)
test_ds

In [None]:
# Run the trained model over the test set; `outputs.predictions` is consumed when building the submission.
outputs = trainer.predict(test_ds)

In [None]:
# Start from the sample submission so the output has the expected columns.
sample_submission_csv = '../input/nlp-getting-started/sample_submission.csv'
sub = pd.read_csv(sample_submission_csv)
sub.head()

In [None]:
# Pick the highest-scoring class for each row.
# assumes predictions is a 2-D (rows, classes) array in the same row order as `sub` — TODO confirm
logits = outputs.predictions
sub['target'] = logits.argmax(axis=1)
sub.head()

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
sub.to_csv('submission.csv', index=False)