In [None]:
pip install evaluate

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


from datasets import Dataset, DatasetDict
import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
from transformers import AutoTokenizer,DataCollatorWithPadding,AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from huggingface_hub import notebook_login

# Interactive login: paste the access token into the prompt when asked.
# Never store a token in the notebook itself (not even in a comment) -- anyone
# who can read the notebook can use it. A token that has been exposed this way
# should be revoked and regenerated.
notebook_login()

In [None]:
# Read the training and test CSV files into two DataFrames, in that order.
df_full_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [None]:

# Training frame: drop the columns that are not used further and rename the
# label column from "target" to "labels".
# errors='ignore' (and rename's tolerance of missing keys) keeps this cell
# safe to re-run after the columns have already been removed.
df_full_train = (
    df_full_train
    .drop(columns=['id', 'keyword', 'location'], errors='ignore')
    .rename(columns={"target": "labels"})
)

# Test frame: 'id' must be retained -- the submission cell at the end of the
# notebook selects df_test[['id', ...]] and would raise KeyError without it.
df_test = df_test.drop(columns=['keyword', 'location'], errors='ignore')

In [None]:
# Wrap the training DataFrame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df_full_train)

# Hold out 20% of the rows (exposed under the "test" key) for evaluation.
# The seed is fixed so the same rows are held out on every run, which keeps
# evaluation scores comparable across Restart & Run All.
dataset_full = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Used with ``Dataset.map(..., batched=True)``, so ``examples["text"]`` is
    passed to the tokenizer as a whole batch in one call.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Apply the tokenizer to every split of the train/test DatasetDict.
tokenized_dataset = dataset_full.map(preprocess_function, batched=True)


In [None]:
# Batch collator built around the same tokenizer that encoded the dataset;
# it is handed to the Trainer to assemble (padded) batches.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Metric handle, named for what it actually loads: the "f1" metric.
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the F1 metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced to class ids with an argmax over
            axis 1; ``labels`` are the reference class ids.

    Returns:
        The result of ``f1_metric.compute`` for the predicted versus
        reference class ids.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return f1_metric.compute(predictions=predicted_ids, references=labels)

## Train

In [None]:
# Class id -> human-readable label name.
id2label = {
    0: "not disaster",
    1: "disaster",
}
# Inverse mapping, derived from id2label so the two can never disagree.
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [None]:

# Load the base checkpoint as a two-class sequence classifier, attaching the
# label-name mappings to the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=2,
    # Do not fail if some checkpoint weights differ in shape from the model.
    ignore_mismatched_sizes=True,
)

In [None]:
training_args = TrainingArguments(
    # Output location and Hub upload.
    output_dir="distilbert-base-uncased-disaster",
    push_to_hub=True,
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    # NOTE(review): no metric_for_best_model is given, so "best" is whatever
    # the Trainer default selects (presumably eval loss, not F1) -- confirm.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the tokenized splits and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The "test" split produced by train_test_split is used for evaluation.
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning with the configuration above.
# If a logging/tracking integration prompts for an API key, type it into the
# prompt -- do not paste keys into notebook cells or comments, where they are
# visible to everyone who can read the notebook.
trainer.train()

In [None]:
# Publish the trainer's results through Trainer.push_to_hub().
trainer.push_to_hub()

In [None]:
from transformers import pipeline

# Quick sanity check: run a single example sentence through the checkpoint
# identified by the model id below.
classifier = pipeline(
    "sentiment-analysis",
    model="jiajiacen/distilbert-base-uncased-disaster",
)
text = "Forest fire near La Ronge Sask. Canada"
result = classifier(text)

In [None]:
# Display the label of the first (only) prediction for the sample sentence.
top_prediction = result[0]
top_prediction['label']

## Inference

In [None]:

# "sentiment-analysis" is the pipeline's alias for generic text classification.
classifier = pipeline("sentiment-analysis", model="jiajiacen/distilbert-base-uncased-disaster")


def _label_to_target(label):
    """Map the model's string label to the 0/1 target: 1 only for 'disaster'."""
    return 1 if label == 'disaster' else 0


def classify_text(text):
    """Classify a single piece of text.

    Args:
        text: The string to classify.

    Returns:
        1 if the classifier's label for ``text`` is 'disaster', otherwise 0.
    """
    result = classifier(text)
    return _label_to_target(result[0]['label'])


# Pass the whole column to the pipeline in one call instead of invoking it
# once per row through DataFrame.apply: given a list, the pipeline returns one
# prediction dict per input (in order) and can batch the forward passes.
test_predictions = classifier(df_test['text'].tolist(), batch_size=32)
df_test['disaster_prediction'] = [
    _label_to_target(prediction['label']) for prediction in test_predictions
]

In [None]:
# The submission holds one row per test tweet: its 'id' and the predicted 'target'.
if 'id' in df_test.columns:
    test_ids = df_test['id']
else:
    # 'id' may have been dropped from df_test during preprocessing. No rows are
    # removed or reordered along the way, so the ids read back from the raw
    # CSV line up with df_test position by position.
    test_ids = pd.read_csv(
        '/kaggle/input/nlp-getting-started/test.csv', usecols=['id']
    )['id']

# Build a fresh frame rather than renaming a slice of df_test in place, which
# triggers pandas' SettingWithCopyWarning.
df_submission = pd.DataFrame({
    'id': test_ids.to_numpy(),
    'target': df_test['disaster_prediction'].to_numpy(),
})

# index=False: otherwise pandas writes the row index as an extra unnamed
# first column, so the file would no longer be just id,target.
# NOTE(review): the file name mentions roberta-large-mnli although the model
# used above is a DistilBERT checkpoint -- kept as-is; rename if desired.
df_submission.to_csv('fine_tune_roberta-large-mnli.csv', index=False)