In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sumbission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [2]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [3]:
import re, string

class PreprocessText:
    def __init__(self, text):
        self.text = text 

    def remove_url(self, text):
        url = re.compile(r'https?://\S+|www\.\S+')
        return url.sub(r'', text)

    def remove_emoji(self, text):
        emoji_pattern = re.compile(
            '['
            u'\U0001F600-\U0001F64F'  # emoticons
            u'\U0001F300-\U0001F5FF'  # symbols & pictographs
            u'\U0001F680-\U0001F6FF'  # transport & map symbols
            u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
            u'\U00002702-\U000027B0'
            u'\U000024C2-\U0001F251'
            ']+',
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def remove_html(self, text):
        html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        return re.sub(html, '', text)
    
    

    def clean_text(self):
        '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.'''
        text = str(self.text).lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        text = self.remove_url(text)
        text = self.remove_emoji(text)
        text = self.remove_html(text)
        return text


In [4]:
def preprocess(df):
    df = df.dropna(how="any", axis=1)
    df['clean_text'] = df['text'].apply(lambda x: PreprocessText(x).clean_text())
    df['text_len'] = df['text'].apply(lambda x: len(x.split(' ')))
    return df 

In [5]:
train_df = preprocess(train)
test_df = preprocess(test)
train_df.head() 
# test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,text,target,clean_text,text_len
0,1,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,13
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,7
2,5,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,22
3,6,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...,9
4,7,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,17


In [6]:
from sklearn.model_selection import train_test_split

X = train_df.clean_text 
y = train_df.target

x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_val), len(y_val))




5709 5709
1904 1904


In [7]:
from transformers import AutoTokenizer

checkpoint = "vinai/bertweet-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, normalization=True)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
x_train_input = dict(tokenizer(x_train.to_list(), padding=True, truncation=True, return_tensors="tf"))
x_val_input = dict(tokenizer(x_val.to_list(), padding=True, truncation=True, return_tensors="tf"))

x_test_input = dict(tokenizer(test_df.clean_text.to_list(), padding=True, truncation=True, return_tensors="tf"))

In [9]:
tokenizer.pad_token

'<pad>'

In [10]:
from tensorflow.keras.metrics import Metric
import numpy as np


class F1_metric(Metric):
    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        # Initialize our metric by initializing the two metrics it's based on:
        # Precision and Recall
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Update our metric by updating the two metrics it's based on
        self.precision.update_state(y_true, tf.math.argmax(y_pred, axis=0), sample_weight) # Ensure to get preds not logits 
        self.recall.update_state(y_true,tf.math.argmax(y_pred, axis=0), sample_weight)

    def reset_state(self):
        self.precision.reset_state()
        self.recall.reset_state()

    def result(self):
        # To get the F1 result, we compute the harmonic mean of the current
        # precision and recall
        return 2 / ((1 / self.precision.result()) + (1 / self.recall.result())) 


In [11]:
# For tpu: 
import tensorflow as tf
import os

tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [12]:
import numpy as np
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import TFAutoModelForSequenceClassification

batch_size = 16
# batch_size = 16 * strategy.num_replicas_in_sync

num_epochs = 5
num_train_steps = (len(x_train_input['input_ids']) // batch_size) * num_epochs

# Define lr_scheduler
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
    )

# Define optimizer with lr_scheduler
opt = Adam(learning_rate=lr_scheduler)
loss = SparseCategoricalCrossentropy(from_logits=True)


with strategy.scope():
  model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, return_dict=False) # TFAutoModel.from_pretrained(checkpoint)
  model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])


model.fit(
    x_train_input,
    np.array(y_train), 
    validation_data=(
        x_val_input,
        np.array(y_val),
    ),
    batch_size=batch_size, 
    epochs=num_epochs
)

Downloading:   0%|          | 0.00/740M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa2602423d0>

In [13]:
ch = model(x_test_input, return_dict=True)

In [14]:
targets = [np.argmax(x) for x in ch['logits']]
# targets
ids = test.id.to_list()

In [15]:
submission = pd.DataFrame({'id': ids, 'target': targets})
submission.to_csv('submission.csv', index=False)