In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
import tensorflow_addons as tfa
from transformers import TFAutoModel, AutoTokenizer

In [None]:
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

# Single shared NLTK TweetTokenizer instance; normalizeTweet uses it to split raw tweets into tokens.
tweet_tokenizer = TweetTokenizer()

def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Rules, applied in order:
      * the typographic apostrophe ``’`` becomes ``'`` and the ellipsis
        character ``…`` becomes ``...``;
      * tokens starting with ``@`` become the placeholder ``@USER``;
      * tokens starting with ``http`` or ``www`` (case-insensitive) become
        the placeholder ``HTTPURL``;
      * any other one-character token is passed through ``demojize`` so an
        emoji is replaced by its textual alias;
      * everything else is returned unchanged.

    Args:
        token: One token string.

    Returns:
        The normalized token string.
    """
    # Both typographic characters are exactly one character long, so they
    # have to be handled before the one-character emoji branch; otherwise
    # these two rules can never fire.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "HTTPURL"

    if len(token) == 1:
        return demojize(token)

    return token

def normalizeTweet(tweet):
    """Return a whitespace-normalized, token-normalized version of a tweet.

    Steps:
      1. Replace the typographic apostrophe and ellipsis characters with
         their ASCII equivalents.
      2. Tokenize with the module-level ``tweet_tokenizer`` and normalize
         every token with ``normalizeToken``; tokens are re-joined with
         single spaces.
      3. Apply a fixed, ordered list of literal rewrites that re-space
         contractions and glue "a.m." / "p.m." back together.
      4. Apply three regex rewrites that re-join numbers split around
         ``,``, ``/`` and ``-``.
      5. Collapse every run of whitespace to a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet as a single string.
    """
    ascii_tweet = tweet.replace("’", "'").replace("…", "...")
    text = " ".join(normalizeToken(piece) for piece in tweet_tokenizer.tokenize(ascii_tweet))

    # Order matters: later rewrites operate on the output of earlier ones.
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for needle, replacement in literal_rewrites:
        text = text.replace(needle, replacement)

    # Re-join numeric fragments that tokenization split apart.
    numeric_rewrites = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in numeric_rewrites:
        text = re.sub(pattern, replacement, text)

    # split() with no argument drops leading/trailing and repeated whitespace.
    return " ".join(text.split())

In [None]:
# Tweets are normalized on load. Note: the normalized text is used for the EDA and is also what gets encoded for the model.
def load_train_set(path="/kaggle/input/nlp-getting-started/train.csv"):
    """Load the training split and normalize its tweet text.

    Args:
        path: Location of the training CSV. Defaults to the Kaggle input
            path so existing zero-argument calls keep working; pass another
            path to run outside Kaggle.

    Returns:
        A DataFrame with the columns ``text`` (passed through
        ``normalizeTweet``) and ``target``.
    """
    df = pd.read_csv(path)[["text", "target"]]
    df["text"] = df["text"].apply(normalizeTweet)
    return df

def load_test_set(path="/kaggle/input/nlp-getting-started/test.csv"):
    """Load the test split and normalize its tweet text.

    Args:
        path: Location of the test CSV. Defaults to the Kaggle input path so
            existing zero-argument calls keep working; pass another path to
            run outside Kaggle.

    Returns:
        A DataFrame with the columns ``id`` and ``text`` (passed through
        ``normalizeTweet``).
    """
    df = pd.read_csv(path)[["id", "text"]]
    df["text"] = df["text"].apply(normalizeTweet)
    return df

In [None]:
# Load both splits; the loaders return frames whose "text" column has already been normalized.
train = load_train_set()
test = load_test_set()

# Column dtypes and non-null counts for each split, separated by a blank line.
train.info()
print()
test.info()

In [None]:
# Class balance of the training labels: absolute counts first, then proportions.
print(train['target'].value_counts())
print()
print(train['target'].value_counts(normalize=True))

In [None]:
# Number of whitespace-separated tokens in each normalized training tweet.
sequences_length = [len(tweet.split()) for tweet in train["text"]]
longest_tweet = max(sequences_length)

plt.figure(figsize=(18,5))
plt.title('Tweet length histogram')
# One unit-wide bin per length, so the edges must run from 0 to longest_tweet + 1.
# matplotlib closes the right edge of the last bin only; if the edges stopped at
# longest_tweet, the two largest lengths would be counted in the same bin.
plt.hist(sequences_length, bins=range(longest_tweet + 2))
plt.xlabel('Number of words')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
disaster_tweets = train[train['target']==1]['text']
non_disaster_tweets = train[train['target']==0]['text']

# Build the stop-word collection once. Calling stopwords.words("english") inside
# the comprehension re-evaluates it (and scans the resulting list) for every
# single word of every tweet.
english_stopwords = set(stopwords.words("english"))


def _content_word_freq(tweets, stop_words):
    """Count lower-cased words longer than two characters that are not stop words.

    Args:
        tweets: Iterable of tweet strings.
        stop_words: Container of words to ignore.

    Returns:
        An ``nltk.FreqDist`` mapping each kept word to its number of occurrences.
    """
    return nltk.FreqDist(
        word
        for tweet in tweets
        for word in tweet.lower().split()
        if len(word) > 2 and word not in stop_words
    )


freq_dist_disaster_tweets = _content_word_freq(disaster_tweets, english_stopwords)
freq_dist_non_disaster_tweets = _content_word_freq(non_disaster_tweets, english_stopwords)

# For the 200 most frequent disaster words, put the disaster and non-disaster
# counts side by side (FreqDist returns 0 for words it has never seen).
results = {
    'word' : [],
    'disaster_count' : [],
    'non_disaster_count' : []
}
for word, disaster_count in freq_dist_disaster_tweets.most_common(200):
    results['word'].append(word)
    results['disaster_count'].append(disaster_count)
    results['non_disaster_count'].append(freq_dist_non_disaster_tweets[word])
df = pd.DataFrame(results)
df.head(30)

In [None]:
# Display the 50 most frequent (word, count) pairs among the disaster tweets.
freq_dist_disaster_tweets.most_common(50)

In [None]:
# One frequency plot per class. The two plots differ only in title and data, so
# they are driven from a single table instead of copy-pasted statements (the
# titles are plain strings: they contain no placeholders, so no f-prefix).
top_word_plots = (
    ('Frequency Distribution (Disaster, Top 50 words)', freq_dist_disaster_tweets),
    ('Frequency Distribution (Non Disaster, Top 50 words)', freq_dist_non_disaster_tweets),
)
for plot_title, freq_dist in top_word_plots:
    plt.figure(figsize=(25,5))
    plt.title(plot_title)
    freq_dist.plot(50, marker='|', markersize=20)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

# Length, in tokenizer ids, of every normalized training tweet (capped at 512).
tweets_length = []
for tweet in train["text"]:
    token_ids = tokenizer.encode(tweet, max_length=512, truncation=True)
    tweets_length.append(len(token_ids))

mean_length = np.mean(tweets_length)
longest_length = max(tweets_length)
print("Average length: {:.1f}".format(mean_length))
print("Max length: {}".format(longest_length))

# Histogram of the token counts.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm
# the installed version before swapping it for a newer plotting function.
plt.figure(figsize=(10,5))
ax = sns.distplot(tweets_length, bins=150, kde=False, hist_kws={"alpha": 0.8})
ax.set(xlabel='Number of tokens')

# Finalize the plot
sns.despine(bottom=True)
plt.tight_layout(h_pad=2)

In [None]:
MAX_LENGTH = 50 # in terms of generated tokens (not words)

# Vectorized counts of tweets at or below / above the limit (a Python-level
# sum() over a numpy boolean array iterates element by element).
tweets_length_array = np.asarray(tweets_length)
short_tweets = int((tweets_length_array <= MAX_LENGTH).sum())
long_tweets = int((tweets_length_array > MAX_LENGTH).sum())

# The data are tweets, not reviews.
print("{} tweets with LEN > {} ({:.2f} % of total data)".format(
    long_tweets,
    MAX_LENGTH,
    100 * long_tweets / len(train)
))

In [None]:
def encode_tweets(tokenizer, tweets, max_len):
    """Encode tweets into fixed-length id, attention-mask and segment arrays.

    Every row is laid out as ``<s> ids... </s>`` followed by padding. Tweets
    whose ids do not fit are truncated so that the closing ``</s>`` is kept.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a list of token ids.
        tweets: Sequence of tweet strings (its length must be known).
        max_len: Row length of the produced arrays; must be at least 2 so the
            two special tokens fit.

    Returns:
        A tuple ``(tokens, masks, segs)`` of ``int32`` arrays of shape
        ``(len(tweets), max_len)``: token ids padded with 1, an attention
        mask with 1 on real tokens and 0 on padding, and all-zero segment ids.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold the start and end tokens")

    # Ids hard-coded by this notebook; they are expected to match <s>, <pad>
    # and </s> in the RoBERTa-style vocabulary of the BERTweet tokenizer.
    bos_id, pad_id, eos_id = 0, 1, 2

    nb_tweets = len(tweets)
    tokens = np.full((nb_tweets, max_len), pad_id, dtype='int32')
    masks = np.zeros((nb_tweets, max_len), dtype='int32')
    segs = np.zeros((nb_tweets, max_len), dtype='int32')

    for k, tweet in enumerate(tweets):
        # The special tokens are added manually below, so the tokenizer must
        # not add them as well; otherwise every row would start with two <s>
        # and end with two </s>.
        enc = list(tokenizer.encode(tweet, add_special_tokens=False))
        # Slicing is a no-op for short tweets and truncates long ones, leaving
        # room for the two special tokens in both cases.
        row = [bos_id] + enc[:max_len - 2] + [eos_id]
        tokens[k, :len(row)] = row
        masks[k, :len(row)] = 1
    return tokens, masks, segs


def build_model(max_len, model_name="vinai/bertweet-base", learning_rate=1e-5):
    """Build and compile a binary classifier on top of a pretrained transformer.

    The network takes three ``int32`` inputs of length ``max_len`` (token ids,
    attention mask, token type ids), runs them through the pretrained encoder
    and feeds the hidden state at the first sequence position to a single
    sigmoid unit.

    Args:
        max_len: Length of each input sequence.
        model_name: Checkpoint passed to ``TFAutoModel.from_pretrained``.
            Defaults to the checkpoint the notebook used originally.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model with binary cross-entropy loss and an
        accuracy metric.
    """
    ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    tok = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    bertweet = TFAutoModel.from_pretrained(model_name)
    # Take the first output (per-token hidden states) by index. Tuple-unpacking
    # into exactly two names fails when the model returns a ModelOutput object
    # (iterating it yields its keys) or more than two outputs; integer indexing
    # works for both the tuple and the ModelOutput return styles.
    x = bertweet(ids, attention_mask=att, token_type_ids=tok)[0]

    # Classify from the hidden state at the first sequence position.
    out = tf.keras.layers.Dense(1, activation='sigmoid')(x[:, 0, :])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=out)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

    
# Instantiate the classifier for MAX_LENGTH-long inputs and print its layer summary.
model = build_model(MAX_LENGTH)
model.summary()

In [None]:
# Encode the normalized training tweets into fixed-length id / mask / segment arrays.
train_tokens, train_masks, train_segs = encode_tweets(tokenizer,train["text"].to_list(), MAX_LENGTH)
# Labels come straight from the "target" column.
train_labels = train["target"]

In [None]:
train_labels = train['target']

# Stop once the validation loss has not improved for 3 epochs and roll the
# model back to the weights of the best epoch.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Fine-tune for up to 5 epochs, holding out 20% of the data for validation.
train_history = model.fit(
    x=[train_tokens, train_masks, train_segs],
    y=train_labels,
    batch_size=16,
    epochs=5,
    verbose=1,
    callbacks=[es],
    validation_split=0.2,
)

In [None]:
# Encode the test tweets exactly like the training tweets.
test_tokens, test_masks, test_segs = encode_tweets(tokenizer,test["text"].to_list(), MAX_LENGTH)

# The model ends in a single sigmoid unit, so predict() yields one column of
# probabilities. Round to 0/1 and flatten to a 1-D vector before storing it as
# a DataFrame column instead of relying on pandas to accept an (n, 1) matrix.
test_probabilities = model.predict([test_tokens, test_masks, test_segs])
test["target"] = test_probabilities.round().astype(int).ravel()

# Kaggle submission file: one row per test id with its predicted label.
submission = test[["id", "target"]]
submission.to_csv("submission.csv",index=False)