## Data Loading

In [124]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [125]:
print(train["text"][:50])

0     Our Deeds are the Reason of this #earthquake M...
1                Forest fire near La Ronge Sask. Canada
2     All residents asked to 'shelter in place' are ...
3     13,000 people receive #wildfires evacuation or...
4     Just got sent this photo from Ruby #Alaska as ...
5     #RockyFire Update => California Hwy. 20 closed...
6     #flood #disaster Heavy rain causes flash flood...
7     I'm on top of the hill and I can see a fire in...
8     There's an emergency evacuation happening now ...
9     I'm afraid that the tornado is coming to our a...
10          Three people died from the heat wave so far
11    Haha South Tampa is getting flooded hah- WAIT ...
12    #raining #flooding #Florida #TampaBay #Tampa 1...
13              #Flood in Bago Myanmar #We arrived Bago
14    Damage to school bus on 80 in multi car crash ...
15                                       What's up man?
16                                        I love fruits
17                                     Summer is

In [126]:
train.columns

Index(['Unnamed: 0', 'keyword', 'location', 'text', 'target'], dtype='object')

In [127]:
test.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

## Preprocessing 

In [128]:
# https://www.kaggle.com/code/wrrosa/keras-bert-using-tfhub-modified-train-data#About-this-kernel

# There are targets which are wrong
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train[train['Unnamed: 0'].isin(ids_with_target_error)]


Unnamed: 0.1,Unnamed: 0,keyword,location,text,target
229,328,annihilated,,Ready to get annihilated for the BUCS game,1
301,443,apocalypse,,Short Reading\n\nApocalypse 21:1023 \n\nIn the...,1
356,513,army,Studio,But if you build an army of 100 dogs and their...,1
1822,2619,crashed,,My iPod crashed..... \n#WeLoveYouLouis \n#MTVH...,1
2536,3640,desolation,"Quilmes , Arg",This desperation dislocation\nSeparation conde...,1
2715,3900,devastated,PG Chillin!,Man Currensy really be talkin that talk... I'd...,1
3024,4342,dust%20storm,chicago,Going to a fest? Bring swimming goggles for th...,1
4068,5781,forest%20fires,,Campsite recommendations \nToilets /shower \nP...,1
4609,6552,injury,Saint Paul,My prediction for the Vikings game this Sunday...,1
4611,6554,injury,,Dante Exum's knee injury could stem Jazz's hop...,1


In [129]:
# Correcting the target values
train.loc[train['Unnamed: 0'].isin(ids_with_target_error),'target'] = 0

In [130]:
import os
import re
import string
import nltk
from nltk.corpus import stopwords

# Set the NLTK_DATA environment variable to your dir
nltk_data_dir = '/home/stefan/nltk_data' 
os.environ['NLTK_DATA'] = nltk_data_dir

if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)

nltk.data.path.append(nltk_data_dir)

nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('omw-1.4', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt to /home/stefan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stefan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stefan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [131]:
# https://www.kaggle.com/rftexas/text-only-kfold-bert
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [132]:

# https://www.kaggle.com/code/vbmokin/nlp-eda-bag-of-words-tf-idf-glove-bert#5.-Data-Cleaning-
def remove_emoji(text):
    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def preprocess(text):
    text = remove_emoji(text)

    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    text = re.sub(r'\@\w+|\#', '', text)
    
    text = re.sub(r'\d+', '', text)
    
    text = text.lower()

    text = text.translate(str.maketrans('', '', string.punctuation))
    
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    for word in text.split():
        if word.lower() in abbreviations:
            text = text.replace(word, abbreviations[word.lower()])

    text = re.sub(r'(.)\1{1,}', r'\1', text)

    return text


In [133]:
columns_to_preprocess = ['text', 'keyword', 'location']

for column in columns_to_preprocess:
    train[column] = train[column].fillna('Missing')
    test[column] = test[column].fillna('Missing')
    
train['clean_text'] = train['text'].apply(preprocess)
test['clean_text'] = test['text'].apply(preprocess)

train['clean_keyword'] = train['keyword'].apply(preprocess)
test['clean_keyword'] = test['keyword'].apply(preprocess)

train['clean_location'] = train['location'].apply(preprocess)
test['clean_location'] = test['location'].apply(preprocess)

In [134]:
test.head()

Unnamed: 0,id,keyword,location,text,clean_text,clean_keyword,clean_location
0,0,Missing,Missing,Just happened a terrible car crash,hapened terible car crash,mising,mising
1,2,Missing,Missing,"Heard about #earthquake is different cities, s...",heard earthquake diferent cities stay safe eve...,mising,mising
2,3,Missing,Missing,"there is a forest fire at spot pond, geese are...",forest fire spot pond gese fleing acros stret ...,mising,mising
3,9,Missing,Missing,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires,mising,mising
4,11,Missing,Missing,Typhoon Soudelor kills 28 in China and Taiwan,typhon soudelor kils china taiwan,mising,mising


In [135]:
train.head()

Unnamed: 0.1,Unnamed: 0,keyword,location,text,target,clean_text,clean_keyword,clean_location
0,1,Missing,Missing,Our Deeds are the Reason of this #earthquake M...,1,deds reason earthquake may alah forgive us,mising,mising
1,4,Missing,Missing,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,mising,mising
2,5,Missing,Missing,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified oficers...,mising,mising
3,6,Missing,Missing,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,mising,mising
4,7,Missing,Missing,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,mising,mising


In [136]:
train.columns

Index(['Unnamed: 0', 'keyword', 'location', 'text', 'target', 'clean_text',
       'clean_keyword', 'clean_location'],
      dtype='object')

In [137]:
test.columns

Index(['id', 'keyword', 'location', 'text', 'clean_text', 'clean_keyword',
       'clean_location'],
      dtype='object')

In [138]:
train.drop(columns=['text', 'keyword', 'location'], inplace=True)
test.drop(columns=['text', 'keyword', 'location'], inplace=True)

train.drop(columns=['Unnamed: 0'], inplace=True)
test.drop(columns=['id'], inplace=True)

In [139]:
train.to_csv('train_cleaned.csv', index=False)

In [140]:
train.drop(columns=['clean_keyword', 'clean_location'], inplace=True)
test.drop(columns=['clean_keyword', 'clean_location'], inplace=True)

train.to_csv('train_cleaned.csv', index=False)
test.to_csv('test_cleaned.csv', index=False)

In [141]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['clean_text'].tolist(),
    train['target'].tolist(),
    test_size=0.2,  
    random_state=42
)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test['clean_text'].tolist(), truncation=True, padding=True)

class CustomDataset(Dataset):
    def __init__(self, encodings, labels=None): 
        self.encodings = encodings
        self.labels = labels  
    def __len__(self):
        return len(self.encodings['input_ids'])
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings) 

model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base')

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=1,              
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=64,   
    warmup_steps=100,               
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=50,                
    evaluation_strategy='steps',     
    eval_steps=1000,                
    save_steps=2000,                 
    load_best_model_at_end=True,     
    metric_for_best_model='f1',     
    greater_is_better=True,         
)

def compute_metrics(p):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

trainer = Trainer(
    model=model,                        
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=val_dataset,           
    compute_metrics=compute_metrics      
)

trainer.train()

val_results = trainer.evaluate()
print(val_results)

test_predictions = trainer.predict(test_dataset)
preds = test_predictions.predictions.argmax(-1)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/191 [00:00<?, ?it/s]

{'loss': 0.6848, 'grad_norm': 4.139852523803711, 'learning_rate': 2.5e-05, 'epoch': 0.26}
{'loss': 0.5844, 'grad_norm': 4.621327877044678, 'learning_rate': 5e-05, 'epoch': 0.52}
{'loss': 0.5005, 'grad_norm': 3.2949280738830566, 'learning_rate': 2.252747252747253e-05, 'epoch': 0.79}
{'train_runtime': 762.26, 'train_samples_per_second': 7.989, 'train_steps_per_second': 0.251, 'train_loss': 0.5655983630275228, 'epoch': 1.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.43749499320983887, 'eval_accuracy': 0.8115561391989494, 'eval_f1': 0.8084657174758022, 'eval_precision': 0.8138905638033413, 'eval_recall': 0.8115561391989494, 'eval_runtime': 50.6106, 'eval_samples_per_second': 30.093, 'eval_steps_per_second': 0.474, 'epoch': 1.0}


  0%|          | 0/51 [00:00<?, ?it/s]

In [142]:

# Prepare the submission file
submission = pd.read_csv('sample_submission.csv')
submission['target'] = preds
submission.to_csv('submission_deberta.csv', index=False)

