# Import Data, EDA

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

Take a quick look at the data we are working with

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_data

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [5]:
for i in range(50,60):
    print(train_data.text.iloc[i])
    print(train_data.target.iloc[i])

Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k
1
Man wife get six years jail for setting ablaze niece
http://t.co/eV1ahOUCZA
1
SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintendent Lanford Salmon has r ... - http://t.co/vplR5Hka2u http://t.co/SxHW2TNNLf
0
Police: Arsonist Deliberately Set Black Church In North CarolinaåÊAblaze http://t.co/pcXarbH9An
1
Noches El-Bestia '@Alexis_Sanchez: happy to see my teammates and training hard ?? goodnight gunners.?????? http://t.co/uc4j4jHvGR'
0
#Kurds trampling on Turkmen flag later set it ablaze while others vandalized offices of Turkmen Front in #Diyala http://t.co/4IzFdYC3cg
1
TRUCK ABLAZE : R21. VOORTREKKER AVE. OUTSIDE OR TAMBO INTL. CARGO SECTION. http://t.co/8kscqKfKkF
1
Set our hearts ablaze and every city was a gift And every skyline was like a kiss upon the lips @Û_ https://t.co/cYoMPZ1A0Z
0
They sky was ablaze tonight in Los Angeles. I'm expecting IG and FB to be filled with sunset shots if I know my p

# Clean Up Tweets Before ML

Arguably the most important step (better data > better model) and after looking at some of the other entries, thinking there is plenty of room for improvement here.

TODO: Convert common shorthand (ie 'zzz') to words (ie 'Sleep')

In [6]:
from string import punctuation
import re
from nltk.corpus import stopwords
import nltk
import emoji
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text)
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

stopWords = set(stopwords.words('english'))

def clean_text(text):
    # change URL/@ tags to keywords
    text = ' '.join(['URL' if 'http://' in i or 'https://' in i else i for i in text.split(' ') ])
    text = ' '.join(['ATUSER' if len(i)>0 and i[0]=='@' else i for i in text.split(' ') ])
    #remove punctuation
    text = ''.join([c for c in text if c not in punctuation])
    #replace newline with space
    text = re.sub('\n',' ',text)
    #remove extra spaces
    text = re.sub(' +', ' ', text)
    #remove stop words
    text = ' '.join([i for i in text.lower().split(' ') if i not in stopWords])
    
    # some additional clean up steps I found interesting online
    text = re.sub(r'@[A-Za-z0-9_]+','',text)
    text = re.sub(r'RT : ','',text)
    text = re.sub(emoji.get_emoji_regexp(), r"", text)
    text = re.sub(r"<.*?>","",text)
    return text


def make_Lower(text):
    return str.lower(text)

train_data.text = train_data.text.apply(clean_text)
train_data.text = train_data.text.apply(correct_spellings)
text = train_data.text.tolist()

test_data.text = test_data.text.apply(clean_text)
test_data.text = test_data.text.apply(correct_spellings)
test_text = test_data.text.tolist()

In [7]:
train_data.text.values

array(['deeds reason earthquake may allah forgive us',
       'forest fire near la ronge sask canada',
       'residents asked shelter place notified officers evacuation shelter place orders expected',
       ..., 'm194 0104 utc5km volcano hawaii url',
       'police investigating ebike collided car little portugal ebike rider suffered serious nonlife threatening injuries',
       'latest homes razed northern california wildfire abc news url'],
      dtype=object)

# Fine-Tune Pre-Trained BERT

In [56]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to C:\Users\duran\.cache\huggingface\transformers\tmpe10k470h


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

storing https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt in cache at C:\Users\duran/.cache\huggingface\transformers\0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for C:\Users\duran/.cache\huggingface\transformers\0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99





https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to C:\Users\duran\.cache\huggingface\transformers\tmpr0tkodj8


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…

storing https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json in cache at C:\Users\duran/.cache\huggingface\transformers\8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
creating metadata file for C:\Users\duran/.cache\huggingface\transformers\8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to C:\Users\duran\.cache\huggingface\transformers\tmp3hv34414





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…

storing https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json in cache at C:\Users\duran/.cache\huggingface\transformers\75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
creating metadata file for C:\Users\duran/.cache\huggingface\transformers\75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\duran/.cache\huggingface\transformers\0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://hug




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…

storing https://huggingface.co/distilbert-base-uncased/resolve/main/config.json in cache at C:\Users\duran/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
creating metadata file for C:\Users\duran/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\duran/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_




In [57]:
split_frac = 0.8

## split data into training, validation data (features and labels, x and y)

split_idx = int(len(text)*split_frac)
train_x, test_x = text[:split_idx], text[split_idx:]
train_y, test_y = train_data.target.tolist()[:split_idx], train_data.target.tolist()[split_idx:]

print(train_x[0])
print(train_y[0])

deeds reason earthquake may allah forgive us
1


In [58]:
import torch

def tokenize_function(text):
    return tokenizer(text, padding="max_length", truncation=True)

#train_x = [tokenize_function(i) for i in train_x]
#test_x = [tokenize_function(i) for i in test_x]

train_x = tokenizer(train_x, padding="max_length", truncation=True)
text_x = tokenizer(test_x, padding="max_length", truncation=True)

train_y = [float(i) for i in train_y]
test_y = [float(i) for i in test_y]

In [59]:
import torch

class DisDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DisDataset(train_x, train_y)
test_dataset = DisDataset(test_x, test_y)

In [60]:
#from transformers import ElectraForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=1)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\duran/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.9.1",
  "vocab_size": 30522
}

https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin not found in cache or force_download set to Tr

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…

storing https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin in cache at C:\Users\duran/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
creating metadata file for C:\Users\duran/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\duran/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a





Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [61]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [62]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset
)

In [63]:
trainer.train()

***** Running training *****
  Num examples = 6090
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1143


Step,Training Loss
10,0.4773
20,0.4845
30,0.4227
40,0.3516
50,0.219
60,0.2069
70,0.2149
80,0.1673
90,0.1739
100,0.165


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1143, training_loss=0.12563544300597485, metrics={'train_runtime': 15274.2222, 'train_samples_per_second': 1.196, 'train_steps_per_second': 0.075, 'total_flos': 2420136212981760.0, 'train_loss': 0.12563544300597485, 'epoch': 3.0})

# Run Predictions on Submission Dataset

In [64]:
sub_tokens = tokenizer(test_data.text.tolist(), padding="max_length", truncation=True)
sub_labs = [0 for i in range(len(sub_tokens.input_ids))]

In [65]:
sub_dataset = DisDataset(sub_tokens, sub_labs)

In [66]:
model_path = "results/checkpoint-1000"
model_submission = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=1)

test_trainer = Trainer(model_submission)

raw_pred, _, _ = test_trainer.predict(sub_dataset)

loading configuration file results/checkpoint-1000\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "regression",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "vocab_size": 30522
}

loading weights file results/checkpoint-1000\pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized f

In [67]:
preds = [round(i[0]) for i in raw_pred]

In [69]:
test_data['target'] = preds
test_data.filter(['id','target']).to_csv('sub_BERT.csv',index=False)

In [70]:
test_data.head(50)

Unnamed: 0,id,keyword,location,text,preds,target
0,0,,,happened terrible car crash,1,1
1,2,,,heard earthquake different cities stay safe ev...,1,1
2,3,,,forest fire spot pond geese fleeing across str...,1,1
3,9,,,apocalypse lighting spokane wildfires,1,1
4,11,,,typhoon soudelor kills 28 china taiwan,1,1
5,12,,,shakingits earthquake,1,1
6,21,,,theyd probably still show life arsenal yesterd...,0,0
7,22,,,hey,0,0
8,27,,,nice hat,0,0
9,29,,,fuck,0,0
