In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [2]:
# Read a file using pandas
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
path = '/content/drive/My Drive/Nat_Cat_Exercise/Events.csv'
df = pd.read_csv(path)

In [4]:
import pandas as pd

# Step 1: Prepare Labeled Data
data = [
    ("An earthquake has struck in San Francisco", "Earthquake"),
    ("New tornados identified in northwestern Ontario", "Tornado"),
    ("Volcano erupts in southwestern Iceland", "Volcano"),
    ("Flooding damage in Moreton Bay", "Floods"),
    ("4.0 Magnitude Earthquake Reported In US", "Earthquake"),
    ("Flooding expected for River Ouse in York", "Floods"),
    ("Wildfire burns across central California", "Wildfire"),
    ("Massive wildfire spreads in Greece", "Wildfire"),
    ("Flash floods hit parts of southern India", "Floods"),
    ("Powerful tornado destroys homes in Texas", "Tornado"),
    ("Strong earthquake rattles parts of Japan", "Earthquake"),
    ("Lava flows after volcano erupts in Hawaii", "Volcano"),
    ("New film breaks box office record",'' )
]
df = pd.DataFrame(data, columns=["title", "label"])

In [7]:
df = df.dropna(subset=['title'])

In [8]:
# Drop exact duplicate clean titles
df = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
print("New shape after deduplication:", df.shape)

New shape after deduplication: (65158, 8)


In [9]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
!pip install geonamescache

Collecting geonamescache
  Downloading geonamescache-2.0.0-py3-none-any.whl.metadata (3.2 kB)
Downloading geonamescache-2.0.0-py3-none-any.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geonamescache
Successfully installed geonamescache-2.0.0


In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [12]:
# 4. Clean Titles
import re
def clean_title(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['clean_title'] = df['title'].astype(str).apply(clean_title)

In [13]:
import spacy
import geonamescache

# Load spaCy model
#nlp = spacy.load("en_core_web_sm")

# Load GeoNames country and city names
gc = geonamescache.GeonamesCache()
countries = set([c['name'] for c in gc.get_countries().values()])
cities = set([c['name'] for c in gc.get_cities().values()])
gazetteer = countries.union(cities)

# Function to check if a title has a real location
def has_location(text):
    doc = nlp(text)

    # Rule 1: spaCy NER match with gazetteer
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC", "FAC"]:
            if ent.text.strip().title() in gazetteer:
                return True

    # Rule 2: Token-level backup match
    tokens = [t.text.strip().title() for t in doc if t.is_alpha]
    if any(tok in gazetteer for tok in tokens):
        return True

    return False


In [14]:
# 6. Detect Nat-Cat Keyword
CATEGORY_KEYWORDS = {
    'Earthquakes': [
        'earthquake', 'tremor', 'seismic', 'quake', 'aftershock', 'foreshock',
        'seism', 'epicenter', 'hypocenter', 'magnitude', 'richter scale',
        'seismic waves', 'ground shaking', 'fault line', 'tectonic', 'liquefaction',
        'seismograph', 'seismology', 'earth movement', 'earth vibration'
    ],

    'Hurricanes': [
        'hurricane', 'typhoon', 'cyclone', 'tropical storm', 'storm surge',
        'eye wall', 'rain bands', 'wind speed', 'category storm', 'tropical depression',
        'gale', 'tempest', 'monsoon', 'low pressure', 'storm system', 'hurricane warning',
        'hurricane watch', 'storm track', 'landfall', 'saffir-simpson'
    ],

    'Tornadoes': [
        'tornado', 'twister', 'funnel cloud', 'whirlwind', 'supercell',
        'mesocyclone', 'vortex', 'windstorm', 'tornado warning', 'tornado watch',
        'tornado siren', 'debris cloud', 'tornado alley', 'rotation', 'wall cloud',
        'hook echo', 'doppler radar', 'fujita scale', 'enhanced fujita', 'tornado outbreak'
    ],

    'Floods': [
        'flood', 'flooding', 'inundation', 'deluge', 'flash flood',
        'torrent', 'high water', 'overflow', 'submersion', 'waterlogging',
        'floodwaters', 'flood damage', 'flood alert', 'flood warning', 'flood watch',
        'levee breach', 'dam break', 'runoff', 'water rise', 'coastal flooding'
    ],

    'Wildfires': [
        'wildfire', 'bushfire', 'forest fire', 'brush fire', 'grass fire',
        'wildland fire', 'firestorm', 'conflagration', 'ember', 'fire spread',
        'fire line', 'fire suppression', 'fire evacuation', 'smoke plume', 'fire season',
        'fire danger', 'red flag warning', 'backfire', 'firebreak', 'containment line'
    ],

    'Tsunamis': [
        'tsunami', 'tidal wave', 'seismic wave', 'harbor wave', 'ocean surge',
        'coastal flooding', 'wave height', 'run-up', 'tsunami warning', 'tsunami alert',
        'tidal bore', 'tsunami advisory', 'seiche', 'displacement wave', 'tsunami siren',
        'tsunami evacuation', 'tsunami zone', 'tsunami buoys', 'megatsunami', 'tsunami ready'
    ],

    'Volcanic eruptions': [
        'volcano', 'eruption', 'lava', 'magma', 'ash cloud',
        'pyroclastic', 'volcanic ash', 'crater', 'caldera', 'fumarole',
        'geyser', 'volcanic gas', 'tephra', 'lahar', 'volcanic bomb',
        'pumice', 'volcanic winter', 'vog', 'volcano alert', 'volcanic explosivity index'
    ]
}
def is_natcat_event(title):
    title = title.lower()
    has_event = any(kw in title for kws in CATEGORY_KEYWORDS.values() for kw in kws)
    return has_event and has_location(title)

df['is_natcat'] = df['clean_title'].apply(is_natcat_event)

In [15]:
# 7. Filter
df_natcat = df[df['is_natcat'] == True].copy()
print("Filtered shape:", df_natcat.shape)

Filtered shape: (21570, 10)


In [16]:
df = df_natcat.copy()

In [17]:
df.head(3)

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,clean_title,is_natcat
3,https://www.ctvnews.ca/climate-and-environment...,,Iceland volcanoes bring tourists to island cou...,20240101T223000Z,https://www.ctvnews.ca/content/dam/ctvnews/en/...,ctvnews.ca,English,Canada,Iceland volcanoes bring tourists to island cou...,True
4,https://news.yahoo.com/tornados-scorchers-ice-...,,"Tornados , scorchers and ice storm : Top 10 we...",20240101T131500Z,https://s.yimg.com/ny/api/res/1.2/PXdWVXp40q9s...,news.yahoo.com,English,United States,Tornados scorchers and ice storm Top 10 weathe...,True
11,https://www.natureworldnews.com/articles/60104...,https://www.natureworldnews.com/amp/articles/6...,Yellowstone Supervolcano Magma Chamber Is Very...,20240101T150000Z,https://1471793142.rsc.cdn77.org/data/images/f...,natureworldnews.com,English,United States,Yellowstone Supervolcano Magma Chamber Is Very...,True


In [18]:
from transformers import pipeline
import pandas as pd

# Your DataFrame with a 'title' column
# Example:
# df = pd.DataFrame({'title': ["Massive wildfire spreads in California forest", "Universal Resort Releases More Details..."]})

# Define labels and threshold
candidate_labels = ["Earthquake", "Floods", "Volcano", "Tornado", "Wildfire"]
threshold = 0.5

# Load zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Run zero-shot classification
results = classifier(df['clean_title'].tolist(), candidate_labels=candidate_labels)

# Extract final labels with threshold logic
final_labels = []
top_scores = []

for result in results:
    labels = result['labels']
    scores = result['scores']
    top_index = scores.index(max(scores))
    top_label = labels[top_index]
    top_score = scores[top_index]
    final_label = top_label if top_score >= threshold else "Others"
    final_labels.append(final_label)
    top_scores.append(top_score)

# Add results to DataFrame
df['final_label'] = final_labels
df['top_score'] = top_scores

# View result
print(df[['title', 'final_label', 'top_score']])


Device set to use cuda:0


                                                   title final_label  \
3      Iceland volcanoes bring tourists to island cou...     Volcano   
4      Tornados , scorchers and ice storm : Top 10 we...     Tornado   
11     Yellowstone Supervolcano Magma Chamber Is Very...     Volcano   
12     Japan earthquakes : Coastal residents told to ...  Earthquake   
14           Topic | Tsunami | The Sydney Morning Herald      Others   
...                                                  ...         ...   
65129    IT DOESNT SEEM FAIR : Residents deal with to...     Tornado   
65134  In event of an earthquake , insurance industry...  Earthquake   
65135  National Weather Service : No evidence of torn...     Tornado   
65154  Montgomery County crime : Authorities detain d...     Tornado   
65155  Love Island India Reynolds reveals her family ...      Others   

       top_score  
3       0.970039  
4       0.964810  
11      0.979111  
12      0.931377  
14      0.315850  
...          ...  
65

In [20]:
data = df[['title', 'final_label', 'top_score']].copy()

In [23]:
# Step 2: Encode labels
label2id = {label: i for i, label in enumerate(sorted(data['final_label'].unique()))}
id2label = {i: label for label, i in label2id.items()}
data['label_id'] = data['final_label'].map(label2id)

In [24]:
# Step 3: Train-Test Split
train_df, test_df = train_test_split(data, test_size=0.3, stratify=data['label_id'], random_state=42)

In [25]:
# Step 4: Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df[['title', 'label_id']].rename(columns={'label_id': 'labels'}))
test_dataset = Dataset.from_pandas(test_df[['title', 'label_id']].rename(columns={'label_id': 'labels'}))

In [26]:
# Step 5: Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label2id))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def tokenize(batch):
    return tokenizer(batch["title"], truncation=True)

In [28]:
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/15099 [00:00<?, ? examples/s]

Map:   0%|          | 0/6471 [00:00<?, ? examples/s]

In [29]:
# Step 6: Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [30]:
# Step 7: Training Setup
training_args = TrainingArguments(
    output_dir="./bert-disaster",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [31]:
# Step 8: Train the Model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5162,0.127784
2,0.0003,0.117928
3,0.0253,0.121862
4,0.0002,0.13548
5,0.0,0.138827


TrainOutput(global_step=18875, training_loss=0.06683327641447587, metrics={'train_runtime': 2466.3999, 'train_samples_per_second': 30.609, 'train_steps_per_second': 7.653, 'total_flos': 842848465711644.0, 'train_loss': 0.06683327641447587, 'epoch': 5.0})

In [32]:
# Step 9: Predict on Test Set
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(-1)
true_labels = test_dataset['labels']

In [33]:
# Step 10: Evaluation
print("Classification Report:")
print(classification_report(true_labels, pred_labels, target_names=label2id.keys()))

Classification Report:
              precision    recall  f1-score   support

  Earthquake       1.00      0.99      1.00      1839
      Floods       0.97      0.98      0.97      1572
      Others       0.87      0.84      0.86       354
     Tornado       0.99      0.99      0.99      1050
     Volcano       0.98      0.99      0.99       536
    Wildfire       1.00      0.99      1.00      1120

    accuracy                           0.98      6471
   macro avg       0.97      0.97      0.97      6471
weighted avg       0.98      0.98      0.98      6471



In [34]:
# Step 11: Predict on New Titles
new_titles = [
    "Massive wildfire spreads in California forest",
    "Tornado causes power outage in Kansas",
    "Strong earthquake shakes southern Chile",
    "Iceland volcano spews lava again",
    "Flood warning issued for Mississippi River",
    "Tremors felt across small town",
    "Strong winds reported in Oklahoma",
    "Clouds of ash rising from crater",
    "Heavy rains hit northern region",
    "Fire spreads through forest area",
    "Dry and warm weekend expected",
    "New film breaks box office record",
    "River banks rise after days of monsoon",
    "Explosions heard near base of dormant mountain",
    "High winds twist trees in Missouri",
    "Massive plumes of smoke spotted on satellite",
    "Heavy rains expected across southeast",             # ambiguous, could be flood
    "Disaster warning issued for Pacific region",        # general
    "Major evacuation underway after tremors",           # earthquake, but vague
    "Ash covers sky after sudden mountain explosion",
    "Blaze engulfs forests in Portugal",              # Wildfire
    "Severe tremor damages buildings in Turkey",      # Earthquake
    "Thunderstorms bring flash floods to Vietnam",    # Floods
    "Molten rock bursts out of Mount Fuji",           # Volcano
    "Twister touches down near Nebraska farms",
    "EQ shakes the Bay Area",                       # Earthquake
    "Inferno consumes parts of Australia",          # Wildfire
    "Cloudburst triggers flooding in Assam",        # Floods
    "Eruption rocks volcano in Indonesia",          # Volcano
    "Whirlwind uproots trees in Oklahoma"           # Tornado

]

# Tokenize and predict
inputs = tokenizer(new_titles, return_tensors="pt", padding=True, truncation=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}


with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1).tolist()

# Map predictions to labels
for title, pred_id in zip(new_titles, predictions):
    print(f"{title} => {id2label[pred_id]}")

Massive wildfire spreads in California forest => Wildfire
Tornado causes power outage in Kansas => Tornado
Strong earthquake shakes southern Chile => Earthquake
Iceland volcano spews lava again => Volcano
Tremors felt across small town => Earthquake
Strong winds reported in Oklahoma => Others
Clouds of ash rising from crater => Others
Heavy rains hit northern region => Floods
Fire spreads through forest area => Wildfire
Dry and warm weekend expected => Others
New film breaks box office record => Others
River banks rise after days of monsoon => Floods
Explosions heard near base of dormant mountain => Others
High winds twist trees in Missouri => Others
Massive plumes of smoke spotted on satellite => Others
Heavy rains expected across southeast => Floods
Major evacuation underway after tremors => Earthquake
Ash covers sky after sudden mountain explosion => Volcano
Blaze engulfs forests in Portugal => Wildfire
Severe tremor damages buildings in Turkey => Earthquake
Thunderstorms bring flas