In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [21]:
# Read a file using pandas
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
path = '/content/drive/My Drive/Nat_Cat_Exercise/Events.csv'
df = pd.read_csv(path)

In [23]:
df = df.dropna(subset=['title'])

In [24]:
# Drop exact duplicate clean titles
df = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
print("New shape after deduplication:", df.shape)

New shape after deduplication: (65158, 8)


In [51]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [26]:
!pip install geonamescache



In [27]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [28]:
# 4. Clean Titles
import re
def clean_title(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['clean_title'] = df['title'].astype(str).apply(clean_title)

In [29]:
import spacy
import geonamescache

# Load spaCy model
#nlp = spacy.load("en_core_web_sm")

# Load GeoNames country and city names
gc = geonamescache.GeonamesCache()
countries = set([c['name'] for c in gc.get_countries().values()])
cities = set([c['name'] for c in gc.get_cities().values()])
gazetteer = countries.union(cities)

# Function to check if a title has a real location
def has_location(text):
    doc = nlp(text)

    # Rule 1: spaCy NER match with gazetteer
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC", "FAC"]:
            if ent.text.strip().title() in gazetteer:
                return True

    # Rule 2: Token-level backup match
    tokens = [t.text.strip().title() for t in doc if t.is_alpha]
    if any(tok in gazetteer for tok in tokens):
        return True

    return False


In [55]:
# 6. Detect Nat-Cat Keyword
CATEGORY_KEYWORDS = {
    'Earthquakes': [
        'earthquake', 'tremor', 'seismic', 'quake', 'aftershock', 'foreshock',
        'seism', 'epicenter', 'hypocenter', 'magnitude', 'richter scale',
        'seismic waves', 'ground shaking', 'fault line', 'tectonic', 'liquefaction',
        'seismograph', 'seismology', 'earth movement', 'earth vibration'
    ],

    'Hurricanes': [
        'hurricane', 'typhoon', 'cyclone', 'tropical storm', 'storm surge',
        'eye wall', 'rain bands', 'wind speed', 'category storm', 'tropical depression',
        'gale', 'tempest', 'monsoon', 'low pressure', 'storm system', 'hurricane warning',
        'hurricane watch', 'storm track', 'landfall', 'saffir-simpson'
    ],

    'Tornadoes': [
        'tornado', 'twister', 'funnel cloud', 'whirlwind', 'supercell',
        'mesocyclone', 'vortex', 'windstorm', 'tornado warning', 'tornado watch',
        'tornado siren', 'debris cloud', 'tornado alley', 'rotation', 'wall cloud',
        'hook echo', 'doppler radar', 'fujita scale', 'enhanced fujita', 'tornado outbreak'
    ],

    'Floods': [
        'flood', 'flooding', 'inundation', 'deluge', 'flash flood',
        'torrent', 'high water', 'overflow', 'submersion', 'waterlogging',
        'floodwaters', 'flood damage', 'flood alert', 'flood warning', 'flood watch',
        'levee breach', 'dam break', 'runoff', 'water rise', 'coastal flooding'
    ],

    'Wildfires': [
        'wildfire', 'bushfire', 'forest fire', 'brush fire', 'grass fire',
        'wildland fire', 'firestorm', 'conflagration', 'ember', 'fire spread',
        'fire line', 'fire suppression', 'fire evacuation', 'smoke plume', 'fire season',
        'fire danger', 'red flag warning', 'backfire', 'firebreak', 'containment line'
    ],

    'Tsunamis': [
        'tsunami', 'tidal wave', 'seismic wave', 'harbor wave', 'ocean surge',
        'coastal flooding', 'wave height', 'run-up', 'tsunami warning', 'tsunami alert',
        'tidal bore', 'tsunami advisory', 'seiche', 'displacement wave', 'tsunami siren',
        'tsunami evacuation', 'tsunami zone', 'tsunami buoys', 'megatsunami', 'tsunami ready'
    ],

    'Volcanic eruptions': [
        'volcano', 'eruption', 'lava', 'magma', 'ash cloud',
        'pyroclastic', 'volcanic ash', 'crater', 'caldera', 'fumarole',
        'geyser', 'volcanic gas', 'tephra', 'lahar', 'volcanic bomb',
        'pumice', 'volcanic winter', 'vog', 'volcano alert', 'volcanic explosivity index'
    ]
}
def is_natcat_event(title):
    title = title.lower()
    has_event = any(kw in title for kws in CATEGORY_KEYWORDS.values() for kw in kws)
    return has_event and has_location(title)

df['is_natcat'] = df['clean_title'].apply(is_natcat_event)

In [56]:
# 7. Filter
df_natcat = df[df['is_natcat'] == True].copy()
print("Filtered shape:", df_natcat.shape)

Filtered shape: (21570, 10)


In [50]:
#!pip install keybert

In [33]:
from keybert import KeyBERT
from collections import defaultdict

In [34]:
CATEGORY_KEYWORDS = {
    'Earthquakes': ['earthquake', 'tremor', 'seismic', 'quake','tsunami', 'tidal wave', 'seismic wave'],
    'Floods': ['flood', 'flooding', 'inundation'],
    'Volcanic eruptions': ['volcano', 'eruption', 'lava'],
    'Tornadoes': ['tornado', 'twister', 'cyclone','hurricane', 'typhoon', 'cyclone'],
    'Wildfires': ['wildfire', 'bushfire', 'forest fire'],

}

In [35]:
kw_model = KeyBERT()

In [42]:
def classify_disaster(title, threshold=0.3):
    # Extract keywords
    keywords = kw_model.extract_keywords(
        title,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=5
    )

    # Filter keywords by score
    filtered_keywords = [kw for kw, score in keywords if score >= threshold]

    # Calculate category scores
    category_scores = defaultdict(int)
    for category, terms in CATEGORY_KEYWORDS.items():
        for term in terms:
            if any(term in kw.lower() for kw in filtered_keywords):
                category_scores[category] += 1

    # Determine predicted category
    predicted_category = max(category_scores.items(), key=lambda x: x[1])[0] if category_scores else "Unknown"

    return predicted_category, filtered_keywords

In [37]:
# Apply classification to each title
df_natcat['predicted_category'] = df_natcat['title'].apply(classify_disaster)

In [38]:
df_natcat.reset_index(drop=True, inplace=True)

In [39]:
df_natcat.reset_index(drop=True, inplace=True)

In [40]:
df_natcat['key_Bert'] = df_natcat['predicted_category'].apply(lambda x: x[0])


In [41]:
df_natcat.key_Bert.value_counts()

Unnamed: 0_level_0,count
key_Bert,Unnamed: 1_level_1
Earthquakes,6271
Floods,4708
Tornadoes,4417
Wildfires,3351
Volcanic eruptions,1595
Unknown,1228


In [43]:
def test_classify_disaster():
    """Simple test cases for classify_disaster function"""

    # Example 1: Clear natural disaster
    title = "Major earthquake hits California"
    category, keywords = classify_disaster(title)
    print(f"Test 1 - '{title}':")
    print(f"  Category: {category}")
    print(f"  Keywords: {keywords}")
    print()

    # Example 2: Industrial accident
    title = "Chemical plant explosion in Texas"
    category, keywords = classify_disaster(title)
    print(f"Test 2 - '{title}':")
    print(f"  Category: {category}")
    print(f"  Keywords: {keywords}")
    print()

    # Example 3: Below threshold (should return Unknown)
    title = "Minor incident at local factory"
    category, keywords = classify_disaster(title, threshold=0.5)
    print(f"Test 3 - '{title}':")
    print(f"  Category: {category}")
    print(f"  Keywords: {keywords}")
    print()

    # Example 4: No matching category
    title = "Celebrity wedding announcement"
    category, keywords = classify_disaster(title)
    print(f"Test 4 - '{title}':")
    print(f"  Category: {category}")
    print(f"  Keywords: {keywords}")

# Run the tests
if __name__ == "__main__":
    test_classify_disaster()

Test 1 - 'Major earthquake hits California':
  Category: Earthquakes
  Keywords: ['earthquake hits', 'major earthquake', 'hits california', 'earthquake', 'california']

Test 2 - 'Chemical plant explosion in Texas':
  Category: Unknown
  Keywords: ['explosion texas', 'plant explosion', 'explosion', 'chemical plant', 'chemical']

Test 3 - 'Minor incident at local factory':
  Category: Unknown
  Keywords: ['incident local', 'minor incident', 'incident', 'local factory', 'factory']

Test 4 - 'Celebrity wedding announcement':
  Category: Unknown
  Keywords: ['celebrity wedding', 'wedding announcement', 'wedding', 'celebrity', 'announcement']


**Bert Topic**

In [49]:
#!pip install bertopic
#from bertopic import BERTopic

In [45]:
from bertopic import BERTopic
import pandas as pd

# Fit BERTopic
topic_model = BERTopic(language="english", nr_topics=5)
topics, probs = topic_model.fit_transform(df_natcat['clean_title'])

# STEP 1: Display topic keywords for manual inspection
print("Topic Keywords:")
for topic_num in set(topics):
    print(f"Topic {topic_num}: {topic_model.get_topic(topic_num)}")

# STEP 2: Manually map topics to your 5 disaster categories
# After inspecting above output, update this dictionary accordingly
manual_topic_map = {
    0: "Earthquake",
    1: "Tornado",
    2: "Volcano",
    3: "Floods",
    4: "Wildfire"
}

# STEP 3: Create DataFrame with results
df_result = pd.DataFrame({
    "title": df_natcat['clean_title'],
    "topic_num": topics,
    "topic_label": [manual_topic_map.get(t, "Unknown") for t in topics]
})

print("\nFinal Categorized Titles:")
print(df)


Topic Keywords:
Topic 0: [('earthquake', np.float64(0.14504829607353298)), ('magnitude', np.float64(0.07563144635479475)), ('in', np.float64(0.07346802797658374)), ('mag', np.float64(0.06889097843860043)), ('near', np.float64(0.06467296749226831)), ('light', np.float64(0.06248026779868852)), ('of', np.float64(0.06102940647299294)), ('quake', np.float64(0.058922233013910794)), ('volcano', np.float64(0.05587115030837778)), ('japan', np.float64(0.048303188464383785))]
Topic 1: [('tornado', np.float64(0.0695846008459822)), ('flood', np.float64(0.06852435241238697)), ('in', np.float64(0.06135681320474038)), ('of', np.float64(0.054059426557127775)), ('hurricane', np.float64(0.05138149736583858)), ('to', np.float64(0.04922928346838302)), ('for', np.float64(0.04762231489856661)), ('and', np.float64(0.04023598401103956)), ('as', np.float64(0.03873916091521555)), ('the', np.float64(0.035573495359894576))]
Topic 2: [('wildfire', np.float64(0.17896668876770125)), ('wildfires', np.float64(0.0792379

In [47]:
df_result.shape

(21570, 3)

In [48]:
df_result.topic_label.value_counts()

Unnamed: 0_level_0,count
topic_label,Unnamed: 1_level_1
Unknown,6948
Earthquake,6250
Tornado,6034
Volcano,2322
Floods,16


In [58]:
# ✅ TEST CASES
# Test 1: Ensure number of output rows matches number of input titles
assert len(df_result) == len(df_natcat), "Mismatch in number of rows between input and result."

# Test 2: Ensure all titles have a topic number assigned
assert df_result["topic_num"].isnull().sum() == 0, "Some titles have no topic assigned."

# Test 3: Ensure all topic labels are mapped correctly (or marked as 'Unknown')
valid_labels = {"Earthquake", "Tornado", "Volcano", "Floods", "Wildfire", "Unknown"}
assert all(label in valid_labels for label in df_result["topic_label"]), "Unexpected topic label found."


In [61]:
df_result# ✅ TEST CASES
assert len(df_result) == len(df_natcat), "Mismatch in number of rows between input and result."
assert df_result["topic_num"].isnull().sum() == 0, "Some titles have no topic assigned."

valid_labels = {"Earthquake", "Tornado", "Volcano", "Floods", "Wildfire", "Unknown"}
assert all(label in valid_labels for label in df_result["topic_label"]), "Unexpected topic label found."

print("✅ All tests passed successfully.")


✅ All tests passed successfully.


In [62]:
df_result.head()

Unnamed: 0,title,topic_num,topic_label
0,Iceland volcanoes bring tourists to island cou...,0,Earthquake
1,Tornados scorchers and ice storm Top 10 weathe...,1,Tornado
2,Yellowstone Supervolcano Magma Chamber Is Very...,-1,Unknown
3,Japan earthquakes Coastal residents told to ev...,0,Earthquake
4,Topic Tsunami The Sydney Morning Herald,0,Earthquake


In [63]:
df_result.topic_label.value_counts()

Unnamed: 0_level_0,count
topic_label,Unnamed: 1_level_1
Unknown,6948
Earthquake,6250
Tornado,6034
Volcano,2322
Floods,16


In [66]:
def classify_new_title(title, topic_model, manual_topic_map):
    topic, prob = topic_model.transform([title])
    topic_num = topic[0]
    label = manual_topic_map.get(topic_num, "Unknown")

    return {
        "title": title,
        "topic_num": topic_num,
        "probability": prob[0],
        "topic_label": label
    }

In [73]:
new_title = "4.5 magnitude earthquake strikes southern Turkey"

# Reuse the already fitted topic_model
result = classify_new_title(new_title, topic_model, manual_topic_map)

print(f"Title: {result['title']}")
print(f"Topic #: {result['topic_num']}")
print(f"Label: {result['topic_label']}")
print(f"Confidence: {result['probability']:.4f}")

ValueError: This BERTopic instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.