<a href="https://colab.research.google.com/github/prafullahas/Samrakshak-Flood-Alert-System/blob/main/socialmedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whichever `pip` is first on the shell PATH.
%pip install faker



In [None]:
"""
enhanced_synthetic_hazard_dataset.py

Generates a realistic synthetic social media dataset for disaster detection and sentiment analysis:
- Adds typos, emojis, random capitalization
- Multiple templates for hazards and neutral posts
- Balances classes
- Includes 'None' class for non-hazard posts
"""

import random
from faker import Faker
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Seed every random source used below so the generated posts are repeatable.
fake = Faker()
# Faker draws from its own generator, which `random.seed` / `np.random.seed`
# do not touch; without this call the values produced by `fake` change on
# every run.
# NOTE(review): `date_time_this_year()` presumably also depends on the current
# date, so timestamps may still differ between days — confirm.
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# ------------------------------------
# Parameters
# ------------------------------------
num_posts = 3000  # posts generated before class balancing
locations = "Kerala|Goa|Mumbai|Tamil Nadu|Odisha".split("|")
hazards = "Flood|Tsunami|High Tide|Low Tide|Cyclone".split("|")
tags = [
    "#" + name
    for name in (
        "flood", "tsunami", "hightide", "lowtide",
        "cyclone", "weatheralert", "coast", "monsoon",
    )
]
sentiments = "Positive Neutral Negative".split()

# ------------------------------------
# Sentence templates
# ------------------------------------
# Every hazard template ends with a hashtag slot, so the shared " {tag}"
# suffix is appended once here instead of being repeated in each string.
hazard_templates = [
    body + " {tag}"
    for body in (
        "Massive {hazard} reported in {location}! Stay safe everyone.",
        "Just witnessed {hazard} near {location}. It's getting scary.",
        "The {hazard} in {location} caused huge damage today.",
        "Weather alert: {hazard} expected near {location} coast.",
        "Authorities are on alert due to {hazard} in {location}.",
        "Minor {hazard} reported in {location}, situation under control.",
        "OMG! {hazard} near {location} is wild 😰",
        "Hope everyone in {location} stays safe from the {hazard}.",
    )
]

# Non-hazard posts only need a location.
neutral_templates = list((
    "Beautiful sunset in {location} today 🌅",
    "Enjoying seafood near {location} coast 🐟",
    "Tourists are loving the beaches of {location}! #travel",
    "Great weather today in {location}, no worries at all!",
    "Just chilling at {location}, nothing to worry about 😎",
))

# ------------------------------------
# Helper functions
# ------------------------------------
def add_noise(text):
    """Add social-media style noise to a post: typos, a trailing emoji, random caps.

    Three perturbations are applied independently, in this order:

    * with probability 0.2, "flood" becomes "flod" and "cyclone" becomes
      "cyc1one", for both the lowercase spelling and the capitalised one;
    * with probability 0.3, one randomly chosen suffix is appended;
    * with probability 0.3, every character is upper-cased with probability 0.3.

    Args:
        text: The post text to perturb.

    Returns:
        The perturbed text (identical to the input when no perturbation fires).
    """
    if random.random() < 0.2:
        # A lowercase-only replacement misses the capitalised "Flood" /
        # "Cyclone" spellings, so both forms are handled.
        for word, typo in (
            ("flood", "flod"),
            ("Flood", "Flod"),
            ("cyclone", "cyc1one"),
            ("Cyclone", "Cyc1one"),
        ):
            text = text.replace(word, typo)
    if random.random() < 0.3:
        text += random.choice([" 😰", " 🌊", " ⚠️", " 🙏", "!!!", "😂"])
    if random.random() < 0.3:
        text = "".join(c.upper() if random.random() < 0.3 else c for c in text)
    return text

# Optional severity mapping
# Severity levels each hazard type may be assigned.
severity_map = {
    "Flood":     ["High", "Medium", "Low"],
    "Tsunami":   ["High", "Medium"],
    "Cyclone":   ["High", "Medium", "Low"],
    "High Tide": ["Medium", "Low"],
    "Low Tide":  ["Low", "Medium"],
    "None":      ["None"],
}


def random_severity(hazard):
    """Pick a random severity level allowed for `hazard`.

    Hazards that are missing from `severity_map` always get "Medium".
    """
    allowed_levels = severity_map.get(hazard)
    if allowed_levels is None:
        allowed_levels = ["Medium"]
    return random.choice(allowed_levels)

# ------------------------------------
# Generate hazard posts
# ------------------------------------
# One dict per generated post; turned into a DataFrame after the loop.
# The statements below consume the global `random` stream in a fixed order,
# so reordering them changes the generated dataset.
data = []
for i in range(num_posts):
    loc = random.choice(locations)
    is_hazard = random.random() > 0.3  # 70% hazard, 30% neutral

    if is_hazard:
        hazard = random.choice(hazards)
        template = random.choice(hazard_templates)
        # The hashtag is drawn independently of the hazard, so it can name a
        # different hazard than the one in the sentence.
        text = template.format(hazard=hazard, location=loc, tag=random.choice(tags))
        text = add_noise(text)
        # NOTE(review): sentiment and severity are drawn at random and do not
        # depend on the chosen template (e.g. "Massive ..." vs "Minor ..."),
        # so the text carries no signal for these two labels.
        sentiment = random.choice(sentiments)
        severity = random_severity(hazard)
    else:
        # Non-hazard post: fixed "None" hazard/severity and "Positive" sentiment.
        hazard = "None"
        text = random.choice(neutral_templates).format(location=loc)
        text = add_noise(text)
        sentiment = "Positive"
        severity = "None"

    data.append({
        "post_id": f"p{i+1}",
        "user_id": f"u{random.randint(1,200)}",
        "timestamp": fake.date_time_this_year(),
        "content": text,
        "location": loc,
        "hazard": hazard,
        "sentiment": sentiment,
        "severity": severity
    })

df = pd.DataFrame(data)

# ------------------------------------
# Balance classes
# ------------------------------------
# Upsample every hazard class (sampling with replacement) to the size of the
# largest class, then shuffle the combined rows.
# NOTE(review): rows are duplicated here before any train/val/test split, so
# identical rows can later land on both sides of a split.
max_count = df['hazard'].value_counts().max()
balanced_dfs = [
    resample(df[df['hazard'] == label], replace=True, n_samples=max_count, random_state=42)
    for label in df['hazard'].unique()
]

shuffled_rows = pd.concat(balanced_dfs).sample(frac=1, random_state=42)
df_balanced = shuffled_rows.reset_index(drop=True)

# ------------------------------------
# Save to CSV
# ------------------------------------
# Persist the balanced dataset and print the resulting label distributions.
df_balanced.to_csv("synthetic_hazard_dataset_enhanced.csv", index=False)
print("✅ Enhanced balanced dataset saved to synthetic_hazard_dataset_enhanced.csv")
for label_column in ("hazard", "sentiment"):
    print(df_balanced[label_column].value_counts())


✅ Enhanced balanced dataset saved to synthetic_hazard_dataset_enhanced.csv
hazard
Flood        952
Cyclone      952
None         952
Low Tide     952
High Tide    952
Tsunami      952
Name: count, dtype: int64
sentiment
Positive    2584
Neutral     1619
Negative    1509
Name: count, dtype: int64


In [None]:
import pandas as pd
from datasets import Dataset

# Load the enhanced dataset.
# "None" is a real class label in the `hazard` and `severity` columns, but
# pandas' default NA parsing turns the literal text "None" into NaN. Limit NA
# parsing to empty fields so the label survives the CSV round trip.
df = pd.read_csv(
    "synthetic_hazard_dataset_enhanced.csv",
    keep_default_na=False,
    na_values=[""],
)

# Optional: check
print(df.head())
print(df['hazard'].value_counts())
print(df['sentiment'].value_counts())


  post_id user_id                   timestamp  \
0   p1151    u143  2025-07-04 18:46:45.754493   
1   p2862    u138  2025-02-23 15:26:08.308773   
2   p2084     u86  2025-04-15 08:21:33.210972   
3    p519     u30  2025-02-28 07:26:22.752140   
4   p1549     u93  2025-08-27 13:39:02.197007   

                                             content location    hazard  \
0            OMG! FlOoD nEAr OdiSha is wIlD 😰 #cOASt   Odisha     Flood   
1  Hope everyone in Odisha stays safe from the Cy...   Odisha   Cyclone   
2               Beautiful sunset in Odisha today 🌅 😰   Odisha       NaN   
3  Minor Low Tide reported in Goa, situation unde...      Goa  Low Tide   
4  Massive Low Tide reported in Mumbai! Stay safe...   Mumbai  Low Tide   

  sentiment severity  
0  Negative      Low  
1   Neutral      Low  
2  Positive      NaN  
3  Negative   Medium  
4  Negative      Low  
hazard
Flood        952
Cyclone      952
Low Tide     952
High Tide    952
Tsunami      952
Name: count, dtype: int6

In [None]:
# Neutral posts have missing hazard/severity after loading; put the "None"
# label back in both columns.
for label_column in ("hazard", "severity"):
    df[label_column] = df[label_column].fillna("None")

# Optional: check
for label_column in ("hazard", "severity"):
    print(df[label_column].value_counts())


hazard
Flood        952
Cyclone      952
None         952
Low Tide     952
High Tide    952
Tsunami      952
Name: count, dtype: int64
severity
Medium    1974
Low       1670
High      1116
None       952
Name: count, dtype: int64


In [None]:
# Number of missing values left in each column.
df.isna().sum()

Unnamed: 0,0
post_id,0
user_id,0
timestamp,0
content,0
location,0
hazard,0
sentiment,0
severity,0


In [None]:
from sklearn.model_selection import train_test_split

# 70% train; the remaining 30% is halved into validation and test.
# Both splits are stratified on the hazard label.
train_df, temp_df = train_test_split(
    df, stratify=df["hazard"], test_size=0.3, random_state=42
)
val_df, test_df = train_test_split(
    temp_df, stratify=temp_df["hazard"], test_size=0.5, random_state=42
)

# Wrap each split as a Hugging Face Dataset.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)


In [None]:
# Hazard labels (including 'None'); ids follow the order in which the labels
# first appear in `df`.
_hazard_names = df['hazard'].unique()
hazard_labels = dict(zip(_hazard_names, range(len(_hazard_names))))
id2hazard = dict(zip(hazard_labels.values(), hazard_labels.keys()))

# Sentiment labels
sentiment_labels = {
    name: idx for idx, name in enumerate(("Positive", "Neutral", "Negative"))
}
id2sentiment = dict(zip(sentiment_labels.values(), sentiment_labels.keys()))


In [None]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


def tokenize_fn(batch):
    """Tokenize the `content` field of a batch, truncating to at most 128 tokens."""
    texts = batch["content"]
    return tokenizer(texts, max_length=128, truncation=True, padding=True)


# Tokenize all three splits in batched mode.
train_ds, val_ds, test_ds = (
    split.map(tokenize_fn, batched=True) for split in (train_ds, val_ds, test_ds)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
def encode_labels(batch):
    """Add integer `hazard_label` and `sentiment_label` fields to one example."""
    hazard_id = hazard_labels[batch["hazard"]]
    sentiment_id = sentiment_labels[batch["sentiment"]]
    batch["hazard_label"] = hazard_id
    batch["sentiment_label"] = sentiment_id
    return batch


train_ds, val_ds, test_ds = (
    split.map(encode_labels) for split in (train_ds, val_ds, test_ds)
)


Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
from transformers import RobertaForSequenceClassification

# Each classifier starts from the same "roberta-base" checkpoint and gets a
# classification head sized to its own label set.

# Hazard type model
_hazard_head = dict(
    num_labels=len(hazard_labels),
    id2label=id2hazard,
    label2id=hazard_labels,
)
model_hazard = RobertaForSequenceClassification.from_pretrained("roberta-base", **_hazard_head)

# Sentiment model
_sentiment_head = dict(
    num_labels=len(sentiment_labels),
    id2label=id2sentiment,
    label2id=sentiment_labels,
)
model_sentiment = RobertaForSequenceClassification.from_pretrained("roberta-base", **_sentiment_head)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy, micro-F1 and macro-F1 from a (scores, labels) pair.

    The predicted class is the argmax of the scores along axis 1.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    return dict(
        accuracy=accuracy_score(labels, predicted),
        f1_micro=f1_score(labels, predicted, average="micro"),
        f1_macro=f1_score(labels, predicted, average="macro"),
    )


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs go; keep at most two checkpoints on disk.
    output_dir="./roberta_hazard_checkpoints",
    logging_dir="./logs",
    save_total_limit=2,
    # Optimisation settings.
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    # Batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)


In [None]:
from transformers import Trainer

# Both trainers share the same arguments, datasets, tokenizer and metrics;
# only the model differs.
# NOTE(review): the two trainers receive the same `train_ds` / `val_ds`, so
# they would train on the same label column — confirm that the sentiment
# trainer is meant to get a dataset carrying sentiment labels.
_shared_trainer_kwargs = dict(
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer_hazard = Trainer(model=model_hazard, **_shared_trainer_kwargs)

trainer_sentiment = Trainer(model=model_sentiment, **_shared_trainer_kwargs)


  trainer_hazard = Trainer(
  trainer_sentiment = Trainer(


In [None]:
# For hazard classification
def encode_hazard(batch):
    """Store the integer id of the example's hazard in its `labels` field."""
    hazard_id = hazard_labels[batch["hazard"]]
    batch["labels"] = hazard_id
    return batch


train_ds, val_ds, test_ds = (
    split.map(encode_hazard) for split in (train_ds, val_ds, test_ds)
)


Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
# Fixed hazard -> id mapping, numbered in list order with "None" as id 0.
hazard_labels = {
    name: idx
    for idx, name in enumerate(
        ["None", "Low Tide", "High Tide", "Flood", "Cyclone", "Tsunami"]
    )
}


In [None]:
def encode_hazard(batch):
    """Write the hazard id looked up in `hazard_labels` into the `labels` field."""
    batch.update({"labels": hazard_labels[batch["hazard"]]}) if False else None
    batch["labels"] = hazard_labels[batch["hazard"]]
    return batch


# Re-encode all three splits with the current `hazard_labels` mapping.
train_ds, val_ds, test_ds = [
    split.map(encode_hazard) for split in (train_ds, val_ds, test_ds)
]


Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


def tokenize_fn(batch):
    """Run the RoBERTa tokenizer over a batch's `content` texts (max 128 tokens)."""
    tokenizer_options = dict(truncation=True, padding=True, max_length=128)
    return tokenizer(batch["content"], **tokenizer_options)


train_ds, val_ds, test_ds = [
    split.map(tokenize_fn, batched=True) for split in (train_ds, val_ds, test_ds)
]


Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): this only binds a module-level variable named
# `remove_unused_columns`; nothing else in the visible notebook reads it.
# It looks like it was meant to be a `TrainingArguments(...)` keyword —
# confirm, then move it there or delete this cell.
remove_unused_columns=False


In [None]:
# Label ids are assigned in alphabetical order of the label names.

# Hazard (disaster type)
hazard_labels = sorted(df_balanced["hazard"].unique())
hazard2id = dict(zip(hazard_labels, range(len(hazard_labels))))
id2hazard = dict(zip(hazard2id.values(), hazard2id.keys()))

# Severity
severity_labels = sorted(df_balanced["severity"].unique())
severity2id = dict(zip(severity_labels, range(len(severity_labels))))
id2severity = dict(zip(severity2id.values(), severity2id.keys()))

print("Hazard labels:", hazard2id)
print("Severity labels:", severity2id)


Hazard labels: {'Cyclone': 0, 'Flood': 1, 'High Tide': 2, 'Low Tide': 3, 'None': 4, 'Tsunami': 5}
Severity labels: {'High': 0, 'Low': 1, 'Medium': 2, 'None': 3}


In [None]:
from sklearn.model_selection import train_test_split

# Drop NaNs just in case
_required_columns = ["content", "hazard", "severity"]
df_balanced = df_balanced.dropna(subset=_required_columns)

# 70 / 15 / 15 split, stratified on the hazard label at both steps.
train_df, temp_df = train_test_split(
    df_balanced,
    stratify=df_balanced["hazard"],
    test_size=0.3,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["hazard"],
    test_size=0.5,
    random_state=42,
)


In [None]:
from datasets import Dataset
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


def tokenize(batch):
    """Tokenize `content`, truncating and padding every example to 128 tokens."""
    return tokenizer(batch["content"], max_length=128, padding="max_length", truncation=True)


def _prepare_task_datasets(frames, column, label_map):
    """Build tokenized datasets for one task from (train, val, test) frames.

    Keeps only `content` and `column`, replaces `column` with an integer
    `labels` field looked up in `label_map`, then tokenizes `content`.
    """
    raw = [Dataset.from_pandas(frame[["content", column]]) for frame in frames]
    labelled = [
        ds.map(lambda e: {"labels": label_map[e[column]]}, remove_columns=[column])
        for ds in raw
    ]
    return [ds.map(tokenize, batched=True) for ds in labelled]


_split_frames = (train_df, val_df, test_df)

# Hazard datasets
train_hazard, val_hazard, test_hazard = _prepare_task_datasets(_split_frames, "hazard", hazard2id)

# Severity datasets
train_severity, val_severity, test_severity = _prepare_task_datasets(_split_frames, "severity", severity2id)


Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

# Settings shared by both fine-tuning runs; only the checkpoint directory
# differs between the hazard and the severity model.
# `eval_strategy` is the current argument name; older transformers releases
# spell it `evaluation_strategy`.
_common_training_kwargs = dict(
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
)

training_args_hazard = TrainingArguments(output_dir="./hazard_model", **_common_training_kwargs)

training_args_severity = TrainingArguments(output_dir="./severity_model", **_common_training_kwargs)


In [None]:
from transformers import RobertaForSequenceClassification, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) pair.

    The predicted class is the argmax of the logits along axis 1.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    return dict(
        accuracy=accuracy_score(labels, predicted),
        f1=f1_score(labels, predicted, average="weighted"),
    )

# Both classifiers start from "roberta-base". The label maps are stored in
# the model config (as the earlier model-construction cell does) so that
# saved checkpoints report class names instead of generic LABEL_n ids.

# Hazard model
model_hazard = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(hazard2id),
    id2label=id2hazard,
    label2id=hazard2id,
)

trainer_hazard = Trainer(
    model=model_hazard,
    args=training_args_hazard,
    train_dataset=train_hazard,
    eval_dataset=val_hazard,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Severity model
model_severity = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(severity2id),
    id2label=id2severity,
    label2id=severity2id,
)

trainer_severity = Trainer(
    model=model_severity,
    args=training_args_severity,
    train_dataset=train_severity,
    eval_dataset=val_severity,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_hazard = Trainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_severity = Trainer(


In [None]:
# Fine-tune the hazard classifier first, then the severity classifier.
# NOTE(review): the templates insert the hazard name verbatim into every
# hazard post, and the dataset was upsampled with replacement before it was
# split, so perfect validation scores for the hazard model may reflect
# leakage rather than generalisation — confirm on held-out data.
trainer_hazard.train()
trainer_severity.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.000675,1.0,1.0
2,0.130100,0.000275,1.0,1.0
3,0.130100,0.00019,1.0,1.0
4,0.000300,0.000169,1.0,1.0


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.742517,0.570595,0.478139
2,0.808900,0.708381,0.589265,0.564002


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.742517,0.570595,0.478139
2,0.808900,0.708381,0.589265,0.564002
3,0.808900,0.734241,0.549592,0.52915
4,0.677600,0.68546,0.631272,0.62337


TrainOutput(global_step=1000, training_loss=0.7432427673339844, metrics={'train_runtime': 552.747, 'train_samples_per_second': 28.932, 'train_steps_per_second': 1.809, 'total_flos': 1051936888823808.0, 'train_loss': 0.7432427673339844, 'epoch': 4.0})

In [None]:
import re

# assume df_balanced is your DataFrame
# Every hazard name except the "None" placeholder gets redacted.
hazard_words = set(df_balanced['hazard'].unique()) - {"None"}


def redact_hazard(text, hazard_words=hazard_words):
    """Replace each whole-word, case-insensitive hazard name with "[HAZARD]".

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        out = text
        for w in hazard_words:
            # replace exact word case-insensitively with placeholder
            pattern = re.compile(rf"(?i)\b{re.escape(w)}\b")
            out = pattern.sub("[HAZARD]", out)
        return out
    return text


df_balanced['content_redacted'] = df_balanced['content'].apply(redact_hazard)


In [None]:
def add_severity_cue(text, severity):
    """Append a random cue phrase for `severity` to `text`.

    The text is returned unchanged when the severity is unknown or has no
    cue phrases (e.g. "None").
    """
    cues = {
        "High": ["massive", "widespread damage", "evacuations", "major destruction"],
        "Medium": ["moderate", "roads submerged", "several houses damaged"],
        "Low": ["minor", "localized", "puddles", "no major damage"],
        "None": [],
    }
    candidates = cues.get(severity)
    if not candidates:
        return text
    # The cue is always appended after the text, followed by a full stop.
    cue = random.choice(candidates)
    return f"{text} {cue}."


In [None]:
from sklearn.model_selection import train_test_split

# Split on distinct post texts so that no text appears in more than one split.

# 1. Work only with unique content
unique_contents = df_balanced['content'].unique()

# 2. Split unique contents (70% train, then the rest halved into val/test)
train_texts, temp_texts = train_test_split(unique_contents, random_state=42, test_size=0.3)
val_texts, test_texts = train_test_split(temp_texts, random_state=42, test_size=0.5)

# 3. Assign rows back based on membership
_content = df_balanced['content']
train_df = df_balanced[_content.isin(train_texts)]
val_df = df_balanced[_content.isin(val_texts)]
test_df = df_balanced[_content.isin(test_texts)]

# 4. Verify disjointness again: train/val, train/test, then val/test
_split_texts = [set(part['content']) for part in (train_df, val_df, test_df)]
for _pos, _first in enumerate(_split_texts):
    for _second in _split_texts[_pos + 1:]:
        assert _first.isdisjoint(_second)

print(len(train_df), len(val_df), len(test_df))


4019 867 826


In [None]:
from sklearn.utils import resample

# Upsample (with replacement) every severity class of the training split to
# the size of its largest class, then shuffle. Only the training rows are
# resampled.
max_count = train_df['severity'].value_counts().max()
train_balanced = [
    resample(train_df[train_df['severity'] == label], replace=True, n_samples=max_count, random_state=42)
    for label in train_df['severity'].unique()
]

_shuffled_train = pd.concat(train_balanced).sample(frac=1, random_state=42)
train_df = _shuffled_train.reset_index(drop=True)


In [None]:
from datasets import Dataset

# Wrap the three split DataFrames as Hugging Face Datasets.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)


In [None]:
def tokenize_fn(batch):
    """Tokenize `content`, truncating and padding every example to 128 tokens."""
    texts = batch["content"]
    return tokenizer(texts, max_length=128, padding="max_length", truncation=True)


train_ds, val_ds, test_ds = (
    split.map(tokenize_fn, batched=True) for split in (train_ds, val_ds, test_ds)
)


Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/867 [00:00<?, ? examples/s]

Map:   0%|          | 0/826 [00:00<?, ? examples/s]

In [None]:
# Severity name <-> id maps used for this dataset.
# NOTE(review): this numbering differs from the alphabetical `severity2id`
# built earlier in the notebook — confirm which mapping the severity model is
# supposed to use.
label2id = {name: idx for idx, name in enumerate(("Low", "Medium", "High", "None"))}
id2label = dict(zip(label2id.values(), label2id.keys()))


def encode_labels(batch):
    """Return the integer severity id of one example as its `labels` field."""
    severity_id = label2id[batch["severity"]]
    return {"labels": severity_id}


train_ds, val_ds, test_ds = (
    split.map(encode_labels) for split in (train_ds, val_ds, test_ds)
)


Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/867 [00:00<?, ? examples/s]

Map:   0%|          | 0/826 [00:00<?, ? examples/s]

In [None]:
# Keep only the columns listed here and drop every other column from each split.
_model_columns = ("input_ids", "attention_mask", "labels")

train_ds = train_ds.remove_columns([name for name in train_ds.column_names if name not in _model_columns])
val_ds = val_ds.remove_columns([name for name in val_ds.column_names if name not in _model_columns])
test_ds = test_ds.remove_columns([name for name in test_ds.column_names if name not in _model_columns])


In [None]:
# NOTE(review): `trainer_hazard` and `trainer_severity` were constructed
# earlier with train_hazard/val_hazard and train_severity/val_severity. The
# `train_ds` / `val_ds` / `test_ds` rebuilt in the preceding cells are not
# passed to either trainer, so these calls presumably train the same model
# objects again on the earlier split — confirm whether the trainers should be
# rebuilt with the new datasets first.
trainer_hazard.train()
trainer_severity.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.027087,0.996499,0.996499
2,0.008300,0.00032,1.0,1.0
3,0.008300,5.5e-05,1.0,1.0
4,0.000400,5.1e-05,1.0,1.0


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.700017,0.621937,0.572006
2,0.686100,0.681814,0.645274,0.622819
3,0.686100,0.652768,0.665111,0.665716
4,0.533500,0.629068,0.691949,0.690858


TrainOutput(global_step=1000, training_loss=0.6097927856445312, metrics={'train_runtime': 418.9429, 'train_samples_per_second': 38.172, 'train_steps_per_second': 2.387, 'total_flos': 1051936888823808.0, 'train_loss': 0.6097927856445312, 'epoch': 4.0})

In [None]:
# Class names for the two heads; a label's id is its position in the list.
hazard_labels = ["None", "Flood", "Cyclone", "Tsunami", "Low Tide", "High Tide"]
severity_labels = ["None", "Low", "Medium", "High"]

hazard_label2id = dict(zip(hazard_labels, range(len(hazard_labels))))
hazard_id2label = dict(enumerate(hazard_labels))

severity_label2id = dict(zip(severity_labels, range(len(severity_labels))))
severity_id2label = dict(enumerate(severity_labels))


In [None]:
# `nn` is not imported anywhere else in the notebook, so import it here to
# keep this cell runnable on a fresh kernel.
from torch import nn


class MultiTaskRoberta(nn.Module):
    """RoBERTa encoder with two linear classification heads: hazard and severity.

    Both heads read the encoder's hidden state at the first token position.
    """

    def __init__(self, model_name, num_hazards, num_severity):
        """Load the encoder and create one linear head per task.

        Args:
            model_name: Checkpoint name or path given to `RobertaModel.from_pretrained`.
            num_hazards: Number of output classes of the hazard head.
            num_severity: Number of output classes of the severity head.
        """
        super().__init__()
        from transformers import RobertaModel
        self.roberta = RobertaModel.from_pretrained(model_name)
        hidden_size = self.roberta.config.hidden_size

        # Two heads
        self.hazard_classifier = nn.Linear(hidden_size, num_hazards)
        self.severity_classifier = nn.Linear(hidden_size, num_severity)

    def forward(self, input_ids, attention_mask=None, labels_hazard=None, labels_severity=None):
        """Run the encoder and both heads.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels_hazard: Optional class-id targets for the hazard head.
            labels_severity: Optional class-id targets for the severity head.

        Returns:
            A dict with keys "loss", "logits_hazard" and "logits_severity".
            "loss" is the sum of the cross-entropy losses of every task whose
            labels were given, or None when no labels were given.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token

        logits_hazard = self.hazard_classifier(pooled_output)
        logits_severity = self.severity_classifier(pooled_output)

        # Sum the loss of each task that has labels, so a batch labelled for
        # only one task still yields a loss instead of silently returning None.
        loss = None
        loss_fct = nn.CrossEntropyLoss()
        if labels_hazard is not None:
            loss = loss_fct(logits_hazard, labels_hazard)
        if labels_severity is not None:
            loss_severity = loss_fct(logits_severity, labels_severity)
            loss = loss_severity if loss is None else loss + loss_severity

        # Return dict, not SequenceClassifierOutput
        return {
            "loss": loss,
            "logits_hazard": logits_hazard,
            "logits_severity": logits_severity,
        }


In [None]:
# Size each head from its label map, build the model, then move it to the GPU.
num_hazards, num_severity = len(hazard_label2id), len(severity_label2id)

model = MultiTaskRoberta("roberta-base", num_hazards, num_severity)
model = model.to("cuda")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# `torch` is not imported anywhere else in the notebook.
import torch

# Switch to evaluation mode: with dropout left active, repeated calls on the
# same text can return different logits.
# NOTE(review): no cell in the visible notebook trains `model`, so both heads
# presumably still have their random initial weights — confirm before
# trusting these predictions.
model.eval()
with torch.no_grad():
    inputs = tokenizer("Massive flood near Mumbai 😰", return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = model(**inputs)

    probs_hazard = torch.softmax(outputs["logits_hazard"], dim=-1)
    probs_severity = torch.softmax(outputs["logits_severity"], dim=-1)

    # Index of the most probable class for each head.
    hazard_pred = probs_hazard.argmax(dim=-1).item()
    severity_pred = probs_severity.argmax(dim=-1).item()

print("Hazard:", hazard_pred, "Severity:", severity_pred)


Hazard: 3 Severity: 0


In [None]:
# Label maps (adapt to your own). They must match the numbering that was used
# to size the model heads: the severity head has four outputs with "None" as
# id 0, so a three-entry map would mislabel id 0 and raise KeyError for id 3.
hazard_id2label = {0: "None", 1: "Flood", 2: "Cyclone", 3: "Tsunami", 4: "Low Tide", 5: "High Tide"}
severity_id2label = {0: "None", 1: "Low", 2: "Medium", 3: "High"}

# Evaluation mode disables dropout so the prediction is repeatable.
model.eval()
with torch.no_grad():
    inputs = tokenizer("Massive flood near Mumbai 😰", return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = model(**inputs)

    probs_hazard = torch.softmax(outputs["logits_hazard"], dim=-1)
    probs_severity = torch.softmax(outputs["logits_severity"], dim=-1)

    hazard_pred = probs_hazard.argmax(dim=-1).item()
    severity_pred = probs_severity.argmax(dim=-1).item()

    # Confidence scores: softmax probability of the predicted class
    hazard_conf = probs_hazard[0, hazard_pred].item()
    severity_conf = probs_severity[0, severity_pred].item()

print("Hazard:", hazard_id2label[hazard_pred], f"(confidence: {hazard_conf:.2f})")
print("Severity:", severity_id2label[severity_pred], f"(confidence: {severity_conf:.2f})")


Hazard: Tsunami (confidence: 0.22)
Severity: Low (confidence: 0.29)


In [None]:
def predict_disaster(text, model, tokenizer, hazard_id2label, severity_id2label, device="cuda"):
    """Classify one text and return hazard/severity names with confidences.

    Args:
        text: The text to classify.
        model: Callable returning a dict with "logits_hazard" and
            "logits_severity"; it is switched to eval mode first.
        tokenizer: Callable that turns `text` into model inputs supporting `.to(device)`.
        hazard_id2label: Maps a hazard class index to its name.
        severity_id2label: Maps a severity class index to its name.
        device: Device the tokenized inputs are moved to.

    Returns:
        A dict with keys "hazard", "hazard_confidence", "severity" and
        "severity_confidence"; confidences are softmax probabilities rounded
        to two decimals.
    """
    def _top_class(logits):
        """Return (index, softmax probability) of the most likely class in row 0."""
        probabilities = torch.softmax(logits, dim=-1)
        index = probabilities.argmax(dim=-1).item()
        return index, probabilities[0, index].item()

    model.eval()
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        encoded = encoded.to(device)
        outputs = model(**encoded)

        hazard_idx, hazard_conf = _top_class(outputs["logits_hazard"])
        severity_idx, severity_conf = _top_class(outputs["logits_severity"])

    prediction = {}
    prediction["hazard"] = hazard_id2label[hazard_idx]
    prediction["hazard_confidence"] = round(hazard_conf, 2)
    prediction["severity"] = severity_id2label[severity_idx]
    prediction["severity_confidence"] = round(severity_conf, 2)
    return prediction


In [None]:
# Run the prediction helper on one example post and show the returned dict.
result = predict_disaster(
    text="Massive flood near Mumbai 😰",
    model=model,
    tokenizer=tokenizer,
    hazard_id2label=hazard_id2label,
    severity_id2label=severity_id2label,
)
print(result)


{'hazard': 'Tsunami', 'hazard_confidence': 0.22, 'severity': 'Low', 'severity_confidence': 0.29}
