# Social Media Disaster Detection using NLP

**Internship-ready notebook**

This notebook contains a complete pipeline (data loading, preprocessing, feature extraction, model training, and evaluation) for detecting disaster-related social media posts. Save your dataset as `disaster_tweets.csv` in the same folder as the notebook, or update `DATA_PATH` in the data-loading cell. The CSV needs a `text` column holding the post and a `target` column holding the label (1 = disaster-related, 0 = not).

In [None]:
# Install required packages (uncomment if needed)
# !pip install pandas scikit-learn nltk imbalanced-learn matplotlib seaborn

# Imports
import os
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import resample

import pickle

# For text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (first run)
# Newer NLTK releases make `nltk.word_tokenize` load 'punkt_tab' instead of
# 'punkt'; without it tokenisation fails with a LookupError. Requesting both
# keeps the notebook working on old and new NLTK versions (an id the installed
# downloader does not know is reported in its output, not raised).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

print('Setup complete')

In [None]:
# Data loading
# Expects a CSV named 'disaster_tweets.csv' next to the notebook; edit DATA_PATH to point elsewhere.
DATA_PATH = 'disaster_tweets.csv'

if not os.path.exists(DATA_PATH):
    # No CSV on disk: build a tiny labelled sample so the remaining cells have data.
    # Each pair is (post text, label) with 1 = disaster-related, 0 = not disaster.
    sample_posts = [
        ('Massive earthquake shakes the city, buildings collapsed, people trapped', 1),
        ('Lovely day at the park with family! #weekend', 0),
        ('Flooding reported in downtown area, residents advised to evacuate', 1),
        ('New cafe opened near me, great coffee and vibes ☕️', 0),
        ('Wildfire spreading fast due to strong winds, emergency declared', 1),
    ]
    df = pd.DataFrame(sample_posts, columns=['text', 'target'])
    print('No dataset found — using a small example dataset')
else:
    df = pd.read_csv(DATA_PATH)
    print(f'Loaded {len(df)} rows from {DATA_PATH}')

df.head()

## Quick EDA
Check class distribution and example posts.

In [None]:
# Class distribution
# Print how many posts carry each label; if there is no `target` column, explain how to fix that.
label_column_present = 'target' in df.columns
if not label_column_present:
    print('Column `target` not found. If your dataset has labels under a different column name, rename it to `target` or update this cell.')
else:
    print(df['target'].value_counts())

## Preprocessing functions

In [None]:
# Preprocessing utilities
# Both objects are created once here and reused by `preprocess_text` on every call.
stop_words = {*stopwords.words('english')}  # a set, for fast membership tests
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a raw social media post to lowercase ASCII words and digits.

    Steps, in order: decode HTML entities, lowercase, drop URLs and
    @mentions, drop the ``#`` sign (the hashtag word itself is kept), replace
    every remaining character outside ``a-z``, ``0-9`` and whitespace (emoji,
    punctuation, accented letters) with a space, then collapse whitespace.

    Args:
        text: The post. Non-string values are converted with ``str()``;
            ``None`` and float NaN (missing values) are treated as empty.

    Returns:
        The cleaned string, possibly empty.
    """
    import html  # local import keeps this function self-contained

    # Missing values would otherwise stringify to the bogus words 'none'/'nan'.
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ''

    # Decode entities first so '&amp;' does not survive as the token 'amp'.
    text = html.unescape(str(text)).lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)  # URLs
    text = re.sub(r'@\w+', ' ', text)              # @mentions
    text = re.sub(r'#', ' ', text)                 # remove hash symbol but keep the word
    text = re.sub(r'[^a-z0-9\s]', ' ', text)       # emoji, punctuation, non-ASCII letters
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_text(text):
    """Turn a raw post into a space-joined string of lemmatised content words.

    The post is cleaned with ``clean_text`` and tokenised with NLTK. Tokens
    that are stop words or only one character long are dropped, and each
    remaining token is lemmatised.
    """
    kept = []
    for token in nltk.word_tokenize(clean_text(text)):
        if len(token) <= 1 or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Apply preprocessing
if 'text' not in df.columns:
    raise KeyError('Dataset must contain a `text` column with the social media content.')

# Fill missing posts with '' before casting: `astype(str)` alone turns NaN into
# the literal word 'nan', which would then be learned as a real token.
df['clean_text'] = df['text'].fillna('').astype(str).apply(preprocess_text)

# Show the preview explicitly: a bare expression nested inside an `if` block is
# never rendered by the notebook.
display(df[['text', 'clean_text']].head())

## Handle class imbalance (optional)

In [None]:
# Optionally rebalance the classes by upsampling the minority rows (simple approach).
# Caution: rows duplicated here would land on both sides of any later
# train/test split of `df_bal`, so prefer rebalancing only a training partition.
df_bal = None
if 'target' in df.columns:
    class_counts = df['target'].value_counts()
    display(class_counts)
    # Imbalanced means: the rarest class has under 60% as many rows as the most common one.
    if class_counts.min() / class_counts.max() < 0.6:
        top_label = class_counts.idxmax()
        majority_rows = df[df['target'] == top_label]
        minority_rows = df[df['target'] != top_label]
        # Sample the non-majority rows with replacement until they match the majority count.
        minority_upsampled = resample(
            minority_rows,
            replace=True,
            n_samples=len(majority_rows),
            random_state=42,
        )
        combined = pd.concat([majority_rows, minority_upsampled])
        # Shuffle so the duplicated rows are not clustered at the end.
        df_bal = combined.sample(frac=1, random_state=42).reset_index(drop=True)
        print('Performed upsampling. New distribution:')
        display(df_bal['target'].value_counts())

# No labels, or classes already balanced enough: work on an untouched copy.
if df_bal is None:
    df_bal = df.copy()

## Train / Test split and model pipeline

In [None]:
# Train-test split
# Split the ORIGINAL rows (`df`), not the pre-upsampled `df_bal`: upsampling
# duplicates minority posts, and splitting afterwards puts copies of the same
# post in both train and test, which inflates every test metric.
TEST_SIZE = 0.2
X = df['clean_text']
y = df['target']

# Stratifying needs at least 2 rows per class and at least one row per class on
# each side of the split; otherwise train_test_split raises ValueError (this
# happens with the 5-row example dataset). Fall back to a plain random split.
n_test = int(np.ceil(TEST_SIZE * len(y)))
n_classes = y.nunique()
can_stratify = (
    y.value_counts().min() >= 2
    and n_test >= n_classes
    and len(y) - n_test >= n_classes
)
if not can_stratify:
    print('Dataset too small to stratify; using a plain random split')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Rebalance the training partition only, so the test set keeps the real class mix.
train_counts = y_train.value_counts()
if train_counts.min() / train_counts.max() < 0.6:
    train_df = pd.DataFrame({'clean_text': X_train, 'target': y_train})
    majority_label = train_counts.idxmax()
    majority = train_df[train_df['target'] == majority_label]
    minority = train_df[train_df['target'] != majority_label]
    minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
    train_df = pd.concat([majority, minority_upsampled]).sample(frac=1, random_state=42)
    X_train, y_train = train_df['clean_text'], train_df['target']
    print('Upsampled the minority class in the training split:')
    print(y_train.value_counts())

# Define a simple pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),  # unigrams + bigrams
    ('clf', LogisticRegression(max_iter=1000)),
])

pipeline.fit(X_train, y_train)
print('Training complete')

## Evaluation

In [None]:
# Predictions and evaluation
# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('Accuracy:', test_accuracy)
print('\nClassification Report:\n')
print(report_text)
print('\nConfusion Matrix:\n', cm)

## Try alternative model: Random Forest (optional)

In [None]:
# TF-IDF features (up to 10,000 unigrams/bigrams) feeding a 200-tree random forest.
rf_steps = [
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
]
rf_pipeline = Pipeline(rf_steps)
rf_pipeline.fit(X_train, y_train)

# Evaluate on the same held-out split the other cells use.
y_pred_rf = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print('RF Accuracy:', rf_accuracy)
print(classification_report(y_test, y_pred_rf))

## Save trained model

In [None]:
# Persist the fitted pipeline with pickle (edit `model_path` to change the output file).
model_path = 'disaster_detector_pipeline.pkl'
with open(model_path, 'wb') as out_file:
    # Serialise to bytes first, then write them in one call.
    out_file.write(pickle.dumps(pipeline))
print(f'Model saved to {model_path}')

## Example: Use the saved model for inference

In [None]:
# Load and predict on new samples
# Read from `model_path` (set in the save cell) so that changing the filename
# there cannot leave this cell loading a stale or missing pickle.
# SECURITY: unpickling can execute arbitrary code -- only load files you created yourself.
with open(model_path, 'rb') as f:
    model = pickle.load(f)

examples = [
    'Huge landslide reported near the highway, many cars buried under debris',
    'Enjoying the concert tonight! Amazing performance.'
]
# New text must go through the same preprocessing as the training data.
examples_clean = [preprocess_text(t) for t in examples]
preds = model.predict(examples_clean)
for txt, p in zip(examples, preds):
    print('\nText:', txt)
    print('Predicted label (1=disaster,0=not):', p)

## Optional: Collecting live tweets (Twitter API)

Below is a **commented-out** template showing how to collect recent tweets with Tweepy. To run it, install `tweepy`, set up your API credentials, and uncomment the lines. It is left commented out because it requires private keys and network access.

In [None]:
# Example (commented) - requires `tweepy` and Twitter API credentials
# !pip install tweepy
# import tweepy
#
# API_KEY = 'YOUR_API_KEY'
# API_SECRET = 'YOUR_API_SECRET'
# BEARER_TOKEN = 'YOUR_BEARER_TOKEN'
#
# client = tweepy.Client(bearer_token=BEARER_TOKEN)
# query = 'earthquake OR flood OR fire -is:retweet lang:en'
# tweets = client.search_recent_tweets(query=query, max_results=100)
# for t in tweets.data:
#     print(t.text)

## Next steps / Improvements

- Use transformer embeddings (BERT, RoBERTa) for better performance.
- Fine-tune a pretrained transformer on a labeled disaster dataset.
- Add location extraction and entity recognition to find affected areas.
- Build a dashboard to visualize detected events (time series, heatmaps).
- Add multilingual support and misinformation filtering.

---

Good luck with your internship project!