In [1]:
import pandas as pd
import folium
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
from folium.plugins import HeatMap
from collections import Counter
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import spacy
from geotext import GeoText

# Load SpaCy NLP Model
nlp = spacy.load("en_core_web_sm")

# Initialize OpenStreetMap Geocoder
geolocator = Nominatim(user_agent="geo_locator", timeout=10)

# Load dataset
try:
    df = pd.read_csv("reddit_data_location.csv")
except FileNotFoundError:
    print("❌ Error: 'reddit_data_location.csv' not found!")
    exit()

# Ensure required columns exist
if "content" not in df.columns:
    print("❌ Error: Missing 'content' column!")
    exit()

# List of invalid words that are NOT real locations
INVALID_LOCATIONS = {
    "n't", "ER", "it", "he", "she", "they", "us", "me", "here", "there",
    "somewhere", "anywhere", "everywhere", "world", "earth", "planet", "universe",
    "USA", "UK", "US", "India", "Canada", "Europe", "Asia", "phobia", "kinda",
    "lot", "many", "some", "all", "whole", "most", "person", "people", "someone", "everyone"
}

# Function to validate location using OpenStreetMap
def validate_location_with_osm(location):
    """Checks if the location is real by querying OpenStreetMap (OSM)."""
    if pd.isna(location) or not isinstance(location, str):
        return False
    
    try:
        geo = geolocator.geocode(location, timeout=10)
        return geo is not None  # Return True if OSM can find the location
    except Exception as e:
        return False

# Function to extract location from text using multiple methods
def extract_location(text):
    if pd.isna(text) or not isinstance(text, str):
        return None  

    # 1️⃣ Named Entity Recognition (NER) using SpaCy
    doc = nlp(text)
    ner_locations = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
    ner_locations = [loc for loc in ner_locations if loc.lower() not in INVALID_LOCATIONS]

    # Validate extracted locations with OpenStreetMap
    ner_locations = [loc for loc in ner_locations if validate_location_with_osm(loc)]

    if ner_locations:
        return ner_locations[0]  # Return the first valid location

    # 2️⃣ Regex-based location extraction
    location_match = re.search(r"(live in|from|located in|born in) ([A-Za-z\s]+)", text, re.IGNORECASE)
    if location_match:
        location = location_match.group(2).strip()
        if location.lower() not in INVALID_LOCATIONS and validate_location_with_osm(location):
            return location

    # 3️⃣ GeoText City Name Recognition
    places = GeoText(text).cities
    valid_places = [place for place in places if validate_location_with_osm(place)]
    
    if valid_places:
        return valid_places[0]  

    return None  

# Function to get latitude & longitude using OpenStreetMap
def get_lat_lon(location):
    """Returns latitude & longitude of a location using OpenStreetMap (Nominatim)."""
    if pd.isna(location) or not isinstance(location, str):
        return None, None  # Ensure valid input
    
    try:
        geo = geolocator.geocode(location, timeout=10)
        if geo:
            return geo.latitude, geo.longitude
    except GeocoderTimedOut:
        print(f"⚠ Geocoding timed out for '{location}'")
    except Exception as e:
        print(f"⚠ Geocoding failed for '{location}': {e}")
    
    return None, None  # Return None if lookup fails

# Extract locations for each post
df["location"] = df["content"].apply(extract_location)

# Remove rows without valid locations
df = df.dropna(subset=["location"])

# Get latitude and longitude for each unique location
unique_locations = df["location"].unique()
location_dict = {}

for loc in unique_locations:
    lat, lon = get_lat_lon(loc)
    if lat and lon:
        location_dict[loc] = (lat, lon)

# Assign latitude & longitude to the dataframe
df["latitude"] = df["location"].map(lambda x: location_dict.get(x, (None, None))[0])
df["longitude"] = df["location"].map(lambda x: location_dict.get(x, (None, None))[1])

# Remove rows where geolocation lookup failed
df = df.dropna(subset=["latitude", "longitude"])

# Count top locations
top_locations = Counter(df["location"]).most_common(5)

# Display top 5 crisis locations
print("\n📍 **Top 5 Locations with Highest Crisis Discussions:**")
for i, (location, count) in enumerate(top_locations, 1):
    print(f"{i}. {location}: {count} mentions")

# Create a heatmap of crisis locations
map_center = [df["latitude"].mean(), df["longitude"].mean()]  # Center map around data
m = folium.Map(location=map_center, zoom_start=4)

# Add heatmap layer
heat_data = list(zip(df["latitude"], df["longitude"]))
HeatMap(heat_data).add_to(m)

# Save the heatmap
heatmap_file = "crisis_heatmap.html"
m.save(heatmap_file)

# Visualization: Top 5 Locations with Highest Crisis Discussions
plt.figure(figsize=(8, 5))
sns.barplot(x=[loc[0] for loc in top_locations], y=[loc[1] for loc in top_locations], palette="Reds")
plt.title("Top 5 Locations with Highest Crisis Discussions")
plt.xlabel("Location")
plt.ylabel("Number of Mentions")
plt.xticks(rotation=45)
plt.savefig("top_crisis_locations.png")  # Save plot instead of displaying
plt.close()

print(f"\n🌍 **Heatmap saved as:** {heatmap_file}")
print(f"📊 **Top crisis locations plot saved as:** top_crisis_locations.png")



📍 **Top 5 Locations with Highest Crisis Discussions:**
1. canada: 44 mentions
2. california: 40 mentions
3. europe: 34 mentions
4. uk: 32 mentions
5. florida: 28 mentions

🌍 **Heatmap saved as:** crisis_heatmap.html
📊 **Top crisis locations plot saved as:** top_crisis_locations.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=[loc[0] for loc in top_locations], y=[loc[1] for loc in top_locations], palette="Reds")



📍 **Top 5 Locations with Highest Crisis Discussions:**
1. kinda: 91 mentions
2. canada: 44 mentions
3. california: 38 mentions
4. europe: 34 mentions
5. phobia: 32 mentions

🌍 **Heatmap saved as:** crisis_heatmap.html
📊 **Top crisis locations plot saved as:** top_crisis_locations.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=[loc[0] for loc in top_locations], y=[loc[1] for loc in top_locations], palette="Reds")
