<a href="https://colab.research.google.com/github/spavithra978/Mini-project-1/blob/main/Mini_project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# USGS EARTHQUAKE DATA EXTRACTION
# Period: Dec 2020 through Nov 2025 (60 monthly windows; end date 2025-12-01 is exclusive)
# Schema: Full 26-feature project specification
# =========================================================

import requests
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from google.colab import files
import time

# Query endpoint that every monthly request in this notebook is sent to.
BASE_URL = "https://earthquake.usgs.gov/fdsnws/event/1/query"

In [None]:
# ---------------------------------------------------------
# 1. DATE RANGE (PROJECT REQUIREMENT)
# ---------------------------------------------------------
# First day of the first month to download, and the date at which the
# download loop stops (the loop runs while `current < end_date`).
start_date, end_date = datetime(2020, 12, 1), datetime(2025, 12, 1)

# Accumulator for the flattened event rows, plus the loop cursor that is
# advanced one month at a time by the download cell.
records = []
current = start_date

print("Starting data fetch...")


Starting data fetch...


In [None]:
# ---------------------------------------------------------
# 2. MONTH-BY-MONTH LOOP (API SAFE)
# ---------------------------------------------------------
# Walks from `current` up to (not including) `end_date` one calendar month at
# a time and appends one flat dict per event to `records`.
#
#   * A month is added to `records` only after all of its pages were fetched
#     and parsed, so a failure can no longer leave a half-loaded month behind.
#   * A failing month is retried; months that never succeed are collected in
#     `failed_months` and reported at the end instead of vanishing silently.
#   * Requests are paged with `offset`: a month holding more than PAGE_LIMIT
#     events would otherwise be cut off at the limit without any warning.
PAGE_LIMIT = 20000      # events requested per call (same value as before)
MAX_ATTEMPTS = 3        # tries per month before giving up on it
failed_months = []      # "YYYY-MM-DD" start dates of months that never loaded


def _event_to_record(event):
    """Flatten one GeoJSON feature into a flat dict using the 26-column schema.

    Missing or null `properties` / `geometry` / `coordinates` yield None
    values instead of raising, so one odd event cannot abort a whole month.
    """
    prop = event.get("properties") or {}
    coords = (event.get("geometry") or {}).get("coordinates") or []
    # Coordinates are read as [longitude, latitude, depth]; pad short lists.
    longitude, latitude, depth = (list(coords) + [None, None, None])[:3]

    return {
        # 1–3 Identifiers & Time
        "id": event.get("id"),
        "time": pd.to_datetime(prop.get("time"), unit="ms", utc=True),
        "updated": pd.to_datetime(prop.get("updated"), unit="ms", utc=True),

        # 4–6 Location
        "latitude": latitude,
        "longitude": longitude,
        "depth_km": depth,

        # 7–8 Magnitude
        "mag": prop.get("mag"),
        "magType": prop.get("magType"),

        # 9–10 Place & Status
        "place": prop.get("place"),
        "status": prop.get("status"),

        # 11–13 Tsunami / Significance / Network
        "tsunami": prop.get("tsunami"),
        "sig": prop.get("sig"),
        "net": prop.get("net"),

        # 14–17 Quality Metrics
        "nst": prop.get("nst"),
        "dmin": prop.get("dmin"),
        "rms": prop.get("rms"),
        "gap": prop.get("gap"),

        # 18–22 Error Metrics & Source Information
        # NOTE(review): these five keys do not seem to be part of the GeoJSON
        # `properties` object (they look like CSV-format columns), so they are
        # presumably None for every event — confirm against the API docs.
        "magError": prop.get("magError"),
        "depthError": prop.get("depthError"),
        "magNst": prop.get("magNst"),
        "locationSource": prop.get("locationSource"),
        "magSource": prop.get("magSource"),

        # 23–25 References
        "types": prop.get("types"),
        "ids": prop.get("ids"),
        "sources": prop.get("sources"),

        # 26 Event Type
        "type": prop.get("type")
    }


def _fetch_month(starttime, endtime):
    """Return every event feature between two dates, following pagination.

    Raises whatever `requests` raises for network, HTTP-status or JSON errors;
    the caller decides whether to retry.
    """
    month_events = []
    offset = 1  # NOTE(review): the API's offset is documented as 1-based — confirm
    while True:
        params = {
            "format": "geojson",
            "starttime": starttime,
            "endtime": endtime,
            "minmagnitude": 0,
            "limit": PAGE_LIMIT,
            "offset": offset
        }
        response = requests.get(BASE_URL, params=params, timeout=30)
        response.raise_for_status()
        page = response.json().get("features") or []
        month_events.extend(page)

        # A short page means there is nothing left to download for this month.
        if len(page) < PAGE_LIMIT:
            return month_events

        offset += PAGE_LIMIT
        time.sleep(1)  # Prevent API throttling between pages


while current < end_date:
    next_month = current + relativedelta(months=1)
    month_start = current.strftime("%Y-%m-%d")
    month_end = next_month.strftime("%Y-%m-%d")

    print(f"Fetching data: {month_start} → {month_end}")

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            events = _fetch_month(month_start, month_end)
            month_records = [_event_to_record(event) for event in events]
        except (requests.RequestException, ValueError) as e:
            # Network/HTTP/JSON/timestamp problems are retried; anything else
            # is a programming error and is allowed to surface.
            print(f"⚠️ Failed for {month_start} (attempt {attempt}/{MAX_ATTEMPTS}): {e}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(2 * attempt)  # brief back-off before the retry
        else:
            print(f"  → Records fetched: {len(events)}")
            records.extend(month_records)  # commit the month only when complete
            break
    else:
        failed_months.append(month_start)

    time.sleep(1)  # Prevent API throttling (also after a failed month)

    current = next_month

if failed_months:
    print(f"⚠️ Months missing from the dataset: {failed_months}")


Fetching data: 2020-12-01 → 2021-01-01
  → Records fetched: 15202
Fetching data: 2021-01-01 → 2021-02-01
  → Records fetched: 14996
Fetching data: 2021-02-01 → 2021-03-01
  → Records fetched: 11538
Fetching data: 2021-03-01 → 2021-04-01
  → Records fetched: 13247
Fetching data: 2021-04-01 → 2021-05-01
  → Records fetched: 12396
Fetching data: 2021-05-01 → 2021-06-01
  → Records fetched: 11388
Fetching data: 2021-06-01 → 2021-07-01
  → Records fetched: 13705
Fetching data: 2021-07-01 → 2021-08-01
  → Records fetched: 17285
Fetching data: 2021-08-01 → 2021-09-01
  → Records fetched: 16284
Fetching data: 2021-09-01 → 2021-10-01
  → Records fetched: 13272
Fetching data: 2021-10-01 → 2021-11-01
  → Records fetched: 12141
Fetching data: 2021-11-01 → 2021-12-01
  → Records fetched: 11299
Fetching data: 2021-12-01 → 2022-01-01
  → Records fetched: 11165
Fetching data: 2022-01-01 → 2022-02-01
  → Records fetched: 12471
Fetching data: 2022-02-01 → 2022-03-01
  → Records fetched: 11489
Fetching d

In [None]:
# ---------------------------------------------------------
# 3. CREATE DATAFRAME
# ---------------------------------------------------------
df = pd.DataFrame(records)

# Without any rows there is no "time" column, and the feature code below
# would die with an opaque KeyError — fail with an actionable message instead.
if df.empty:
    raise RuntimeError(
        "No earthquake records were fetched; re-run the download cell "
        "before building the DataFrame."
    )

# NOTE(review): consecutive monthly windows share their boundary date, so an
# event stamped exactly on a month boundary may have been returned twice —
# confirm whether the API treats `endtime` as inclusive. Keeping the first
# occurrence of every non-null id is harmless either way.
repeated_id = df["id"].notna() & df["id"].duplicated()
df = df[~repeated_id].reset_index(drop=True)

# ---------------------------------------------------------
# 4. DERIVED TIME FEATURES (PROJECT REQUIREMENT)
# ---------------------------------------------------------
# Calendar parts are taken from the event timestamp, which the download cell
# parsed with utc=True.
event_time = df["time"].dt
df = df.assign(
    year=event_time.year,
    month=event_time.month,
    day=event_time.day,
    hour=event_time.hour,
    day_of_week=event_time.day_name(),
)


In [None]:
# ---------------------------------------------------------
# 5. DATA CLEANING
# ---------------------------------------------------------
# Measurement columns that must be numeric; values that cannot be parsed
# are coerced to NaN rather than raising.
numeric_cols = [
    "mag", "depth_km", "nst", "dmin", "rms",
    "gap", "magError", "depthError", "magNst", "sig"
]

df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)

# A missing tsunami flag is filled with 0 so the column can be cast to int.
df["tsunami"] = df["tsunami"].fillna(0).astype(int)

# Missing labels in these text columns get the explicit placeholder "unknown".
for label_col in ("status", "magType", "net", "type"):
    df[label_col] = df[label_col].fillna("unknown")

In [None]:
# ---------------------------------------------------------
# 6. SAVE CSV
# ---------------------------------------------------------
# Write the cleaned frame to disk without the index column.
csv_name = "earthquakes_5years_full_schema.csv"
df.to_csv(csv_name, index=False)

# Short summary of what was written, framed by a banner line.
row_count, column_count = df.shape
banner = "========================================"
summary_lines = (
    banner,
    "CSV CREATED SUCCESSFULLY",
    f"File: {csv_name}",
    f"Rows: {row_count}",
    f"Columns: {column_count}",
    banner,
)
for summary_line in summary_lines:
    print(summary_line)

# Hand the file to the browser through the Colab download helper.
files.download(csv_name)