**Disclaimer:** Scraping data without proper permission is often considered illegal and unethical. This project and its content are intended **solely for educational purposes**, to demonstrate technical concepts. The data is scraped from <https://seismonepal.gov.np/earthquakes>.

In [32]:
#scraping data

from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Listing endpoint to scrape; individual pages are requested as "<base_url>?page=<n>"
base_url = "https://seismonepal.gov.np/earthquakes/index"

# Number of listing pages to request (pages 1..max_pages, inclusive).
# NOTE(review): 63 presumably matched the site's page count when this was written — verify before re-running.
max_pages = 63  # Update as needed

# Function to scrape data from a single page
def scrape_page(page_url, timeout=30):
    """Fetch one listing page and return its earthquake rows as raw text.

    Args:
        page_url: URL of a single page of the earthquake listing.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        A list of dicts with the keys 'Date', 'Time', 'Latitude',
        'Longitude', 'Magnitude' and 'Epicenter'. Every value is the
        whitespace-stripped text of the corresponding table cell; no parsing
        or type conversion happens here.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        ValueError: If the page does not contain the expected results table.
    """
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them as data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the table
    table = soup.find('table', {'class': 'table table-striped table-bordered'})
    if table is None:
        raise ValueError(f"No earthquake table found at {page_url}")

    # The markup may omit an explicit <tbody>; in that case read the rows
    # straight from the table (header rows have no <td> and are skipped below).
    body = table.find('tbody')
    if body is None:
        body = table

    data = []
    for row in body.find_all('tr'):
        cols = [col.text.strip() for col in row.find_all('td')]
        # Skip header rows and placeholder rows (e.g. a single "no records"
        # cell) that do not carry all seven expected cells.
        if len(cols) < 7:
            continue
        # cols[0] is not used — presumably a serial-number column; verify
        # against the live page.
        data.append({
            'Date': cols[1],
            'Time': cols[2],
            'Latitude': cols[3],
            'Longitude': cols[4],
            'Magnitude': cols[5],
            'Epicenter': cols[6]
        })
    return data

# Main function to scrape multiple pages
def scrape_all_pages(base_url, max_pages):
    """Scrape pages 1..max_pages of the listing and return all rows in order.

    Pages are fetched one after another via ``scrape_page`` using the URL
    pattern ``<base_url>?page=<n>``; this step can take some time because it
    makes one request per page.

    Args:
        base_url: Listing URL without the ``page`` query parameter.
        max_pages: Number of the last page to request (inclusive).

    Returns:
        A single flat list with the row dicts of every page, in page order.
    """
    page_urls = (
        f"{base_url}?page={page_number}"
        for page_number in range(1, max_pages + 1)
    )
    # Flatten the per-page row lists while preserving page order.
    return [record for page_url in page_urls for record in scrape_page(page_url)]

# Run the full scrape; `datas` holds the raw row dicts that the cleaning step iterates over.
datas = scrape_all_pages(base_url, max_pages)
# print(datas)  # uncomment to inspect the raw scraped records
print('data scraped updated')

data scraped updated


In [33]:
# --- Cleaning Functions ---

def clean_date(date_str):
    """Extract the Gregorian date that follows the 'A.D.:' label.

    Returns the stripped text located after the first 'A.D.:' label (and
    before any further occurrence of it), or None when the label is absent.
    """
    marker = 'A.D.:'
    if marker not in date_str:
        return None
    segments = date_str.split(marker)
    return segments[1].strip()

def clean_time(time_str):
    """Extract the local time that sits between the 'Local:' and 'UTC:' labels.

    Returns the stripped text after the first 'Local:' label, cut off at the
    next 'UTC:' label if there is one, or None when 'Local:' is absent.
    """
    if 'Local:' not in time_str:
        return None
    after_local = time_str.split('Local:')[1]
    local_part = after_local.split('UTC:')[0]
    return local_part.strip()

def clean_numeric(value):
    """Convert a scraped cell to float, treating ':' as a decimal point.

    The value is stringified first, so numbers and strings are both accepted.
    Returns None when the resulting text cannot be parsed as a float.
    """
    try:
        text = str(value)
        normalized = text.replace(':', '.')
        return float(normalized)
    except ValueError:
        return None

def clean_epicenter(epicenter_str):
    """Return the epicenter name without asterisks or surrounding whitespace."""
    # Drop every '*' first so that whitespace left next to it is trimmed too.
    without_asterisks = epicenter_str.replace('*', '')
    return without_asterisks.strip()

# --- Process Data ---

# Run every raw scraped record through the cleaning helpers. Fields that
# cannot be parsed come back as None and become missing values in the frame.
cleaned_data = [
    {
        'Date': clean_date(entry['Date']),
        'Time': clean_time(entry['Time']),
        'Latitude': clean_numeric(entry['Latitude']),
        'Longitude': clean_numeric(entry['Longitude']),
        'Magnitude': clean_numeric(entry['Magnitude']),
        'Epicenter': clean_epicenter(entry['Epicenter'])
    }
    for entry in datas
]

# --- Convert to DataFrame ---
# Naming the columns explicitly keeps the frame well-formed even when the
# scrape returned no rows; otherwise dropna(subset=...) below would raise a
# KeyError on the missing columns.
df = pd.DataFrame(
    cleaned_data,
    columns=['Date', 'Time', 'Latitude', 'Longitude', 'Magnitude', 'Epicenter'],
)

# Drop rows with missing coordinates
df = df.dropna(subset=['Latitude', 'Longitude'], how='any')

# Convert 'Date' to datetime.date (unparseable dates become NaT)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.date

# Combine 'Date' and 'Time' into full datetime (NaT when either part is invalid)
df['Datetime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'], errors='coerce')

# Save to CSV
output_path = Path('data/earthquake_data_nepal.csv')
# to_csv does not create missing directories, so make sure 'data/' exists
# (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("✅ Data saved to earthquake_data_nepal.csv")


✅ Data saved to earthquake_data_nepal.csv


In [5]:
# Display DataFrame
# print() emits the plain-text repr; for long frames pandas shows only the first and last rows.
# NOTE(review): ending the cell with a bare `df` (or `df.head()`) would give the rich notebook table instead.
# NOTE(review): the recorded output of this cell looks older than the current cleaning code
# (it still shows an asterisk in 'Epicenter') — re-run via Restart Kernel -> Run All to refresh it.
print(df)

            Date   Time  Latitude  Longitude  Magnitude      Epicenter  \
0     2025-04-04  20:10     28.96      82.12        5.5      Jajarkot*   
1     2025-04-04  20:07     28.95      82.12        5.2       Jajarkot   
2     2025-04-03  17:04     30.02      80.84        4.0       Darchula   
3     2025-03-26  19:44     29.69      81.82        4.5          Humla   
4     2025-03-26  18:27     28.70      86.74        5.5  Tingri, China   
...          ...    ...       ...        ...        ...            ...   
1232  1995-01-29  02:37     26.85      86.11        4.5        Dhanusa   
1233  1995-01-27  23:05     29.08      81.73        4.3        Kalikot   
1234  1995-01-22  11:58     27.90      87.80        4.0      Taplejung   
1235  1995-01-19  12:18     28.35      83.44        4.3         Myagdi   
1236  1995-01-05  17:23     29.82      80.95        4.4       Darchula   

                 Datetime  
0     2025-04-04 20:10:00  
1     2025-04-04 20:07:00  
2     2025-04-03 17:04:00  