In [None]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil import parser

<div dir="rtl" align="right">

  خواندن داده از دیتاست     

</div>



In [None]:

# Read the earthquake CSV into the working frame used by every later cell.
RAW_CSV_PATH = "japan_messy_earthquakes.csv"
df = pd.read_csv(RAW_CSV_PATH)


<div dir="rtl" align="right">

تعداد ردیف‌ها و ستون‌ها

</div>

In [None]:
# Report how many rows and columns the frame currently has.
rows = df.shape[0]
columns = df.shape[1]
print(f"ROWS: {rows} _ Columns: {columns}")

<div dir="rtl" align="right">

بررسی نوع داده‌ها:

</div>

In [None]:
# Display the dtype of every column (last expression of the cell, so it is rendered).
df.dtypes


<div dir="rtl" align="right">

پردازش و مرتب‌سازی داده‌ها

</div>

In [None]:
# Discard the `notes` and `status` columns, remove exact duplicate rows,
# and renumber the index from zero.
df = (
    df.drop(columns=["notes", "status"])
      .drop_duplicates()
      .reset_index(drop=True)
)


In [None]:



# Split the raw `time` column into a `Date` column and a `Time` column.
# Timestamps that cannot be parsed are kept as the placeholder string "unv"
# in both columns, so the row itself is not lost here.
dates = []
times = []

# Work on stripped strings and replace ".Z" with "Z" before parsing.
df["time"] = df["time"].astype(str).str.strip().str.replace(".Z", "Z", regex=False)
for ts in df["time"]:
    try:
        dt = parser.parse(ts)
        date_str = dt.date()
        # %f is six digits of microseconds; dropping the last three leaves milliseconds.
        time_str = dt.strftime("%H:%M:%S.%f")[:-3]
    except (ValueError, OverflowError):
        # dateutil reports an unrecognised string with ParserError (a ValueError
        # subclass) and an out-of-range value with OverflowError. Catching only
        # these keeps KeyboardInterrupt and unrelated programming errors visible,
        # which the previous bare `except:` silently swallowed.
        date_str = "unv"
        time_str = "unv"

    dates.append(date_str)
    times.append(time_str)

# NOTE(review): any UTC offset carried by a parsed timestamp is dropped by
# .date()/.strftime(); confirm that all inputs share one time zone.
df.insert(0, "Date", dates)
df.insert(1, "Time", times)

df = df.drop(columns="time")

In [None]:
# Join the Date and Time text back into one timestamp column placed first in
# the frame; text that cannot be parsed becomes NaT because of errors='coerce'.
combined_text = df['Date'].astype(str) + ' ' + df['Time'].astype(str)
df.insert(0, 'datetime', pd.to_datetime(combined_text, errors='coerce'))

# Month name of each event, as produced by strftime('%B').
df['Month'] = df['datetime'].dt.strftime('%B')

# The two helper columns are removed once `datetime` exists.
df = df.drop(columns=["Date", "Time"])



In [None]:


def floater(value):
    """Convert a raw magnitude entry to a float.

    The value is turned into text, lower-cased, stripped, the word "point" is
    replaced by "." and all spaces are removed. The result is accepted when it is:

    * anything ``float()`` understands, e.g. ``"5.4"`` or ``4``;
    * a single spelled-out digit, e.g. ``"six"``;
    * ``<whole>.<fraction>`` where each side is a spelled-out digit or a run of
      decimal digits, e.g. ``"five point four"``, ``"5 point two"``,
      ``"seven point 25"``.

    Args:
        value: Raw cell value of any type.

    Returns:
        The parsed float, or ``None`` when the value cannot be interpreted.
    """
    words = {
        "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
        "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9
    }
    text = str(value).lower().strip().replace("point", ".").replace(" ", "")

    try:
        return float(text)
    except ValueError:
        # Not a plain numeral; fall through to the spelled-out forms below.
        pass

    def as_digits(part):
        """Return the digit text for a spelled-out digit or a decimal run, else None."""
        if part in words:
            return str(words[part])
        if part.isdecimal():
            return part
        return None

    if "." in text:
        left, right = text.split(".", 1)
        whole, fraction = as_digits(left), as_digits(right)
        if whole is not None and fraction is not None:
            return float(whole + "." + fraction)

    if text in words:
        return float(words[text])

    return None
# Parse every magnitude entry with floater() and store the result back in `mag`.
parsed_magnitudes = df['mag'].apply(floater)
df['mag'] = parsed_magnitudes


In [None]:
# First number in the text: optional sign, digits with an optional fraction
# (or a bare fraction), and an optional exponent. The text is lower-cased
# before matching, so only "e" is needed for the exponent marker.
_DEPTH_NUMBER = re.compile(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?")


def clean_depth(val, max_depth_km=700):
    """Convert a raw depth entry to kilometres.

    The first number found in the text is taken as the magnitude of the depth
    and the unit is detected from keywords in the same text:

    * "km" / "kilometer" / "kilometre" -> kilometres
    * "meter" / "metre"                -> metres, divided by 1000
    * "mile"                           -> miles, multiplied by 1.60934
    * no keyword                       -> treated as kilometres

    The plausibility check is applied to the converted value in kilometres.
    The sign is kept so that negative entries are actually rejected, and only
    the first number is read, so stray dots or extra numbers in the text
    ("approx. 35.5 km.") neither crash the conversion nor get merged into the value.

    Args:
        val: Raw cell value of any type.
        max_depth_km: Largest depth, in kilometres, still considered valid.

    Returns:
        The depth in kilometres rounded to two decimals, or ``np.nan`` when the
        value is missing, contains no number, is negative, or exceeds
        ``max_depth_km``.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).strip().lower()

    match = _DEPTH_NUMBER.search(text)
    if match is None:
        return np.nan
    number = float(match.group())

    # Kilometre spellings are tested first because "kilometer"/"kilometre"
    # also contain "meter"/"metre".
    if 'km' in text or 'kilomet' in text:
        depth_km = number
    elif 'meter' in text or 'metre' in text:
        depth_km = number / 1000
    elif 'mile' in text:
        depth_km = number * 1.60934
    else:
        depth_km = number

    # Validate after conversion so the limit always refers to kilometres.
    if depth_km < 0 or depth_km > max_depth_km:
        return np.nan

    return round(depth_km, 2)
# Normalise every depth entry with clean_depth() and store the result back in `depth`.
cleaned_depths = df['depth'].apply(clean_depth)
df['depth'] = cleaned_depths


In [None]:

cols = ['latitude', 'longitude', 'mag', 'depth']


def invalid_count(row, columns=tuple(cols)):
    """Count how many of the checked fields of a row are unusable.

    A field counts as invalid when it is missing, cannot be converted with
    ``float()`` (this covers placeholders such as "unknown", "none" or an empty
    string), is not finite (NaN or infinity written as text), or is negative.

    The column list is bound as a default argument when the function is defined,
    so rebinding the global ``cols`` later in the notebook does not change which
    fields are checked.

    Args:
        row: Mapping-like object indexable by column name (e.g. a DataFrame row).
        columns: Names of the fields to check.

    Returns:
        The number of invalid fields, between 0 and ``len(columns)``.
    """
    invalid = 0
    for col in columns:
        val = row[col]

        if pd.isna(val):
            invalid += 1
            continue

        try:
            num = float(str(val).strip())
        except ValueError:
            invalid += 1
            continue

        # float() accepts "nan" and "inf"; neither is a usable measurement.
        if not np.isfinite(num) or num < 0:
            invalid += 1

    return invalid

# Keep only the rows with fewer than two invalid fields, then renumber the index.
invalid_per_row = df.apply(invalid_count, axis=1)
df = df[invalid_per_row < 2].reset_index(drop=True)


In [None]:

def fill_missing(df, col, kind='mean'):
    """Return a copy of ``df`` with the missing values of one column filled.

    The fill value is the column's mean or median and is also printed with two
    decimals. The input frame is left untouched, so the cell calling this
    function can be re-run safely.

    Args:
        df: Source frame.
        col: Name of the column to fill.
        kind: ``'mean'`` or ``'median'``.

    Returns:
        A new DataFrame in which ``col`` has no missing values (unless the whole
        column is missing, in which case the fill value itself is NaN).

    Raises:
        ValueError: If ``kind`` is neither ``'mean'`` nor ``'median'``.
    """
    if kind == 'mean':
        val = df[col].mean()
    elif kind == 'median':
        val = df[col].median()
    else:
        # Previously any other value silently meant "median", hiding typos.
        raise ValueError(f"kind must be 'mean' or 'median', got {kind!r}")

    filled = df.copy()
    filled[col] = filled[col].fillna(val)
    print(f"{col} {kind} = {val:.2f}")

    return filled

# Coerce the coordinates to floats before averaging. errors='coerce' turns
# text that is not a number into NaN, which the fill below then replaces;
# a plain astype(float) would raise on such text. Rows with a single bad
# field are kept by the filtering step, so such text can still be present.
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce').astype(float)
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce').astype(float)

# Fill the remaining gaps: mean for the coordinates, median for depth and magnitude.
df = fill_missing(df, 'latitude', 'mean')
df = fill_missing(df, 'longitude', 'mean')
df = fill_missing(df, 'depth', 'median')
df = fill_missing(df, 'mag', 'median')




In [None]:
def categorize_mag(mag, weak_below=4, strong_above=6):
    """Classify a magnitude as "Weak", "Moderate" or "Strong".

    Args:
        mag: Magnitude value.
        weak_below: Magnitudes strictly below this are "Weak".
        strong_above: Magnitudes strictly above this are "Strong"; everything
            from ``weak_below`` up to and including this value is "Moderate".

    Returns:
        The category label, or ``np.nan`` when ``mag`` is missing. Without this
        guard a NaN fails both comparisons and would be labelled "Strong".
    """
    if pd.isna(mag):
        return np.nan
    if mag < weak_below:
        return "Weak"
    if mag <= strong_above:
        return "Moderate"
    return "Strong"

# Label every event with its magnitude class, then display the frame.
magnitude_classes = df["mag"].apply(categorize_mag)
df["category"] = magnitude_classes
df

In [None]:

# Mean magnitude and number of events for every (Month, category) pair.
by_month_and_category = df.groupby(["Month", "category"])
summary = by_month_and_category.agg(
    avg_magnitude=pd.NamedAgg(column="mag", aggfunc="mean"),
    earthquake_count=pd.NamedAgg(column="mag", aggfunc="count"),
).reset_index()

print(summary)


In [None]:
def clean_region(area_name):
    """Reduce a free-text place description to a tidy region label.

    Steps:
      1. Missing values are returned unchanged instead of raising.
      2. The whole words "prefecture", "region" and "central" are removed in
         any letter case (the result is title-cased anyway, so "TOKYO
         PREFECTURE" and "Tokyo Prefecture" end up with the same label).
      3. If the text has comma-separated parts and the last one contains
         "japan", that last part is dropped.
      4. Whitespace is collapsed and the label is title-cased.

    Args:
        area_name: Raw place text, or a missing value.

    Returns:
        The cleaned, title-cased label, or ``area_name`` itself when it is missing.
    """
    if pd.isna(area_name):
        return area_name

    without_fillers = re.sub(
        r"\b(?:prefecture|region|central)\b", " ", str(area_name), flags=re.IGNORECASE
    )

    parts = [part.strip() for part in without_fillers.split(",")]
    if len(parts) > 1 and "japan" in parts[-1].lower():
        parts = parts[:-1]

    # Skip parts left empty by the removals, then squeeze repeated spaces.
    label = ", ".join(part for part in parts if part)
    return " ".join(label.split()).title()

# Derive a region label for every row from its `place` text.
region_labels = df["place"].apply(clean_region)
df["region"] = region_labels


In [None]:
# Number of events per region, largest count first.
quake_count = (
    df.groupby("region")
      .size()
      .reset_index(name="Earthquake Count")
      .sort_values(by="Earthquake Count", ascending=False)
      .reset_index(drop=True)
)

print(quake_count)


In [None]:
# Mean magnitude and mean depth per region, ordered by mean magnitude (highest first).
mean_stats = (
    df.groupby("region")[["mag", "depth"]]
      .mean()
      .reset_index()
      .sort_values(by="mag", ascending=False)
      .reset_index(drop=True)
)

# Presentation headers for the printed table.
mean_stats.columns = ["Region", " Magnitude mean", "Depth mean"]

print(mean_stats)



In [None]:
# Largest magnitude and largest depth per region, ordered by the magnitude maximum.
max_stats = (
    df.groupby("region")[["mag", "depth"]]
      .max()
      .reset_index()
      .sort_values(by="mag", ascending=False)
      .reset_index(drop=True)
)

# Presentation headers for the printed table.
max_stats.columns = ["Region", " Magnitude max", "Depth max"]
print(max_stats)


In [None]:

# Title-case every column name, then give the magnitude column its full name.
df.columns = df.columns.map(str.title)
df = df.rename(columns={'Mag': 'Magnitude'})

# Put the timestamp and month first and keep the other columns in their current order.
leading = ['Datetime', 'Month']
cols = leading + [name for name in df.columns if name not in leading]
df = df[cols]


In [None]:
# Reference point for the distance calculation: Tokyo's latitude and longitude
# (converted with np.radians in the next cell).
tokyo_lat = 35.6764
tokyo_long = 139.6500

In [None]:
# Haversine distance from every event to Tokyo, in kilometres.
EARTH_RADIUS_KM = 6371

earthquake_lat = np.radians(df["Latitude"])
earthquake_long = np.radians(df["Longitude"])

# Store the radian values under new names. Overwriting tokyo_lat / tokyo_long
# would convert them a second time whenever this cell is executed again and
# silently produce wrong distances.
tokyo_lat_rad = np.radians(tokyo_lat)
tokyo_long_rad = np.radians(tokyo_long)

dlat = earthquake_lat - tokyo_lat_rad
dlng = earthquake_long - tokyo_long_rad

# Haversine term first, then the central angle between the two points.
haversine = np.sin(dlat / 2)**2 + np.cos(earthquake_lat) * np.cos(tokyo_lat_rad) * np.sin(dlng / 2)**2
haversine = 2 * np.arcsin(np.sqrt(haversine))

dist = (haversine * EARTH_RADIUS_KM).round(2)

# Remove a column left by an earlier run so that insert() does not raise.
if "dist_to_Tokyo_km" in df.columns:
    df = df.drop(columns="dist_to_Tokyo_km")
df.insert(4, "dist_to_Tokyo_km", value=dist)

In [None]:
# Write the cleaned frame to disk without the index, encoded as UTF-8 with a BOM.
CLEAN_CSV_PATH = 'japan_clean_dataset.csv'
df.to_csv(CLEAN_CSV_PATH, index=False, encoding='utf-8-sig')

In [None]:

# Mean, standard deviation (np.std) and variance (its square) of the magnitudes.
magnitudes = df["Magnitude"]
mag_avg = np.mean(magnitudes)
mag_std_dev = np.std(magnitudes)
mag_var = mag_std_dev ** 2

# One row per statistic, rounded to two decimals for display.
stats_rows = [
    ("Average", round(mag_avg, 2)),
    ("Variance", round(mag_var, 2)),
    ("Standard Deviation", round(mag_std_dev, 2)),
]
stats_df = pd.DataFrame(stats_rows, columns=["Statistic", "Magnitude"])

# NOTE(review): the date range in the heading is hard-coded, not read from the data.
print("Earthquake magnitude statistics from 2025-09-15 to 2025-10-19:")
print(stats_df)

In [None]:
# 25th, 50th and 75th percentiles of the magnitudes, computed in one call.
mag_p25, mag_p50, mag_p75 = np.percentile(df["Magnitude"], [25, 50, 75])

# One row per percentile, rounded to two decimals for display.
percentile_rows = [
    ("25%", round(mag_p25, 2)),
    ("50%", round(mag_p50, 2)),
    ("75%", round(mag_p75, 2)),
]
percentile_df = pd.DataFrame(percentile_rows, columns=["Percentile", "Magnitude ≤"])

# NOTE(review): the date range is hard-coded and written with underscores here,
# while the statistics cell writes the same range with hyphens.
print("Earthquake magnitude percentiles from 2025_09_15 to 2025_10_19:")
print(percentile_df)