In [50]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil import parser

<div dir="rtl" align="right">

  خواندن داده از دیتاست     

</div>



In [51]:

# Load the raw earthquake records from the CSV file next to the notebook.
df = pd.read_csv(
    "japan_messy_earthquakes.csv",
)


<div dir="rtl" align="right">

تعداد ردیف‌ها و ستون‌ها

</div>

In [52]:
# Report how many records and fields the raw table has.
rows = len(df.index)
columns = len(df.columns)
print(f"ROWS: {rows} _ Columns: {columns}")

ROWS: 36 _ Columns: 8


<div dir="rtl" align="right">

بررسی نوع داده‌ها:

</div>

In [53]:
# Show the dtype pandas inferred for every column of the raw table.
df.dtypes


time          object
latitude      object
longitude    float64
depth         object
mag           object
place         object
status        object
notes         object
dtype: object

<div dir="rtl" align="right">

پردازش و مرتب‌سازی داده‌ها

</div>

In [54]:
# Remove the "notes" and "status" columns, then drop exact duplicate rows
# and renumber the index from zero.
df = (
    df.drop(columns=["notes", "status"])
      .drop_duplicates()
      .reset_index(drop=True)
)


In [55]:



# Split each raw "time" string into a calendar date and a millisecond-precision
# clock time. Values dateutil cannot parse get the placeholder "unv" in both
# columns.
dates = []
times = []
# Normalise to stripped strings and repair a dangling ".Z" suffix before parsing.
df["time"] = df["time"].astype(str).str.strip().str.replace(".Z", "Z", regex=False)
for ts in df["time"]:
    try:
        dt = parser.parse(ts)
    except (ValueError, OverflowError):
        # dateutil reports bad input as ParserError (a ValueError subclass) or
        # OverflowError. Anything else is a genuine bug and should surface
        # instead of being silently turned into "unv".
        date_str = "unv"
        time_str = "unv"
    else:
        # NOTE(review): any UTC offset carried by `dt` is dropped here rather
        # than converted - confirm all timestamps share one timezone.
        date_str = dt.date()
        time_str = dt.strftime("%H:%M:%S.%f")[:-3]  # microseconds -> milliseconds

    dates.append(date_str)
    times.append(time_str)

df.insert(0, "Date", dates)
df.insert(1, "Time", times)

df = df.drop(columns="time")

In [56]:
# Recombine the separate Date and Time columns into one timestamp column.
# Text that is not a valid timestamp is coerced to NaT.
date_text = df['Date'].astype(str)
time_text = df['Time'].astype(str)
combined_timestamps = pd.to_datetime(date_text + ' ' + time_text, errors='coerce')
df.insert(0, 'datetime', combined_timestamps)

# Full month name of each event, used later for grouping.
df['Month'] = df['datetime'].dt.strftime('%B')

# The intermediate text columns are no longer needed.
df = df.drop(columns=["Date", "Time"])



In [59]:


# Spelled-out digits mapped to their numeral. No word is a prefix of another,
# so a run such as "eightfive" can be decoded left to right without ambiguity.
_DIGIT_WORDS = {
    "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
}


def _digits_from_words(token):
    """Decode a run of digit words and/or numerals into a digit string.

    Returns None when the token is empty or contains anything else.
    """
    digits = []
    pos = 0
    while pos < len(token):
        if token[pos] in "0123456789":
            digits.append(token[pos])
            pos += 1
            continue
        for word, digit in _DIGIT_WORDS.items():
            if token.startswith(word, pos):
                digits.append(digit)
                pos += len(word)
                break
        else:
            return None
    return "".join(digits) or None


def floater(value):
    """Convert a messy numeric value to float.

    Accepts plain numbers ("4.8", 5), spelled-out digits ("five"), and
    spelled-out decimals using the word "point" ("five point one",
    "four point eight five", "5 point two"). Case and spaces are ignored.

    Returns:
        The parsed float, or None when the value cannot be interpreted.
    """
    text = str(value).lower().strip().replace("point", ".").replace(" ", "")

    try:
        return float(text)
    except ValueError:
        # Not a plain numeral; fall through to the spelled-out forms.
        pass

    whole, dot, fraction = text.partition(".")
    whole_digits = _digits_from_words(whole)
    if whole_digits is None:
        return None
    if not dot:
        return float(whole_digits)

    fraction_digits = _digits_from_words(fraction)
    if fraction_digits is None:
        return None
    return float(f"{whole_digits}.{fraction_digits}")
# Parse every raw magnitude entry with floater, element by element.
df['mag'] = df['mag'].map(floater)


In [60]:
# First number in the text. A leading sign only counts when it is not glued to
# a preceding word or number, so "10-20" and "depth-35km" are not read as negative.
_DEPTH_NUMBER = re.compile(r"(?:(?<![\w.])[-+])?(?:\d+(?:\.\d*)?|\.\d+)")
# Comma used as a thousands separator, e.g. "35,000".
_THOUSANDS_SEP = re.compile(r"(?<=\d),(?=\d{3}(?!\d))")
MAX_DEPTH_KM = 700
MILES_TO_KM = 1.60934


def clean_depth(val):
    """Normalise a raw depth entry to kilometres.

    The first number found in the text is taken as the magnitude of the depth.
    The unit is detected from the surrounding text: kilometres (also the
    default when no unit is given), metres, or miles.

    Returns:
        Depth in km rounded to 2 decimals, or NaN when the value is missing,
        contains no number, is negative, or exceeds MAX_DEPTH_KM after
        conversion.
    """
    if pd.isna(val):
        return np.nan

    text = _THOUSANDS_SEP.sub("", str(val).strip().lower())

    match = _DEPTH_NUMBER.search(text)
    if match is None:
        return np.nan
    num = float(match.group())

    # Check "km"/"kilomet..." before "meter" because "kilometer" contains "meter".
    if 'km' in text or 'kilomet' in text:
        depth_km = num
    elif 'meter' in text or 'metre' in text:
        depth_km = num / 1000
    elif 'mile' in text:
        depth_km = num * MILES_TO_KM
    else:
        depth_km = num

    # Validate AFTER conversion so that e.g. "35000 meters" (35 km) is kept.
    if not 0 <= depth_km <= MAX_DEPTH_KM:
        return np.nan

    return round(depth_km, 2)
# Normalise every raw depth entry with clean_depth, element by element.
df['depth'] = df['depth'].map(clean_depth)


In [61]:

cols = ['latitude', 'longitude', 'mag', 'depth']

# Text values that stand for "no data".
_MISSING_TOKENS = frozenset({'nan', 'unknown', '', 'none'})


def invalid_count(row, columns=tuple(cols)):
    """Count how many of the numeric fields in a row are unusable.

    A field is unusable when it is missing, is one of the placeholder texts in
    _MISSING_TOKENS, cannot be parsed as a float, is negative, or is not finite.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
        columns: Names of the fields to check. The default is captured when the
            function is defined, so rebinding the global `cols` later in the
            notebook does not change what is checked.

    Returns:
        Number of unusable fields among `columns`.
    """
    count = 0
    for col in columns:
        val = row[col]

        if pd.isna(val):
            count += 1
            continue

        text = str(val).strip().lower()
        if text in _MISSING_TOKENS:
            count += 1
            continue

        try:
            num = float(text)
        except ValueError:
            count += 1
            continue

        # "inf" parses as a float but is not a usable measurement.
        if num < 0 or not np.isfinite(num):
            count += 1

    return count

# Keep only the rows with fewer than two unusable numeric fields.
invalid_per_row = df.apply(invalid_count, axis=1)
df = df.loc[invalid_per_row < 2].reset_index(drop=True)


In [62]:

def fill_missing(df, col, kind='mean'):
    """Fill the missing values of one column with its mean or median.

    Args:
        df: Frame to modify; the column is replaced on this same object.
        col: Name of the column to fill.
        kind: 'mean' uses the column mean; any other value uses the median.

    Returns:
        The same frame, after filling. The fill value is also printed.
    """
    column = df[col]
    fill_value = column.mean() if kind == 'mean' else column.median()

    df[col] = column.fillna(fill_value)
    print(f"{col} {kind} = {fill_value:.2f}")

    return df

# Latitude was read as text. Convert it to numbers and turn anything that is
# not numeric into NaN so it is imputed below instead of raising here
# (the row filter keeps rows that have a single unusable field).
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')

# Coordinates are filled with the mean, depth and magnitude with the median.
for column_name, strategy in [
    ('latitude', 'mean'),
    ('longitude', 'mean'),
    ('depth', 'median'),
    ('mag', 'median'),
]:
    df = fill_missing(df, column_name, strategy)




latitude mean = 35.45
longitude mean = 136.40
depth median = 30.00
mag median = 4.80


In [63]:
def categorize_mag(mag, weak_below=4, strong_above=6):
    """Label a magnitude as "Weak", "Moderate" or "Strong".

    Args:
        mag: Magnitude value.
        weak_below: Magnitudes strictly below this are "Weak".
        strong_above: Magnitudes strictly above this are "Strong"; everything
            from weak_below up to and including this value is "Moderate".

    Returns:
        The category label, or "Unknown" when the magnitude is missing.
        Without that check a NaN fails both comparisons and would be
        labelled "Strong".
    """
    if pd.isna(mag):
        return "Unknown"
    if mag < weak_below:
        return "Weak"
    if mag <= strong_above:
        return "Moderate"
    return "Strong"

# Label every event by magnitude class, then display the resulting table.
df["category"] = df["mag"].map(categorize_mag)
df

Unnamed: 0,datetime,latitude,longitude,depth,mag,place,Month,category
0,2025-09-15 12:45:30.123,38.322,142.369,35.0,5.1,"off east coast of Honshu, Japan",September,Moderate
1,2025-09-17 14:10:05.000,43.0618,141.3545,10.2,4.8,"Hokkaido, Japan region",September,Moderate
2,2025-09-18 23:11:55.000,33.5904,130.4017,12.0,4.9,"Fukuoka, Japan",September,Moderate
3,2025-09-19 05:00:00.000,35.6895,139.6917,30.0,5.5,"Tokyo, Japan",September,Moderate
4,2025-09-20 14:30:10.789,37.765,140.467,40.23,4.6,Fukushima Prefecture,September,Moderate
5,2025-09-21 02:15:45.987,34.6937,135.5023,45.1,5.2,osaka,September,Moderate
6,2025-09-22 11:05:21.000,33.8562,132.7653,22.0,4.9,"Shikoku, Japan",September,Moderate
7,2025-09-23 18:40:00.000,39.702,141.152,50.5,4.8,Sendai region,September,Moderate
8,2025-09-24 09:09:09.090,35.4437,139.638,30.0,5.0,Yokohama,September,Moderate
9,2025-09-26 01:25:33.000,26.2124,127.6809,15.0,4.7,"Okinawa, Ryukyu Islands",September,Moderate


In [64]:

# Average magnitude and number of events for every (month, category) pair.
mag_by_month_category = df.groupby(["Month", "category"])["mag"]
summary = mag_by_month_category.agg(
    avg_magnitude="mean",
    earthquake_count="count",
).reset_index()

print(summary)


       Month  category  avg_magnitude  earthquake_count
0    October  Moderate       4.727778                18
1  September  Moderate       4.950000                14


In [65]:
# Filler words removed from place names, matched as whole words in any case.
_REGION_NOISE = re.compile(r"\b(?:prefecture|region|central)\b", flags=re.IGNORECASE)


def clean_region(area_name):
    """Reduce a free-text place description to a tidy region name.

    Removes the filler words "prefecture", "region" and "central", drops a
    trailing comma-separated part that mentions Japan, collapses repeated
    whitespace, and title-cases the result.

    Args:
        area_name: Raw place text.

    Returns:
        The cleaned region name, or "Unknown" when the input is missing,
        not a string, or blank.
    """
    if not isinstance(area_name, str) or not area_name.strip():
        return "Unknown"

    cleaned = _REGION_NOISE.sub("", area_name)

    if "," in cleaned:
        parts = cleaned.split(",")
        if "japan" in parts[-1].lower():
            cleaned = ",".join(parts[:-1])

    # Removing a word can leave doubled spaces, which would split one region
    # into several groups; normalise all whitespace to single spaces.
    cleaned = " ".join(cleaned.split())

    return cleaned.title()

# Derive a tidy region name from each raw place description.
df["region"] = df["place"].map(clean_region)


In [66]:
# Number of events per region, most active regions first.
quake_count = (
    df.groupby("region")
      .size()
      .reset_index(name="Earthquake Count")
      .sort_values(by="Earthquake Count", ascending=False)
      .reset_index(drop=True)
)

print(quake_count)


                      region  Earthquake Count
0                      Tokyo                 4
1                   Yokohama                 2
2                   Hokkaido                 2
3                     Sendai                 2
4                     Nagoya                 2
5                      Osaka                 2
6    Okinawa, Ryukyu Islands                 2
7                       Oita                 1
8                  Yamaguchi                 1
9                  Takamatsu                 1
10                   Shikoku                 1
11              Sea Of Japan                 1
12                   Fukuoka                 1
13  Off East Coast Of Honshu                 1
14                 Fukushima                 1
15               Near Aomori                 1
16                  Nagasaki                 1
17                     Kyoto                 1
18                    Kansai                 1
19                  Kanazawa                 1
20           

In [67]:
# Mean magnitude and mean depth per region, ordered by mean magnitude (highest first).
mean_stats = (
    df.groupby("region")[["mag", "depth"]]
      .mean()
      .reset_index()
      .sort_values(by="mag", ascending=False)
      .reset_index(drop=True)
      .rename(columns={"region": "Region", "mag": " Magnitude mean", "depth": "Depth mean"})
)

print(mean_stats)



                      Region   Magnitude mean  Depth mean
0                      Tokyo             5.40       33.25
1                  Yamaguchi             5.30       33.00
2                      Osaka             5.15       37.55
3   Off East Coast Of Honshu             5.10       35.00
4                   Yokohama             5.00       30.00
5                Near Aomori             5.00       60.10
6                  Hiroshima             5.00       80.00
7                    Shikoku             4.90       22.00
8                    Fukuoka             4.90       12.00
9                   Nagasaki             4.90       25.30
10                    Nagoya             4.80       35.20
11                  Hokkaido             4.80       10.20
12   Okinawa, Ryukyu Islands             4.70       15.00
13                 Kagoshima             4.70       30.00
14                 Fukushima             4.60       40.23
15                    Kansai             4.60       40.00
16            

In [42]:
# Largest magnitude and largest depth per region, ordered by peak magnitude (highest first).
max_stats = (
    df.groupby("region")[["mag", "depth"]]
      .max()
      .reset_index()
      .sort_values(by="mag", ascending=False)
      .reset_index(drop=True)
      .rename(columns={"region": "Region", "mag": " Magnitude max", "depth": "Depth max"})
)
print(max_stats)


                      Region   Magnitude max  Depth max
0                      Tokyo             5.8      40.00
1                  Yamaguchi             5.3      33.00
2                      Osaka             5.2      45.10
3   Off East Coast Of Honshu             5.1      35.00
4                   Yokohama             5.0      30.00
5                Near Aomori             5.0      60.10
6                  Hiroshima             5.0      80.00
7                    Shikoku             4.9      22.00
8                    Fukuoka             4.9      12.00
9                   Nagasaki             4.9      25.30
10                    Nagoya             4.8      35.20
11                    Sendai             4.8      50.50
12                  Hokkaido             4.8      10.20
13   Okinawa, Ryukyu Islands             4.7      15.00
14                 Kagoshima             4.7      30.00
15                 Fukushima             4.6      40.23
16                    Kansai             4.6    

In [68]:

# Title-case every column label and spell out the magnitude column's name.
# rename returns a new frame, so no in-place mutation is needed.
df = df.rename(columns=str.title).rename(columns={'Mag': 'Magnitude'})

# Put the timestamp columns first; the rest keep their existing order.
cols = ['Datetime', 'Month'] + [col for col in df.columns if col not in ('Datetime', 'Month')]

# .copy() makes the reordered frame independent of the one it was selected
# from, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[cols].copy()


In [69]:
# Reference point for the distance calculation, in decimal degrees.
tokyo_lat = 35.6764
tokyo_long = 139.6500

In [70]:
EARTH_RADIUS_KM = 6371


def _haversine_km(lat_deg, lon_deg, ref_lat_deg, ref_lon_deg):
    """Great-circle distance in km from (lat, lon) to a reference point.

    All four arguments are in decimal degrees; lat/lon may be scalars or
    array-likes. Uses the haversine formula with EARTH_RADIUS_KM.
    """
    lat, lon = np.radians(lat_deg), np.radians(lon_deg)
    ref_lat, ref_lon = np.radians(ref_lat_deg), np.radians(ref_lon_deg)

    half_chord_sq = (
        np.sin((lat - ref_lat) / 2) ** 2
        + np.cos(lat) * np.cos(ref_lat) * np.sin((lon - ref_lon) / 2) ** 2
    )
    # Rounding error can push the value a hair above 1, which would make arcsin return NaN.
    central_angle = 2 * np.arcsin(np.sqrt(np.minimum(half_chord_sq, 1.0)))
    return central_angle * EARTH_RADIUS_KM


# tokyo_lat / tokyo_long are left in degrees (not overwritten with radians),
# so re-running this cell gives the same distances every time.
dist = _haversine_km(df["Latitude"], df["Longitude"], tokyo_lat, tokyo_long).round(2)

# insert() raises if the column already exists; drop it first so the cell can be re-run.
if "dist_to_Tokyo_km" in df.columns:
    df = df.drop(columns="dist_to_Tokyo_km")
df.insert(4, "dist_to_Tokyo_km", value=dist)

### Adding a new column: Data Source

In [71]:
# Tag every record with its origin. assign() builds a new frame instead of
# writing into the existing one, which avoids SettingWithCopyWarning.
df = df.assign(data_source="DATASET")
print(df)

                  Datetime      Month   Latitude  Longitude  dist_to_Tokyo_km  \
0  2025-09-15 12:45:30.123  September  38.322000   142.3690            380.54   
1  2025-09-17 14:10:05.000  September  43.061800   141.3545            834.13   
2  2025-09-18 23:11:55.000  September  33.590400   130.4017            876.96   
3  2025-09-19 05:00:00.000  September  35.689500   139.6917              4.04   
4  2025-09-20 14:30:10.789  September  37.765000   140.4670            243.39   
5  2025-09-21 02:15:45.987  September  34.693700   135.5023            392.42   
6  2025-09-22 11:05:21.000  September  33.856200   132.7653            660.47   
7  2025-09-23 18:40:00.000  September  39.702000   141.1520            466.71   
8  2025-09-24 09:09:09.090  September  35.443700   139.6380             25.90   
9  2025-09-26 01:25:33.000  September  26.212400   127.6809           1550.15   
10 2025-09-27 15:30:00.000  September  35.689500   139.6917              4.04   
11 2025-09-28 16:55:41.812  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["data_source"] = "DATASET"


In [47]:
# Write the cleaned table to disk without the index column.
# The utf-8-sig encoding prefixes the file with a byte-order mark.
df.to_csv(
    'japan_clean_dataset.csv',
    index=False,
    encoding='utf-8-sig',
)

In [48]:

# Descriptive statistics for magnitude. np.std is called with its default
# ddof=0 (population standard deviation); the variance is derived from it.
mag_avg = np.mean(df["Magnitude"])
mag_std_dev = np.std(df["Magnitude"])
mag_var = mag_std_dev ** 2


stats_df = pd.DataFrame({
    "Statistic": ["Average", "Variance", "Standard Deviation"],
    "Magnitude": [round(mag_avg, 2), round(mag_var, 2), round(mag_std_dev, 2)]
})

# Take the reporting period from the data instead of hard-coding the dates,
# so the heading stays correct when the input file changes.
period_start = df["Datetime"].min()
period_end = df["Datetime"].max()

print(f"Earthquake magnitude statistics from {period_start:%Y-%m-%d} to {period_end:%Y-%m-%d}:")
print(stats_df)

Earthquake magnitude statistics from 2025-09-15 to 2025-10-19:
            Statistic  Magnitude
0             Average       4.82
1            Variance       0.15
2  Standard Deviation       0.38


In [49]:
# Quartiles of the magnitude distribution, computed in a single call.
mag_p25, mag_p50, mag_p75 = np.percentile(df["Magnitude"], [25, 50, 75])
percentile_df = pd.DataFrame({
    "Percentile": ["25%", "50%", "75%"],
    "Magnitude ≤": [round(mag_p25, 2), round(mag_p50, 2), round(mag_p75, 2)]
})

# Take the reporting period from the data instead of hard-coding the dates.
period_start = df["Datetime"].min()
period_end = df["Datetime"].max()

print(f"Earthquake magnitude percentiles from {period_start:%Y_%m_%d} to {period_end:%Y_%m_%d}:")
print(percentile_df)

Earthquake magnitude percentiles from 2025_09_15 to 2025_10_19:
  Percentile  Magnitude ≤
0        25%          4.6
1        50%          4.8
2        75%          5.0
