In [9]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import hvplot.pandas
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.io as pio
import plotly.graph_objects as go
from citipy import citipy
import geopy.distance
from geopy.geocoders import Nominatim #country data

# File to Load 
west_pacific_basin_to_load = Path("Resources/ibtracs.WP.list.v04r00.csv")

# Read School and Student Data File and store into Pandas DataFrames
west_pacific_basin_raw = pd.read_csv(west_pacific_basin_to_load)


Columns (1,2,8,9,14,19,20,131,132,133,161,162) have mixed types. Specify dtype option on import or set low_memory=False.



In [2]:
west_pacific_basin_raw.head()

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
0,,Year,,,,,,,degrees_north,degrees_east,...,second,kts,second,ft,nmile,nmile,nmile,nmile,kts,degrees
1,1884177N17124,1884,14.0,WP,MM,NOT_NAMED,1884-06-24 16:00:00,NR,16.5000,124.000,...,,,,,,,,,6,281
2,1884177N17124,1884,14.0,WP,MM,NOT_NAMED,1884-06-24 18:00:00,NR,16.5373,123.800,...,,,,,,,,,6,282
3,1884177N17124,1884,14.0,WP,MM,NOT_NAMED,1884-06-24 21:00:00,NR,16.5978,123.500,...,,,,,,,,,6,283
4,1884177N17124,1884,14.0,WP,MM,NOT_NAMED,1884-06-25 00:00:00,NR,16.6706,123.200,...,,,,,,,,,6,286


# Data Cleaning

In [3]:
# create new dataframe from raw data

# cut irrelevant columns (keep columns A to P)
west_pacific_basin_clean_col = west_pacific_basin_raw[['SID', 'SEASON', 'NUMBER', 'BASIN', 'NAME', 'ISO_TIME',
                                                       'NATURE', 'LAT', 'LON', 'WMO_WIND', 'WMO_PRES', 'DIST2LAND']]

new_column_names = {'SID': 'ID', 'SEASON': 'Season', 'NUMBER': 'Number', 'BASIN': 'Basin',
                    'NAME': 'Name', 'ISO_TIME': 'ISO Time', 'NATURE': 'Nature', 'LAT': 'Latitude',
                    'LON': 'Longitude', 'WMO_WIND': 'Wind Speed (knots)',
                    'WMO_PRES': 'Wind Pressure (mb)', 'DIST2LAND': 'Distance to Land (km)'}

west_pacific_basin_clean_col_names = west_pacific_basin_clean_col.rename(columns=new_column_names)
west_pacific_basin_clean_col_names.head()

# turn ISO time into datetime datatype
west_pacific_basin_clean_col_names['ISO Time'] = pd.to_datetime(west_pacific_basin_clean_col_names['ISO Time'], 
                                                                format='%Y-%m-%d %H', errors='coerce')

# create a dataset from 1922 to 2022 from 'ISO_TIME'
west_pacific_basin_final = west_pacific_basin_clean_col_names[(west_pacific_basin_clean_col_names['ISO Time'].dt.year >= 1922) &
                                                              (west_pacific_basin_clean_col_names['ISO Time'].dt.year <= 2022)]

columns_to_convert = ["Number", "Wind Speed (knots)", "Wind Pressure (mb)", "Distance to Land (km)"]
for column in columns_to_convert:
    west_pacific_basin_final[column] = pd.to_numeric(west_pacific_basin_final[column], errors='coerce').astype(float)

west_pacific_basin_final.info()

west_pacific_basin_final



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



<class 'pandas.core.frame.DataFrame'>
Int64Index: 213161 entries, 29189 to 242349
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   ID                     213161 non-null  object        
 1   Season                 213161 non-null  object        
 2   Number                 213161 non-null  float64       
 3   Basin                  213161 non-null  object        
 4   Name                   213161 non-null  object        
 5   ISO Time               213161 non-null  datetime64[ns]
 6   Nature                 213161 non-null  object        
 7   Latitude               213161 non-null  object        
 8   Longitude              213161 non-null  object        
 9   Wind Speed (knots)     28200 non-null   float64       
 10  Wind Pressure (mb)     68676 non-null   float64       
 11  Distance to Land (km)  213161 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object

Unnamed: 0,ID,Season,Number,Basin,Name,ISO Time,Nature,Latitude,Longitude,Wind Speed (knots),Wind Pressure (mb),Distance to Land (km)
29189,1922085N15112,1922,22.0,WP,NOT_NAMED,1922-03-25 22:00:00,NR,14.9,112.2,,,331.0
29190,1922085N15112,1922,22.0,WP,NOT_NAMED,1922-03-26 00:00:00,NR,15.0076,112.264,,,345.0
29191,1922085N15112,1922,22.0,WP,NOT_NAMED,1922-03-26 03:00:00,NR,15.1661,112.362,,,359.0
29192,1922085N15112,1922,22.0,WP,NOT_NAMED,1922-03-26 06:00:00,NR,15.3173,112.462,,,371.0
29193,1922085N15112,1922,22.0,WP,NOT_NAMED,1922-03-26 09:00:00,NR,15.4568,112.565,,,386.0
...,...,...,...,...,...,...,...,...,...,...,...,...
242345,2022345N17125,2022,53.0,WP,PAKHAR,2022-12-12 06:00:00,NR,20.0,130.0,,,828.0
242346,2022345N17125,2022,53.0,WP,PAKHAR,2022-12-12 09:00:00,NR,19.9725,130.292,,,858.0
242347,2022345N17125,2022,53.0,WP,PAKHAR,2022-12-12 12:00:00,NR,19.9,130.6,,,887.0
242348,2022345N17125,2022,53.0,WP,PAKHAR,2022-12-12 15:00:00,NR,19.73,130.865,,,915.0


In [4]:
# count # of 'NaT' values in ['ISO Time']
na_count = west_pacific_basin_clean_col_names['ISO Time'].isna().sum()
print("Number of 'NaT' values:", na_count)

Number of 'NaT' values: 1


In [5]:
def plot_hurricane_paths(df):
    # Group data by decades
    df['Decade'] = (df['ISO Time'].dt.year // 10) * 10
    grouped = df.groupby('Decade')

    for i, (decade, group) in enumerate(grouped):
        # Update the decade label for each iteration
        
        if i == 0:
            # Change the year range for the first iteration
            start_year = 1922
            end_year = 1931
            decade_label = f"{start_year}-{end_year}"
        
        else:
            # Update the year range for subsequent iterations
            start_year = decade + 2
            end_year = decade + 11
            decade_label = f"{start_year}-{end_year}"
        
        group = group[(group['ISO Time'].dt.year >= start_year) & (group['ISO Time'].dt.year <= end_year)]
    
        # Create hurricane path plot for the decade
        fig = go.Figure(data=go.Scattergeo(
            lat=group['Latitude'],
            lon=group['Longitude'],
            mode='markers',
            marker=dict(
                size=1,     
                # Other marker attributes here
            ),
        ))
        fig.update_layout(
            title=f'Hurricane Paths ({decade_label})',
            geo=dict(
                resolution=110,
                showland=True,
                showlakes=True,
                landcolor='rgb(204, 204, 204)',
                countrycolor='rgb(204, 204, 204)',
                lakecolor='rgb(255, 255, 255)',
                projection_type="natural earth",
                fitbounds="locations",  # Zoom the map around the data
                coastlinewidth=1,
                lataxis=dict(
                    range=[-90, 90],
                    showgrid=True,
                    dtick=10
                ),
                lonaxis=dict(
                    range=[-180, 180],
                    showgrid=True,
                    dtick=20
                ),
            )
        )
        
        # Save the figure as a PNG file
        pio.write_image(fig, f"North_Pacific/Fig_{start_year}_{end_year}.png")

        if decade >= 2010:
            break        
        
# Call the function to plot hurricane paths
plot_hurricane_paths(west_pacific_basin_final)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [118]:
# Find the top 10 longest hurricanes in 2022
# Filter hurricanes for the year 
hurricanes_2022 = west_pacific_basin_final[west_pacific_basin_final['ISO Time'].dt.year == 2022]

# Find the top 10 longest hurricanes based on maximum count of 'ID'
top_10_longest_hurricanes_data = hurricanes_2022.groupby('ID').size().nlargest(10).reset_index(name='Count')
top_10_longest_hurricanes_names = top_10_longest_hurricanes_data.merge(hurricanes_2022, on='ID').drop_duplicates(subset='ID')

# Display the top 10 longest hurricanes of 2022
print(top_10_longest_hurricanes_names[['Name', 'Count']])

# Create a new dataset for the top 10 longest hurricanes
top_10_longest_hurricanes_dataset = west_pacific_basin_final[west_pacific_basin_final['ID'].isin(top_10_longest_hurricanes_data['ID'])]

          Name  Count
0        MUIFA     75
75   HINNAMNOR     73
148       AERE     67
215    MALAKAS     65
280     NALGAE     61
341   NANMADOL     57
398       NORU     51
449      NESAT     47
496       ROKE     45
541     MERBOK     33


In [115]:
# Convert each hurricane into a line plot for each group

line_plots = []
for group_name, group_data in top_10_longest_hurricanes_dataset.groupby('ID'):
    line_plot = go.Scattergeo(
        lat=group_data['Latitude'],
        lon=group_data['Longitude'],
        mode='lines',
        name=group_data['Name'].iloc[0]  # Use the first value from the 'Name' column
    )
    line_plots.append(line_plot)

# Create the layout for the map
layout = go.Layout(
    title='Top 10 Longest Hurricanes in North Asia Pacific (2022)',
    showlegend=True,
    geo=dict(
        resolution=50,
        showland=True,
        showlakes=True,
        landcolor='rgb(204, 204, 204)',
        countrycolor='rgb(204, 204, 204)',
        lakecolor='rgb(255, 255, 255)',
        projection_type="robinson",
        coastlinewidth=1,
        fitbounds="locations",  # Zoom the map around the data
        lataxis=dict(
            range=[-90, 90],
            showgrid=True,
            dtick=10
        ),
        lonaxis=dict(
            range=[-180, 180],
            showgrid=True,
            dtick=20
        ),
    )
)

# Create the figure with line plots and layout
figure = go.Figure(data=line_plots, layout=layout)

# Display the map
figure.show()


In [116]:
# Filter hurricanes for the year 1952
hurricanes_1952 = west_pacific_basin_final[west_pacific_basin_final['ISO Time'].dt.year == 1952]

# Find the top 10 longest hurricanes based on the maximum count of 'ID'
top_10_longest_1952 = hurricanes_1952['ID'].value_counts().nlargest(10).reset_index()
top_10_longest_1952.columns = ['ID', 'Count']

# Merge with the original dataset to get hurricane names
top_10_longest_1952_names = top_10_longest_1952.merge(hurricanes_1952, on='ID')[['Name', 'Count']].drop_duplicates()

# Display the top 10 longest hurricanes of 1952
print(top_10_longest_1952_names)

# Create a new dataset for the top 10 longest hurricanes
top_10_longest_1952_data = west_pacific_basin_final[west_pacific_basin_final['ID'].isin(top_10_longest_1952['ID'])]


               Name  Count
0             KAREN    141
141            TRIX    109
250           POLLY     93
343  JEANNE:JEANNIE     93
436             IVY     85
521           AGNES     83
604          CARMEN     81
685           WILMA     81
766            BESS     79
845           GILDA     77


In [49]:
# Convert each hurricane into a line plot for each group

line_plots = []
for group_name, group_data in top_10_longest_1952_data.groupby('ID'):
    line_plot = go.Scattergeo(
        lat=group_data['Latitude'],
        lon=group_data['Longitude'],
        mode='lines',
        name=group_data['Name'].iloc[0]  # Use the first value from the 'Name' column
    )
    line_plots.append(line_plot)

# Create the layout for the map
layout = go.Layout(
    title='Top 10 Longest Hurricanes in North Asia Pacific (1952)',
    showlegend=True,
    geo=dict(
        resolution=50,
        showland=True,
        showlakes=True,
        landcolor='rgb(204, 204, 204)',
        countrycolor='rgb(204, 204, 204)',
        lakecolor='rgb(255, 255, 255)',
        projection_type="robinson",
        coastlinewidth=1,
        fitbounds="locations",  # Zoom the map around the data
        lataxis=dict(
            range=[-90, 90],
            showgrid=True,
            dtick=10
        ),
        lonaxis=dict(
            range=[-180, 180],
            showgrid=True,
            dtick=20
        ),
    )
)

# Create the figure with line plots and layout
figure = go.Figure(data=line_plots, layout=layout)

# Display the map
figure.show()


In [20]:
#Finding the top countries affected by hurricanes

#Getting latitude and longitude as lists
lat = hurricanes_2022["Latitude"]
list_lat = lat.tolist()
long = hurricanes_2022["Longitude"]
list_long = long.tolist()
lat_longs = zip(list_lat, list_long)

# Initialize geocoder
geolocator = Nominatim(user_agent="my-app")

# Create a dictionary to store the proximity counts for each country
proximity_counts = {}

# Define the radius in kilometers
radius_km = 111.12

# Set to store processed hurricanes
processed_hurricanes = set()

# Loop through each latitude and longitude pair
for lat, lon in zip(list_lat, list_long):
    location = geolocator.reverse(f"{lat},{lon}", exactly_one=True, language='en')
    if location:
        country = location.raw['address'].get('country', '')
        if country:
            # Check if the current hurricane has been processed before
            hurricane = (lat, lon)
            if hurricane not in processed_hurricanes:
                processed_hurricanes.add(hurricane)
                
                # Calculate the distance between the hurricane and the country's location
                for country_lat, country_lon in zip(list_lat, list_long):
                    distance = geopy.distance.distance(hurricane, (country_lat, country_lon)).km

                    # If the distance is within the radius, increment the proximity count for the country
                    if distance <= radius_km:
                        if country in proximity_counts:
                            proximity_counts[country] += 1
                        else:
                            proximity_counts[country] = 1

# Sort the countries based on their proximity counts in descending order
top_countries = sorted(proximity_counts, key=proximity_counts.get, reverse=True)[:10]

# Print the top 10 countries and their proximity counts
print("Top 10 countries with the highest proximity counts to hurricanes:")
for country in top_countries:
    print(f"{country}: {proximity_counts[country]}")


Top 10 countries with the highest proximity counts to hurricanes:
Philippines: 378
China: 193
Japan: 162
Vietnam: 37
South Korea: 12
Palau: 6
Laos: 4


In [34]:
# Create a choropleth map using Plotly

fig = go.Figure(data=go.Choropleth(
    locations=list(proximity_counts.keys()),
    z=list(proximity_counts.values()),
    locationmode='country names',
    colorscale='YlOrRd',
    marker_line_color='darkgray',
    marker_line_width=0.5,
))

# Set map title and colorbar title
fig.update_layout(
    title_text='Proximity Counts of Hurricanes in Asian Countries (2022)',
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type='robinson',
        fitbounds="locations",  # Zoom the map around the data
         lataxis=dict(
            range=[-90, 90],
            showgrid=True,
            dtick=10
        ),
        lonaxis=dict(
            range=[-180, 180],
            showgrid=True,
            dtick=20
        ),
    ),
    coloraxis_colorbar=dict(title='Count')
)

# Show the interactive map
fig.show()


In [53]:
#Finding the top countries affected by hurricanes

#Getting latitude and longitude as lists
lat = hurricanes_1952["Latitude"]
list_lat = lat.tolist()
long = hurricanes_1952["Longitude"]
list_long = long.tolist()
lat_longs = zip(list_lat, list_long)

# Initialize geocoder
geolocator = Nominatim(user_agent="my-app")

# Create a dictionary to store the proximity counts for each country
proximity_counts = {}

# Define the radius in kilometers
radius_km = 111.12

# Set to store processed hurricanes
processed_hurricanes = set()

# Loop through each latitude and longitude pair
for lat, lon in zip(list_lat, list_long):
    location = geolocator.reverse(f"{lat},{lon}", exactly_one=True, language='en')
    if location:
        country = location.raw['address'].get('country', '')
        if country:
            # Check if the current hurricane has been processed before
            hurricane = (lat, lon)
            if hurricane not in processed_hurricanes:
                processed_hurricanes.add(hurricane)
                
                # Calculate the distance between the hurricane and the country's location
                for country_lat, country_lon in zip(list_lat, list_long):
                    distance = geopy.distance.distance(hurricane, (country_lat, country_lon)).km

                    # If the distance is within the radius, increment the proximity count for the country
                    if distance <= radius_km:
                        if country in proximity_counts:
                            proximity_counts[country] += 1
                        else:
                            proximity_counts[country] = 1

# Sort the countries based on their proximity counts in descending order
top_countries = sorted(proximity_counts, key=proximity_counts.get, reverse=True)[:10]

# Print the top 10 countries and their proximity counts
print("Top 10 countries with the highest proximity counts to hurricanes:")
for country in top_countries:
    print(f"{country}: {proximity_counts[country]}")


Top 10 countries with the highest proximity counts to hurricanes:
China: 1563
Philippines: 810
Vietnam: 368
Taiwan: 191
Federated States of Micronesia: 164
Japan: 151
Cambodia: 130
Laos: 77
Thailand: 71
South Korea: 49


In [114]:
# Define the country to exclude (for some reason Russia is included even though it has the lowest count)
country_to_exclude = 'Russia'

# Convert the proximity_counts dictionary to a DataFrame
proximity_counts_df = pd.DataFrame.from_dict(proximity_counts, orient='index', columns=['Count']).head(13)

# Sort the DataFrame based on the proximity counts in descending order
proximity_counts_df = proximity_counts_df[proximity_counts_df.index != country_to_exclude]

fig = go.Figure(data=go.Choropleth(
    locations=proximity_counts_df.index.tolist(),
    z=proximity_counts_df['Count'].tolist(),
    locationmode='country names',
    colorscale='YlOrRd', 
    marker_line_color='darkgray',
    marker_line_width=0.5,
))

# Set map title and colorbar title
fig.update_layout(
    title_text='Proximity Counts of Hurricanes in Pan Pacific Countries (1952)',
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type='robinson',
        fitbounds="locations",  # Zoom the map around the data
        lataxis=dict(
            range=[-90, 90],
            showgrid=True,
            dtick=10
        ),
        lonaxis=dict(
            range=[-180, 180],
            showgrid=True,
            dtick=20
        ),
    ),
    coloraxis_colorbar=dict(title='Count')
)
    
# Show the interactive map
fig.show()