In [2]:
# import packages
import numpy as np
import pandas as pd
import seaborn as sns
import geopandas as gpd



In [3]:
# import dataset
df = pd.read_csv('https://data.nasa.gov/resource/gh4g-9sfh.csv?$limit=50000')
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01T00:00:00.000,54.21667,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01T00:00:00.000,16.88333,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95,"(-33.16667, -64.95)"


In [4]:
# view data
df.describe


<bound method NDFrame.describe of              name     id nametype              recclass      mass   fall  \
0          Aachen      1    Valid                    L5      21.0   Fell   
1          Aarhus      2    Valid                    H6     720.0   Fell   
2            Abee      6    Valid                   EH4  107000.0   Fell   
3        Acapulco     10    Valid           Acapulcoite    1914.0   Fell   
4         Achiras    370    Valid                    L6     780.0   Fell   
...           ...    ...      ...                   ...       ...    ...   
45711  Zillah 002  31356    Valid               Eucrite     172.0  Found   
45712      Zinder  30409    Valid  Pallasite, ungrouped      46.0  Found   
45713        Zlin  30410    Valid                    H4       3.3  Found   
45714   Zubkovsky  31357    Valid                    L6    2167.0  Found   
45715  Zulu Queen  30414    Valid                  L3.7     200.0  Found   

                          year    reclat    reclong  

In [5]:
# cleaning geo data
df3 = df.loc[(df['reclat'] != 0) & (df['reclong'] != 0)]
df3

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.77500,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01T00:00:00.000,54.21667,-113.00000,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01T00:00:00.000,16.88333,-99.90000,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95000,"(-33.16667, -64.95)"
...,...,...,...,...,...,...,...,...,...,...
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990-01-01T00:00:00.000,29.03700,17.01850,"(29.037, 17.0185)"
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999-01-01T00:00:00.000,13.78333,8.96667,"(13.78333, 8.96667)"
45713,Zlin,30410,Valid,H4,3.3,Found,1939-01-01T00:00:00.000,49.25000,17.66667,"(49.25, 17.66667)"
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003-01-01T00:00:00.000,49.78917,41.50460,"(49.78917, 41.5046)"


In [6]:
# preparing geo location for matching country names
gdf = gpd.GeoDataFrame(df3, geometry = gpd.points_from_xy(df3['reclong'], df3['reclat']))
gdf.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation,geometry
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.775,6.08333,"(50.775, 6.08333)",POINT (6.08333 50.77500)
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"(56.18333, 10.23333)",POINT (10.23333 56.18333)
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01T00:00:00.000,54.21667,-113.0,"(54.21667, -113.0)",POINT (-113.00000 54.21667)
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01T00:00:00.000,16.88333,-99.9,"(16.88333, -99.9)",POINT (-99.90000 16.88333)
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95,"(-33.16667, -64.95)",POINT (-64.95000 -33.16667)


In [7]:
# preparing country data
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.rename(columns = {'name': 'country'}, inplace = True)
world.head()

Unnamed: 0,pop_est,continent,country,iso_a3,gdp_md_est,geometry
0,889953.0,Oceania,Fiji,FJI,5496,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,58005463.0,Africa,Tanzania,TZA,63177,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,603253.0,Africa,W. Sahara,ESH,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,37589262.0,North America,Canada,CAN,1736425,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,328239523.0,North America,United States of America,USA,21433226,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [8]:
# match county with long and lat columns
gdf_countries = gpd.sjoin(world, gdf, how="right", op="contains")
gdf_countries

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  gdf_countries = gpd.sjoin(world, gdf, how="right", op="contains")


Unnamed: 0,index_left,pop_est,continent,country,iso_a3,gdp_md_est,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation,geometry
0,129.0,11484055.0,Europe,Belgium,BEL,533097.0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.77500,6.08333,"(50.775, 6.08333)",POINT (6.08333 50.77500)
1,142.0,5818553.0,Europe,Denmark,DNK,350104.0,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"(56.18333, 10.23333)",POINT (10.23333 56.18333)
2,3.0,37589262.0,North America,Canada,CAN,1736425.0,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01T00:00:00.000,54.21667,-113.00000,"(54.21667, -113.0)",POINT (-113.00000 54.21667)
3,27.0,127575529.0,North America,Mexico,MEX,1268870.0,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01T00:00:00.000,16.88333,-99.90000,"(16.88333, -99.9)",POINT (-99.90000 16.88333)
4,9.0,44938712.0,South America,Argentina,ARG,445445.0,Achiras,370,Valid,L6,780.0,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95000,"(-33.16667, -64.95)",POINT (-64.95000 -33.16667)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45711,164.0,6777452.0,Africa,Libya,LBY,52091.0,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990-01-01T00:00:00.000,29.03700,17.01850,"(29.037, 17.0185)",POINT (17.01850 29.03700)
45712,55.0,23310715.0,Africa,Niger,NER,12911.0,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999-01-01T00:00:00.000,13.78333,8.96667,"(13.78333, 8.96667)",POINT (8.96667 13.78333)
45713,153.0,10669709.0,Europe,Czechia,CZE,250680.0,Zlin,30410,Valid,H4,3.3,Found,1939-01-01T00:00:00.000,49.25000,17.66667,"(49.25, 17.66667)",POINT (17.66667 49.25000)
45714,18.0,144373535.0,Europe,Russia,RUS,1699876.0,Zubkovsky,31357,Valid,L6,2167.0,Found,2003-01-01T00:00:00.000,49.78917,41.50460,"(49.78917, 41.5046)",POINT (41.50460 49.78917)


In [9]:
# cleaning
df = pd.DataFrame(gdf_countries)
df = df.drop(columns=['index_left', 'pop_est', 'gdp_md_est', 'geometry', 'continent', 'iso_a3', 'geolocation'])
# df['year'] = df['year'].astype(str).str.slice(0,4)
df = df.dropna()
# df['year'] = df['year'].apply(pd.to_numeric, errors='coerce').fillna(0.0)


df.head()

Unnamed: 0,country,name,id,nametype,recclass,mass,fall,year,reclat,reclong
0,Belgium,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.775,6.08333
1,Denmark,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333
2,Canada,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01T00:00:00.000,54.21667,-113.0
3,Mexico,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01T00:00:00.000,16.88333,-99.9
4,Argentina,Achiras,370,Valid,L6,780.0,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95


In [10]:
# matching evelation data from old data set
alt = pd.read_json("https://raw.githubusercontent.com/raychangCode/raychangCode.github.io/master/processed_4K_elevation%20(1).json")
# alt
df['altitude'] = alt['altitude']
df.head()

Unnamed: 0,country,name,id,nametype,recclass,mass,fall,year,reclat,reclong,altitude
0,Belgium,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.775,6.08333,135.754379
1,Denmark,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,25.963053
2,Canada,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01T00:00:00.000,54.21667,-113.0,631.783142
3,Mexico,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01T00:00:00.000,16.88333,-99.9,-320.418762
4,Argentina,Achiras,370,Valid,L6,780.0,Fell,1902-01-01T00:00:00.000,-33.16667,-64.95,1158.280029


In [25]:
# filter out year data for plotting
df['year'] = df['year'].str.slice(0,4)

df

Unnamed: 0,country,name,id,nametype,recclass,mass,fall,year,reclat,reclong,altitude
0,Belgium,Aachen,1,Valid,L5,21.0,Fell,1880,50.77500,6.08333,135.754379
1,Denmark,Aarhus,2,Valid,H6,720.0,Fell,1951,56.18333,10.23333,25.963053
2,Canada,Abee,6,Valid,EH4,107000.0,Fell,1952,54.21667,-113.00000,631.783142
3,Mexico,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976,16.88333,-99.90000,-320.418762
4,Argentina,Achiras,370,Valid,L6,780.0,Fell,1902,-33.16667,-64.95000,1158.280029
...,...,...,...,...,...,...,...,...,...,...,...
45711,Libya,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990,29.03700,17.01850,
45712,Niger,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999,13.78333,8.96667,
45713,Czechia,Zlin,30410,Valid,H4,3.3,Found,1939,49.25000,17.66667,
45714,Russia,Zubkovsky,31357,Valid,L6,2167.0,Found,2003,49.78917,41.50460,


In [26]:
# sample data for better performance
df = df.sample(n=4000)
df.to_json(r'C:\Users\ray\Desktop\export_dataframe_4k.json', orient='records')
