In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from matplotlib import pyplot as plt
import plotly.express as px

In [2]:
df = pd.read_csv('eda_missing_data_dataset1.csv')

In [3]:
df.head()

Unnamed: 0,date,center_point_geom,longitude,latitude,number_of_strikes
0,2018-08-01,POINT(-81.6 22.6),-81.6,22.6,48
1,2018-08-01,POINT(-81.1 22.6),-81.1,22.6,32
2,2018-08-01,POINT(-80.9 22.6),-80.9,22.6,118
3,2018-08-01,POINT(-80.8 22.6),-80.8,22.6,69
4,2018-08-01,POINT(-98.4 22.8),-98.4,22.8,44


In [4]:
df.shape

(717530, 5)

In [5]:
df_zip = pd.read_csv('eda_missing_data_dataset2.csv')
df_zip.head()

Unnamed: 0,date,zip_code,city,state,state_code,center_point_geom,number_of_strikes
0,2018-08-08,3281,Weare,New Hampshire,NH,POINT(-71.7 43.1),1
1,2018-08-14,6488,Heritage Village CDP,Connecticut,CT,POINT(-73.2 41.5),3
2,2018-08-16,97759,"Sisters city, Black Butte Ranch CDP",Oregon,OR,POINT(-121.4 44.3),3
3,2018-08-18,6776,New Milford CDP,Connecticut,CT,POINT(-73.4 41.6),48
4,2018-08-08,1077,Southwick,Massachusetts,MA,POINT(-72.8 42),2


In [6]:
df_zip.shape

(323700, 7)

In [7]:
df_joined = df.merge(df_zip, how='left', on = ['date', 'center_point_geom'])

In [8]:
df_joined.head()

Unnamed: 0,date,center_point_geom,longitude,latitude,number_of_strikes_x,zip_code,city,state,state_code,number_of_strikes_y
0,2018-08-01,POINT(-81.6 22.6),-81.6,22.6,48,,,,,
1,2018-08-01,POINT(-81.1 22.6),-81.1,22.6,32,,,,,
2,2018-08-01,POINT(-80.9 22.6),-80.9,22.6,118,,,,,
3,2018-08-01,POINT(-80.8 22.6),-80.8,22.6,69,,,,,
4,2018-08-01,POINT(-98.4 22.8),-98.4,22.8,44,,,,,


In [9]:
df_joined.describe()

Unnamed: 0,longitude,latitude,number_of_strikes_x,zip_code,number_of_strikes_y
count,717530.0,717530.0,717530.0,323700.0,323700.0
mean,-90.875445,33.328572,21.637081,57931.958996,25.410587
std,13.648429,7.938831,48.029525,22277.327411,57.421824
min,-133.9,16.6,1.0,1002.0,1.0
25%,-102.8,26.9,3.0,38260.75,3.0
50%,-90.3,33.2,6.0,59212.5,8.0
75%,-80.9,39.4,21.0,78642.0,24.0
max,-43.8,51.7,2211.0,99402.0,2211.0


In [10]:
df_null_geo = df_joined[pd.isnull(df_joined.state_code)]

In [11]:
df_null_geo.shape


(393830, 10)

In [12]:
df_joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717530 entries, 0 to 717529
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   date                 717530 non-null  object 
 1   center_point_geom    717530 non-null  object 
 2   longitude            717530 non-null  float64
 3   latitude             717530 non-null  float64
 4   number_of_strikes_x  717530 non-null  int64  
 5   zip_code             323700 non-null  float64
 6   city                 323700 non-null  object 
 7   state                323700 non-null  object 
 8   state_code           323700 non-null  object 
 9   number_of_strikes_y  323700 non-null  float64
dtypes: float64(4), int64(1), object(5)
memory usage: 54.7+ MB


In [13]:
top_missing = df_null_geo[['latitude', 'longitude', 'number_of_strikes_x']].groupby(['latitude', 'longitude']).sum().sort_values(by='number_of_strikes_x', ascending=False).reset_index()
top_missing.head(10)

Unnamed: 0,latitude,longitude,number_of_strikes_x
0,22.4,-84.2,3841
1,22.9,-82.9,3184
2,22.4,-84.3,2999
3,22.9,-83.0,2754
4,22.5,-84.1,2746
5,22.5,-84.2,2738
6,22.3,-81.0,2680
7,22.9,-82.4,2652
8,22.9,-82.3,2618
9,22.3,-84.3,2551


In [None]:
fig = px.scatter_geo(top_missing[top_missing.number_of_strikes_x >= 300],
                     
                        lat='latitude',
                        lon='longitude',
                        size='number_of_strikes_x',)
fig.update_layout(
    title_text = 'Missing_Data',
    title_x = 0.5,
    title_y = 0.95,
    title_font_size = 20,
    geo=dict(
        scope='usa',
        showland=True,
        landcolor='rgb(243, 243, 243)',
        subunitcolor='rgb(217, 217, 217)',
        countrycolor='rgb(217, 217, 217)',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        projection_type='albers usa',
    ),
    
    
)
fig.show()