In [1]:
import os
import gzip
import time
import shutil
import ftplib
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

# Downloading the data
<br>
Here I download the data from ftp.ncdc.noaa.gov/pub/data/swdi/stormevents/csvfiles/ <br>
to the "input_dir_name" directory and unzip the downloaded files into the "output_dir_name" directory. <br>
<br>

In [None]:
# Working directories: the downloaded .gz archives are written to
# input_dir_name and the extracted CSV files to output_dir_name.
input_dir_name, output_dir_name = (
    '/home/avartation/Desktop/data',
    '/home/avartation/Desktop/csv_data',
)

In [46]:
############################
##    Downloading data    ##
############################

# Fetch every gzip archive listed in the Storm Events FTP directory and
# store it, under its original name, in input_dir_name.

# open(..., 'wb') below fails if the target directory does not exist yet.
os.makedirs(input_dir_name, exist_ok=True)

# The context manager closes the FTP connection even when a transfer fails.
with ftplib.FTP('ftp.ncdc.noaa.gov') as ftp:
    ftp.login('anonymous', 'email@email.com')
    ftp.cwd('/pub/data/swdi/stormevents/csvfiles/')

    for file_name in ftp.nlst():
        if not file_name.endswith('.gz'):
            continue
        # One-second pause before each download (presumably to go easy on
        # the server); entries that are skipped no longer cost a second each.
        time.sleep(1)
        print(file_name)
        # 'with' guarantees the local file is flushed and closed even if
        # retrbinary raises part-way through the transfer.
        with open(os.path.join(input_dir_name, file_name), 'wb') as file:
            ftp.retrbinary('RETR '+file_name, file.write)

print('\n\n\n Done ! \n\n\n')

StormEvents_details-ftp_v1.0_d2012_c20170519.csv.gz
StormEvents_fatalities-ftp_v1.0_d2012_c20170519.csv.gz
StormEvents_locations-ftp_v1.0_d2012_c20170519.csv.gz
StormEvents_locations-ftp_v1.0_d1969_c20170717.csv.gz
StormEvents_details-ftp_v1.0_d1972_c20181029.csv.gz
StormEvents_fatalities-ftp_v1.0_d1980_c20170717.csv.gz
StormEvents_locations-ftp_v1.0_d1992_c20170717.csv.gz
StormEvents_locations-ftp_v1.0_d1950_c20170120.csv.gz
StormEvents_locations-ftp_v1.0_d2007_c20170717.csv.gz
StormEvents_details-ftp_v1.0_d1952_c20170619.csv.gz
StormEvents_locations-ftp_v1.0_d1953_c20160223.csv.gz
StormEvents_locations-ftp_v1.0_d1954_c20160223.csv.gz
StormEvents_locations-ftp_v1.0_d1955_c20160223.csv.gz
StormEvents_details-ftp_v1.0_d1956_c20170717.csv.gz
StormEvents_locations-ftp_v1.0_d1957_c20160223.csv.gz
StormEvents_locations-ftp_v1.0_d1959_c20160223.csv.gz
StormEvents_locations-ftp_v1.0_d1961_c20160223.csv.gz
StormEvents_locations-ftp_v1.0_d1963_c20160223.csv.gz
StormEvents_locations-ftp_v1.0_d19

StormEvents_locations-ftp_v1.0_d1997_c20170717.csv.gz
StormEvents_fatalities-ftp_v1.0_d1998_c20170717.csv.gz
StormEvents_locations-ftp_v1.0_d2001_c20170717.csv.gz
StormEvents_locations-ftp_v1.0_d2004_c20170717.csv.gz
StormEvents_fatalities-ftp_v1.0_d2005_c20170717.csv.gz
StormEvents_details-ftp_v1.0_d2007_c20170717.csv.gz
StormEvents_details-ftp_v1.0_d2013_c20170519.csv.gz
StormEvents_fatalities-ftp_v1.0_d2013_c20170519.csv.gz
StormEvents_details-ftp_v1.0_d2010_c20170726.csv.gz
StormEvents_locations-ftp_v1.0_d2010_c20170726.csv.gz
StormEvents_fatalities-ftp_v1.0_d2015_c20180525.csv.gz
StormEvents_details-ftp_v1.0_d2016_c20180718.csv.gz
StormEvents_locations-ftp_v1.0_d2016_c20180718.csv.gz
StormEvents_fatalities-ftp_v1.0_d2018_c20190325.csv.gz
StormEvents_fatalities-ftp_v1.0_d1997_c20170717.csv.gz
StormEvents_fatalities-ftp_v1.0_d1952_c20170619.csv.gz
StormEvents_locations-ftp_v1.0_d1952_c20170619.csv.gz
StormEvents_locations-ftp_v1.0_d2000_c20170717.csv.gz
StormEvents_fatalities-ftp_v1

In [2]:
###############################
##    Unzipping the files    ##
###############################

# Decompress every .gz archive found in input_dir_name into output_dir_name,
# dropping the '.gz' suffix from the file name.

# open(..., 'wb') below fails if the output directory does not exist yet.
os.makedirs(output_dir_name, exist_ok=True)

# List the directory once so the reported total and the loop always agree.
archive_names = os.listdir(input_dir_name)
file_count = len(archive_names)

for i, file_name in enumerate(archive_names):
    print('{}-th out of {} files is being processed ({})'.format(i+1, file_count, file_name))
    # Anything that is not a .gz archive aborts the run.
    assert file_name.endswith('.gz'), '\n\n Foreign file format encountered !!! \n\n'
    source_path = os.path.join(input_dir_name, file_name)
    target_path = os.path.join(output_dir_name, file_name[:-3])
    # Stream the decompressed bytes straight to disk.
    with gzip.open(source_path, 'rb') as file_in, open(target_path, 'wb') as file_out:
        shutil.copyfileobj(file_in, file_out)

print('\n\n\n Done ! \n\n\n')

# Reading the data
<br>
So we now have 3 files per year <br> <br>
<hr> 
<h4>StormEvents_details-ftp_v1.0_dYYYY_cYYYYMMDD.csv</h4> <br>
<h4>StormEvents_locations-ftp_v1.0_dYYYY_cYYYYMMDD.csv</h4> <br>
<h4>StormEvents_fatalities-ftp_v1.0_dYYYY_cYYYYMMDD.csv</h4>
<hr> <br>
The main information we need is in the StormEvents_details-ftp_v1.0_dYYYY_cYYYYMMDD.csv file. <br>
<br>
All three kinds of files are read here. <br>
<br>

In [3]:
########################
##    Reading data    ##
########################

# Load every extracted CSV into data[year][kind], where kind is 'details',
# 'locations' or 'fatalities' and year is the 4-character string cut from
# the file name. Files that do not match the expected naming abort the run.

data = {}
file_kinds = ('details', 'locations', 'fatalities')
kind_counts = {kind: 0 for kind in file_kinds}

# List the directory once so the reported total and the loop always agree.
csv_names = os.listdir(output_dir_name)
file_count = len(csv_names)

for i, file_name in enumerate(csv_names):
    print('{}-th out of {} files is being processed ({})'.format(i+1, file_count, file_name))
    assert file_name[-4:]=='.csv', '\n\n Foreign file format encountered !!! \n\n'
    assert file_name[:11]=='StormEvents', '\n\n Foreign file name encountered !!! \n\n'

    # Names end in '_dYYYY_cYYYYMMDD.csv', so the year occupies the
    # characters 18 to 15 positions from the end.
    year = file_name[-18:-14]
    year_data = data.setdefault(year, {})

    # The kind starts at offset 12, right after the 'StormEvents_' prefix.
    kind = next((k for k in file_kinds if file_name.startswith(k, 12)), None)
    if kind is None:
        raise Exception('\n\n Foreign file encountered !!! \n\n')

    # The saved output of this cell shows warnings while loading, presumably
    # pandas' DtypeWarning about mixed-type columns. low_memory=False parses
    # each file in one pass so a column's dtype is inferred from all rows.
    year_data[kind] = pd.read_csv(os.path.join(output_dir_name, file_name),
                                  low_memory=False)
    kind_counts[kind] += 1

details = kind_counts['details']
locations = kind_counts['locations']
fatalities = kind_counts['fatalities']
# Unrecognised files raise above, so this can never be non-zero; it is kept
# only so the printed summary keeps its original shape.
else_count = 0

print('\n\n       Done ! \n')
print('details    :', details)
print('locations  :', locations)
print('fatalities :', fatalities)
print('else_count :', else_count)

1-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1973_c20160223.csv)
2-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1972_c20181029.csv)
3-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1971_c20160223.csv)
4-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2001_c20170717.csv)
5-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1994_c20170717.csv)
6-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1977_c20160223.csv)
7-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1974_c20160223.csv)
8-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1996_c20170717.csv)


  interactivity=interactivity, compiler=compiler, result=result)


9-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1968_c20160223.csv)
10-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2017_c20190405.csv)
11-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1989_c20170717.csv)
12-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2009_c20180718.csv)
13-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1999_c20170717.csv)
14-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1956_c20170717.csv)
15-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2004_c20170717.csv)
16-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2003_c20170717.csv)
17-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1964_c20160223.csv)
18-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1982_c20160223.csv)
19-th out of 207 files is being processed (StormEvents_

  interactivity=interactivity, compiler=compiler, result=result)


41-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1987_c20160223.csv)
42-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2010_c20170726.csv)
43-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1951_c20160223.csv)
44-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1965_c20190301.csv)
45-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2012_c20170519.csv)
46-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1993_c20170717.csv)
47-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1962_c20160223.csv)
48-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1971_c20160223.csv)
49-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1989_c20170717.csv)
50-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2010_c20170726.csv)
51-th out of 207 files is being processed (StormEvents_l

130-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1977_c20160223.csv)
131-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2016_c20180718.csv)
132-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1955_c20160223.csv)
133-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1955_c20160223.csv)
134-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1998_c20170717.csv)
135-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1976_c20160223.csv)
136-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1990_c20170717.csv)
137-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1983_c20160223.csv)
138-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1971_c20160223.csv)
139-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1953_c20160223.csv)
140-th out of 207 files is being processed (S

  interactivity=interactivity, compiler=compiler, result=result)


162-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1988_c20170717.csv)
163-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1960_c20160223.csv)
164-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1956_c20170717.csv)
165-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2007_c20170717.csv)
166-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2005_c20170717.csv)
167-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2005_c20170717.csv)
168-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1995_c20170522.csv)
169-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1965_c20190301.csv)
170-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1969_c20170717.csv)
171-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1966_c20160223.csv)
172-th out of 207 files is being proces

# BEGIN_LAT and BEGIN_LON
<br>
Ok, so the main features, the most important ones, are <br> <br>
<hr>
<h3>BEGIN_LAT</h3> and <h3>BEGIN_LON</h3>
<hr> <br>
as they tell exactly where the event happened. <br>
But there are rows (each row is an event) that don't have these features. <br>
So first I clean the data of such events, processing it year by year. <br>
<br>

In [13]:
##############################
##    Year-Location data    ##
##############################

# For every year between the earliest and the latest one loaded, keep the
# BEGIN_LAT / BEGIN_LON pairs of the events that have both coordinates.
# A missing year or a missing coordinate column aborts the run.

year_loc_data = {}
coord_columns = ['BEGIN_LAT', 'BEGIN_LON']

first_year = int(min(data.keys()))
last_year = int(max(data.keys()))

for year in range(first_year, last_year + 1):
    year_key = str(year)
    if year_key not in data:
        raise Exception('Year not found in data !')

    print(year, end='   ')
    year_details = data[year_key]['details']
    if not all(column in year_details.columns for column in coord_columns):
        raise Exception('\n\n BEGIN_LAT or BEGIN_LON columns not found in data ! \n\n')

    # dropna() removes every row in which either coordinate is missing.
    year_loc_data[year_key] = year_details[coord_columns].dropna()
    print(year_loc_data[year_key].shape)

print('\n\n   Done !  \n\n')
print('year_loc_data :', len(year_loc_data))

1950   (223, 2)
1951   (269, 2)
1952   (271, 2)
1953   (492, 2)
1954   (609, 2)
1955   (1413, 2)
1956   (1703, 2)
1957   (2184, 2)
1958   (2213, 2)
1959   (1812, 2)
1960   (1945, 2)
1961   (2246, 2)
1962   (2389, 2)
1963   (1968, 2)
1964   (2348, 2)
1965   (2835, 2)
1966   (2388, 2)
1967   (2688, 2)
1968   (3312, 2)
1969   (2926, 2)
1970   (3215, 2)
1971   (3471, 2)
1972   (2171, 2)
1973   (4453, 2)
1974   (5375, 2)
1975   (4975, 2)
1976   (3767, 2)
1977   (3728, 2)
1978   (3657, 2)
1979   (4278, 2)
1980   (6136, 2)
1981   (4517, 2)
1982   (7126, 2)
1983   (8322, 2)
1984   (7335, 2)
1985   (7979, 2)
1986   (8725, 2)
1987   (7363, 2)
1988   (7257, 2)
1989   (10407, 2)
1990   (10945, 2)
1991   (12516, 2)
1992   (13534, 2)
1993   (425, 2)
1994   (582, 2)
1995   (745, 2)
1996   (21900, 2)
1997   (19597, 2)
1998   (27627, 2)
1999   (21971, 2)
2000   (24590, 2)
2001   (25297, 2)
2002   (26586, 2)
2003   (27440, 2)
2004   (26995, 2)
2005   (33013, 2)
2006   (33825, 2)
2007   (35722, 2)
2008  

In [72]:
##################################
##    Merging all years data    ##
##################################

# Stack, in year order, the 'details' rows that have both coordinates, i.e.
# the rows whose index survived dropna() in year_loc_data.
# The frames are collected first and concatenated once: DataFrame.append
# re-copied the growing frame on every iteration and no longer exists in
# pandas 2.0+.
yearly_frames = [
    data[year]['details'].loc[year_loc_data[year].index]
    for year in tqdm(sorted(year_loc_data.keys()))
]

# pd.concat raises on an empty list; fall back to the empty frame the
# append loop would have produced. sort=False keeps the column order.
total_data = pd.concat(yearly_frames, sort=False) if yearly_frames else pd.DataFrame()

print('\n\n Done ! \n\n')
# info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
total_data.info()
total_data.head()



 Done ! 


<class 'pandas.core.frame.DataFrame'>
Int64Index: 948193 entries, 0 to 61705
Data columns (total 51 columns):
BEGIN_YEARMONTH       948193 non-null int64
BEGIN_DAY             948193 non-null int64
BEGIN_TIME            948193 non-null int64
END_YEARMONTH         948193 non-null int64
END_DAY               948193 non-null int64
END_TIME              948193 non-null int64
EPISODE_ID            758958 non-null float64
EVENT_ID              948193 non-null int64
STATE                 948193 non-null object
STATE_FIPS            948193 non-null float64
YEAR                  948193 non-null int64
MONTH_NAME            948193 non-null object
EVENT_TYPE            948193 non-null object
CZ_TYPE               948193 non-null object
CZ_FIPS               948193 non-null int64
CZ_NAME               946636 non-null object
WFO                   865623 non-null object
BEGIN_DATE_TIME       948193 non-null object
CZ_TIMEZONE           948193 non-null object
END_DATE_TIME         948193 

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,195004,28,1445,195004,28,1445,,10096222,OKLAHOMA,40.0,...,0.0,,,35.12,-99.2,35.17,-99.2,,,PUB
1,195004,29,1530,195004,29,1530,,10120412,TEXAS,48.0,...,0.0,,,31.9,-98.6,31.73,-98.6,,,PUB
2,195007,5,1800,195007,5,1800,,10104927,PENNSYLVANIA,42.0,...,0.0,,,40.58,-75.7,40.65,-75.47,,,PUB
3,195007,5,1830,195007,5,1830,,10104928,PENNSYLVANIA,42.0,...,0.0,,,40.6,-76.75,,,,,PUB
4,195007,24,1440,195007,24,1440,,10104929,PENNSYLVANIA,42.0,...,0.0,,,41.63,-79.68,,,,,PUB


# Columns that we need
<br>
Now we have all the events that have the BEGIN_LAT and BEGIN_LON columns (features). <br>
But surely there are columns we won't be using while making our models. <br>
<br>
So here I filter the total data of all years down to the columns we need. <br>
<br>

In [82]:
#################################################
##    Filtering data with necessary columns    ##
#################################################

# Keep only the columns used further on.
needed_columns = ['BEGIN_LAT','BEGIN_LON','BEGIN_YEARMONTH','STATE',
                  'INJURIES_DIRECT','INJURIES_INDIRECT','DEATHS_DIRECT',
                  'DEATHS_INDIRECT','DAMAGE_PROPERTY','EVENT_TYPE',
                  'DAMAGE_CROPS','MAGNITUDE','MAGNITUDE_TYPE','FLOOD_CAUSE']

# .copy() makes the result an independent frame, so the column added to
# total_data further down is written to it directly and pandas has no
# reason to emit SettingWithCopyWarning.
total_data = total_data[needed_columns].copy()

# info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
total_data.info()
total_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 948193 entries, 0 to 61705
Data columns (total 14 columns):
BEGIN_LAT            948193 non-null float64
BEGIN_LON            948193 non-null float64
BEGIN_YEARMONTH      948193 non-null int64
STATE                948193 non-null object
INJURIES_DIRECT      948193 non-null int64
INJURIES_INDIRECT    948193 non-null int64
DEATHS_DIRECT        948193 non-null int64
DEATHS_INDIRECT      948193 non-null int64
DAMAGE_PROPERTY      688183 non-null object
EVENT_TYPE           948193 non-null object
DAMAGE_CROPS         617619 non-null object
MAGNITUDE            774587 non-null float64
MAGNITUDE_TYPE       282121 non-null object
FLOOD_CAUSE          76249 non-null object
dtypes: float64(3), int64(5), object(6)
memory usage: 128.5+ MB
None


Unnamed: 0,BEGIN_LAT,BEGIN_LON,BEGIN_YEARMONTH,STATE,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,EVENT_TYPE,DAMAGE_CROPS,MAGNITUDE,MAGNITUDE_TYPE,FLOOD_CAUSE
0,35.12,-99.2,195004,OKLAHOMA,0,0,0,0,250K,Tornado,0,0.0,,
1,31.9,-98.6,195004,TEXAS,0,0,0,0,25K,Tornado,0,0.0,,
2,40.58,-75.7,195007,PENNSYLVANIA,2,0,0,0,25K,Tornado,0,0.0,,
3,40.6,-76.75,195007,PENNSYLVANIA,0,0,0,0,2.5K,Tornado,0,0.0,,
4,41.63,-79.68,195007,PENNSYLVANIA,0,0,0,0,2.5K,Tornado,0,0.0,,


In [4]:
##############################################
##    Converting BEGIN_YEARMONTH to DATE    ##
##############################################

# BEGIN_YEARMONTH is an integer of the form YYYYMM. The day of month is not
# read from the data: it is drawn at random from 1..28, a range that is
# valid in every month.
# NOTE(review): the raw details files carry a BEGIN_DAY column, but it is not
# among the columns kept by the filtering step; keeping it would give real
# dates instead of synthetic ones.

# A dedicated, seeded generator makes the synthetic days (and therefore the
# saved pickle) identical on every run without touching numpy's global state.
date_rng = np.random.RandomState(0)
random_days = date_rng.randint(1, 29, size=len(total_data))  # upper bound is exclusive

# Built positionally as a plain list, so the column holds datetime.date
# objects exactly as before.
total_data['DATE'] = [
    datetime.date(int(year_month) // 100, int(year_month) % 100, int(day))
    for year_month, day in zip(total_data.BEGIN_YEARMONTH, random_days)
]
total_data['DATE'].head()

0    1950-04-13
1    1950-04-22
2    1950-07-11
3    1950-07-10
4    1950-07-18
Name: DATE, dtype: object

In [5]:
#######################
##    Saving data    ##
#######################

# Persist the prepared frame as a pickle; the relative path resolves against
# the current working directory.
pickle_path = 'total_data.pkl'
total_data.to_pickle(pickle_path)