In [1]:
import os
import gzip
import time
import shutil
import ftplib
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

In [2]:
csv_data_dir = '../../Data/csv_Data'

# Reading the data
<br>
So we now have 3 files per year:

    StormEvents_details-ftp_v1.0_dYYYY_cYYYYMMDD.csv
    StormEvents_locations-ftp_v1.0_dYYYY_cYYYYMMDD.csv
    StormEvents_fatalities-ftp_v1.0_dYYYY_cYYYYMMDD.csv

The main information we need is in StormEvents_details-ftp_v1.0_dYYYY_cYYYYMMDD.csv file. <br>
Next cell reads them.


In [6]:
########################
##    Reading data    ##
########################

data = {}
details = 0
locations = 0
fatalities = 0
else_count = 0

file_count = len(os.listdir(csv_data_dir))

for i, file_name in enumerate(os.listdir(csv_data_dir)):
    print('{}-th out of {} files is being processed ({})'.format(i+1, file_count, file_name))
    assert file_name[-4:]=='.csv', '\n\n Foreign file format encountered !!! \n\n'
    assert file_name[:11]=='StormEvents', '\n\n Foreign file name encountered !!! \n\n'
    
    year = file_name[-18:-14]
    if year not in data:
        data[year] = {}
    
    if file_name[12:19]=='details':
        data[year]['details'] = pd.read_csv(os.path.join(csv_data_dir, file_name))
        details += 1
    elif file_name[12:21]=='locations':
        data[year]['locations'] = pd.read_csv(os.path.join(csv_data_dir, file_name))
        locations += 1
    elif file_name[12:22]=='fatalities':
        data[year]['fatalities'] = pd.read_csv(os.path.join(csv_data_dir, file_name))
        fatalities += 1
    else:
        raise Exception('\n\n Foreign file encountered !!! \n\n')

print('\n\n       Done ! \n')
print('details    :', details)
print('locations  :', locations)
print('fatalities :', fatalities)
print('else_count :', else_count)

1-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1958_c20160223.csv)
2-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2015_c20180525.csv)
3-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2011_c20180718.csv)
4-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d2017_c20190405.csv)
5-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1957_c20160223.csv)
6-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1980_c20170717.csv)
7-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1968_c20160223.csv)
8-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1982_c20160223.csv)
9-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1985_c20160223.csv)
10-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1977_c20160223.csv)
11-th out of 207 files is being processed (StormEvents_location

  interactivity=interactivity, compiler=compiler, result=result)


33-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1969_c20170717.csv)
34-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2002_c20170717.csv)
35-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1967_c20160223.csv)
36-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1955_c20160223.csv)
37-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1970_c20160223.csv)
38-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2016_c20180718.csv)
39-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1984_c20170717.csv)
40-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1987_c20160223.csv)
41-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1995_c20170522.csv)
42-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1981_c20170717.csv)
43-th out of 207 files is being processed (StormEvents_

132-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1976_c20160223.csv)
133-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2006_c20170717.csv)


  interactivity=interactivity, compiler=compiler, result=result)


134-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2012_c20170519.csv)
135-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1990_c20170717.csv)
136-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1952_c20170619.csv)
137-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1973_c20160223.csv)
138-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2014_c20180718.csv)
139-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1955_c20160223.csv)
140-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1987_c20160223.csv)
141-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1959_c20160223.csv)
142-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1950_c20170120.csv)
143-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1954_c20160223.csv)
144-th out of 207 files is being processed (Sto

  interactivity=interactivity, compiler=compiler, result=result)


164-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1962_c20160223.csv)
165-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2000_c20170717.csv)
166-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d1958_c20160223.csv)
167-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1965_c20190301.csv)
168-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1957_c20160223.csv)
169-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1964_c20160223.csv)
170-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d2016_c20180718.csv)
171-th out of 207 files is being processed (StormEvents_fatalities-ftp_v1.0_d2006_c20170717.csv)
172-th out of 207 files is being processed (StormEvents_locations-ftp_v1.0_d1976_c20160223.csv)
173-th out of 207 files is being processed (StormEvents_details-ftp_v1.0_d1967_c20160223.csv)
174-th out of 207 files is being processed (StormE

In [26]:
# data['1950']['details'].info()
BEGIN_YEARMONTH, STATE, STATE_FIPS,
INJURIES_DIRECT, INJURIES_INDIRECT, 
DEATHS_DIRECT, DEATHS_INDIRECT, 
DAMAGE_PROPERTY, DAMAGE_CROPS, 
EVENT_TYPE, MAGNITUDE

In [27]:
#########################################################
##   Counting all the events not having STATE column   ##
#########################################################

for year in range(int(min(data.keys())), int(max(data.keys()))+1):
    isna = data[str(year)]['details'].STATE.isna()
    print(year,':',end='\t')
    print('{} events with NaN STATE from {}'.format(np.sum(isna), isna.shape[0]))


1950 :	0 events with NaN STATE from 223
1951 :	0 events with NaN STATE from 269
1952 :	0 events with NaN STATE from 272
1953 :	0 events with NaN STATE from 492
1954 :	0 events with NaN STATE from 609
1955 :	0 events with NaN STATE from 1413
1956 :	0 events with NaN STATE from 1703
1957 :	0 events with NaN STATE from 2184
1958 :	0 events with NaN STATE from 2213
1959 :	0 events with NaN STATE from 1813
1960 :	0 events with NaN STATE from 1945
1961 :	0 events with NaN STATE from 2246
1962 :	0 events with NaN STATE from 2389
1963 :	0 events with NaN STATE from 1968
1964 :	0 events with NaN STATE from 2348
1965 :	0 events with NaN STATE from 2835
1966 :	0 events with NaN STATE from 2388
1967 :	0 events with NaN STATE from 2688
1968 :	0 events with NaN STATE from 3312
1969 :	0 events with NaN STATE from 2926
1970 :	0 events with NaN STATE from 3215
1971 :	0 events with NaN STATE from 3471
1972 :	0 events with NaN STATE from 2171
1973 :	0 events with NaN STATE from 4453
1974 :	0 events with 

In [28]:
##########################
##    Filtering data    ##
##########################

filtered_data = []
for year in range(int(min(data.keys())), int(max(data.keys()))+1):
    print(year,':',end='\t')
    year_data = data[str(year)]['details'][['STATE', 'EVENT_TYPE', 'BEGIN_YEARMONTH']]
    print('{} events occured'.format(year_data.shape[0]))
    year_data = year_data.loc[year_data.dropna().index]
    print('     : \t{} non-NaN containing events'.format(year_data.shape[0]),end='\n\n')
    filtered_data.append(year_data)

filtered_data = pd.concat(filtered_data)
print('\n\n Done ! \n\n')
print(filtered_data.info())
filtered_data.head()

1950 :	223 events occured
     : 	223 non-NaN containing events

1951 :	269 events occured
     : 	269 non-NaN containing events

1952 :	272 events occured
     : 	272 non-NaN containing events

1953 :	492 events occured
     : 	492 non-NaN containing events

1954 :	609 events occured
     : 	609 non-NaN containing events

1955 :	1413 events occured
     : 	1413 non-NaN containing events

1956 :	1703 events occured
     : 	1703 non-NaN containing events

1957 :	2184 events occured
     : 	2184 non-NaN containing events

1958 :	2213 events occured
     : 	2213 non-NaN containing events

1959 :	1813 events occured
     : 	1813 non-NaN containing events

1960 :	1945 events occured
     : 	1945 non-NaN containing events

1961 :	2246 events occured
     : 	2246 non-NaN containing events

1962 :	2389 events occured
     : 	2389 non-NaN containing events

1963 :	1968 events occured
     : 	1968 non-NaN containing events

1964 :	2348 events occured
     : 	2348 non-NaN containing events

1965 

Unnamed: 0,STATE,EVENT_TYPE,BEGIN_YEARMONTH
0,OKLAHOMA,Tornado,195004
1,TEXAS,Tornado,195004
2,PENNSYLVANIA,Tornado,195007
3,PENNSYLVANIA,Tornado,195007
4,PENNSYLVANIA,Tornado,195007


In [29]:
##############################
##    Adding DATE column    ##
##############################

filtered_data['DATE'] = filtered_data.BEGIN_YEARMONTH.apply(
    lambda date: 
    datetime.date(date//100, date%100, np.random.randint(1,29))
)
filtered_data.head()

Unnamed: 0,STATE,EVENT_TYPE,BEGIN_YEARMONTH,DATE
0,OKLAHOMA,Tornado,195004,1950-04-06
1,TEXAS,Tornado,195004,1950-04-01
2,PENNSYLVANIA,Tornado,195007,1950-07-21
3,PENNSYLVANIA,Tornado,195007,1950-07-25
4,PENNSYLVANIA,Tornado,195007,1950-07-19


In [30]:
###########################################################
##   Froming DIST_FROM_START column for fast splitting   ##
###########################################################

start = filtered_data.DATE.min()
filtered_data['DIST_FROM_START'] = filtered_data.DATE.apply(lambda x: (x-start).days)
filtered_data.head()

Unnamed: 0,STATE,EVENT_TYPE,BEGIN_YEARMONTH,DATE,DIST_FROM_START
0,OKLAHOMA,Tornado,195004,1950-04-06,84
1,TEXAS,Tornado,195004,1950-04-01,79
2,PENNSYLVANIA,Tornado,195007,1950-07-21,190
3,PENNSYLVANIA,Tornado,195007,1950-07-25,194
4,PENNSYLVANIA,Tornado,195007,1950-07-19,188


In [31]:
#######################
##    Saving data    ##
#######################

filtered_data.to_pickle('filtered_data.pkl')