In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Read the Atlantic data one raw line per row. Header lines and data lines have
# a different number of fields, so each line is split on commas after loading.
# Plain file reading is used because recent pandas releases reject
# ``sep='\n'`` in ``read_csv``; blank lines are skipped as ``read_csv`` did.
with open('atlantic.txt', encoding='utf-8') as atlantic_file:
    atlantic_lines = pd.DataFrame(
        [line.rstrip('\n') for line in atlantic_file if line.strip()]
    )
atlantic_data = atlantic_lines[0].str.split(',', expand=True)

# Name the columns
cols1 = ['Date', 'Time', 'Record Identifier', 'Status', 'Latitude',
         'Longitude', 'Max Wind', 'Min Pressure']
cols2 = ['Wind Radii Max Extent ' + str(x) for x in range(1, 13)]
cols = np.concatenate((cols1, cols2, ['is_header']))
atlantic_data.columns = cols

# Turn the last column into a boolean flag: it is an empty string on data
# lines (trailing comma) and missing on the shorter header lines.
atlantic_data['is_header'] = atlantic_data['is_header'] != ''

# Filter out ancient storms that don't have real data: keep everything from the
# 'AL012004' header onwards. The position is looked up explicitly and the slice
# is copied so the later assignments never act on a view of the full frame.
first_kept_row = int(np.flatnonzero((atlantic_data['Date'] == 'AL012004').to_numpy())[0])
atlantic_data = atlantic_data.iloc[first_kept_row:, :].copy()

# Replace the missing-value marker with actual NaN values, then impute the NaNs.
# NOTE: the fill runs over the whole frame, so header rows (which have no
# measurements of their own) inherit values from the neighbouring rows.
atlantic_data = atlantic_data.replace(' -999', np.nan).ffill().bfill()


# Get rid of N/W in Lat/Long
def fun(x):
    """Return ``x`` without its last character; ``None`` is passed through."""
    if x is not None:
        return x[:-1]
    return x


# NOTE: only the hemisphere letter is stripped; no sign is applied to the value.
atlantic_data['Latitude'] = pd.to_numeric(atlantic_data['Latitude'].apply(fun))
atlantic_data['Longitude'] = pd.to_numeric(atlantic_data['Longitude'].apply(fun))

# Convert these columns to numerical datatypes
num_cols = ['Max Wind', 'Min Pressure'] + cols2
atlantic_data[num_cols] = atlantic_data[num_cols].apply(pd.to_numeric)

# One hot encode status
atlantic_data = pd.get_dummies(atlantic_data, prefix='Status', prefix_sep='',
                               columns=['Status'], drop_first=True)

### FOR NOW ###
# Drop header record identifier
atlantic_data = atlantic_data.drop(columns=['Record Identifier'])

# Separate and save a list of time series, one frame per storm. Each frame
# starts at a header row (the first frame always starts at row 0) and runs up
# to, but not including, the next header row. ``iloc`` stops are exclusive, so
# the next header position itself is the correct stop: using ``k - 1`` here
# would silently drop the last observation of every storm.
header_flags = atlantic_data['is_header'].to_numpy(dtype=bool)
series_starts = [0] + [int(pos) for pos in np.flatnonzero(header_flags) if pos > 0]
series_stops = series_starts[1:] + [atlantic_data.shape[0]]
atlantic_series_list = [
    atlantic_data.iloc[start:stop, :].drop(columns='is_header')
    for start, stop in zip(series_starts, series_stops)
]

# Use a context manager so the pickle file is flushed and closed.
with open('atlantic_series.pickle', 'wb') as pickle_file:
    pickle.dump(atlantic_series_list, pickle_file)

In [3]:
# Read the Pacific data one raw line per row. Header lines and data lines have
# a different number of fields, so each line is split on commas after loading.
# Plain file reading is used because recent pandas releases reject
# ``sep='\n'`` in ``read_csv``; blank lines are skipped as ``read_csv`` did.
with open('pacific.txt', encoding='utf-8') as pacific_file:
    pacific_lines = pd.DataFrame(
        [line.rstrip('\n') for line in pacific_file if line.strip()]
    )
pacific_data = pacific_lines[0].str.split(',', expand=True)

# Name the columns
cols1 = ['Date', 'Time', 'Record Identifier', 'Status', 'Latitude',
         'Longitude', 'Max Wind', 'Min Pressure']
cols2 = ['Wind Radii Max Extent ' + str(x) for x in range(1, 13)]
cols = np.concatenate((cols1, cols2, ['is_header']))
pacific_data.columns = cols

# Turn the last column into a boolean flag: it is an empty string on data
# lines (trailing comma) and missing on the shorter header lines.
pacific_data['is_header'] = pacific_data['is_header'] != ''

# Filter out ancient storms that don't have real data: keep everything from the
# 'EP012004' header onwards. The position is looked up explicitly and the slice
# is copied so the later assignments never act on a view of the full frame.
first_kept_row = int(np.flatnonzero((pacific_data['Date'] == 'EP012004').to_numpy())[0])
pacific_data = pacific_data.iloc[first_kept_row:, :].copy()

# Replace the missing-value marker with actual NaN values
pacific_data = pacific_data.replace(' -999', np.nan)

# Replace other typos found in the data and then fill nans.
# NOTE: the fill runs over the whole frame, so header rows (which have no
# measurements of their own) inherit values from the neighbouring rows.
pacific_data = pacific_data.replace([' ST', ' TY', ' PT', ' WV'], np.nan)
pacific_data = pacific_data.ffill().bfill()


# Get rid of N/W in Lat/Long
def fun(x):
    """Return ``x`` without its last character; ``None`` is passed through."""
    if x is not None:
        return x[:-1]
    return x


# NOTE: only the hemisphere letter is stripped; no sign is applied to the value.
pacific_data['Latitude'] = pd.to_numeric(pacific_data['Latitude'].apply(fun))
pacific_data['Longitude'] = pd.to_numeric(pacific_data['Longitude'].apply(fun))

# Convert these columns to numerical datatypes
num_cols = ['Max Wind', 'Min Pressure'] + cols2
pacific_data[num_cols] = pacific_data[num_cols].apply(pd.to_numeric)

# One hot encode status
pacific_data = pd.get_dummies(pacific_data, prefix='Status', prefix_sep='',
                              columns=['Status'], drop_first=True)

### FOR NOW ###
# Drop header record identifier
pacific_data = pacific_data.drop(columns=['Record Identifier'])

# Separate and save a list of time series, one frame per storm. Each frame
# starts at a header row (the first frame always starts at row 0) and runs up
# to, but not including, the next header row. ``iloc`` stops are exclusive, so
# the next header position itself is the correct stop: using ``k - 1`` here
# would silently drop the last observation of every storm.
header_flags = pacific_data['is_header'].to_numpy(dtype=bool)
series_starts = [0] + [int(pos) for pos in np.flatnonzero(header_flags) if pos > 0]
series_stops = series_starts[1:] + [pacific_data.shape[0]]
pacific_series_list = [
    pacific_data.iloc[start:stop, :].drop(columns='is_header')
    for start, stop in zip(series_starts, series_stops)
]

# Use a context manager so the pickle file is flushed and closed.
with open('pacific_series.pickle', 'wb') as pickle_file:
    pickle.dump(pacific_series_list, pickle_file)

In [4]:
# Reload the per-storm series lists from the pickle files in the working
# directory. Context managers close the file handles right after loading.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('atlantic_series.pickle', 'rb') as pickle_file:
    atlantic = pickle.load(pickle_file)
with open('pacific_series.pickle', 'rb') as pickle_file:
    pacific = pickle.load(pickle_file)

# Each list element is one time series, so the list length is the series count.
print('Number of Atlantic Hurricanes/Time Series: ' + str(len(atlantic)))
print('Number of Pacific Hurricanes/Time Series: ' + str(len(pacific)))

Number of Atlantic Hurricanes/Time Series: 271
Number of Pacific Hurricanes/Time Series: 323


In [5]:
# Spot-check one Atlantic series (list position 40) by displaying its frame
atlantic[40]

Unnamed: 0,Date,Time,Latitude,Longitude,Max Wind,Min Pressure,Wind Radii Max Extent 1,Wind Radii Max Extent 2,Wind Radii Max Extent 3,Wind Radii Max Extent 4,...,Wind Radii Max Extent 12,Status DB,Status EX,Status HU,Status LO,Status SD,Status SS,Status TD,Status TS,Status WV
46523,AL252005,WILMA,37.7,6.0,30,1003,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46524,20051015,1800,17.6,78.5,25,1004,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46525,20051016,0000,17.6,78.8,25,1004,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46526,20051016,0600,17.5,79.0,30,1003,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46527,20051016,1200,17.5,79.2,30,1003,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46528,20051016,1800,17.5,79.4,30,1002,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46529,20051017,0000,17.4,79.6,30,1001,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
46530,20051017,0600,16.9,79.6,35,1000,0,0,40,0,...,0,0,0,0,0,0,0,0,1,0
46531,20051017,1200,16.3,79.7,40,999,0,60,40,0,...,0,0,0,0,0,0,0,0,1,0
46532,20051017,1800,16.0,79.8,45,997,30,60,60,30,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Spot-check one Pacific series (list position 110) by displaying its frame
pacific[110]

Unnamed: 0,Date,Time,Latitude,Longitude,Max Wind,Min Pressure,Wind Radii Max Extent 1,Wind Radii Max Extent 2,Wind Radii Max Extent 3,Wind Radii Max Extent 4,...,Wind Radii Max Extent 11,Wind Radii Max Extent 12,Status DB,Status EX,Status HU,Status LO,Status SD,Status SS,Status TD,Status TS
23600,EP162009,MARTY,19.9,136.4,20,1009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23601,20090915,1200,17.3,111.3,25,1007,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23602,20090915,1800,17.7,111.6,25,1007,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23603,20090916,0000,18.0,111.9,25,1007,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
23604,20090916,0600,18.3,112.2,30,1005,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
23605,20090916,1200,18.6,112.4,35,1003,0,50,60,60,...,0,0,0,0,0,0,0,0,0,1
23606,20090916,1800,18.8,112.5,40,1002,0,60,60,60,...,0,0,0,0,0,0,0,0,0,1
23607,20090917,0000,19.0,112.6,40,1002,0,45,70,60,...,0,0,0,0,0,0,0,0,0,1
23608,20090917,0600,19.2,112.7,40,1003,0,45,70,60,...,0,0,0,0,0,0,0,0,0,1
23609,20090917,1200,19.4,112.8,40,1003,15,45,60,60,...,0,0,0,0,0,0,0,0,0,1
