In [1]:
reset -fs

In [2]:
import pandas as pd
import numpy as np
import re
import os
from datetime import datetime as dt

import geopandas
from shapely.geometry import Point, Polygon
from pyproj import CRS

import matplotlib.pyplot as plt

import warnings
warnings.simplefilter('ignore')

# Downloading data

The full dataset is from 2012-2021, and includes both Atlantic storms and Eastern Pacific Storms. 

In [3]:
import requests
from bs4 import BeautifulSoup 
import urllib
from urllib.request import urlopen, urlretrieve, quote

url = 'https://ftp.nhc.noaa.gov/atcf/archive/MESSAGES/2012/dis/'
r = requests.get(url)

soup = BeautifulSoup(r.content)

files = soup.find_all("a", href=re.compile("discus"))

In [4]:
def download(dest_folder: str):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)  # create folder if it does not exist
    
    for file in files:
        file_link = url + file.get('href')
        filename = file_link.split('/')[-1].replace(".", "_")
        filepath = os.path.join(dest_folder, filename)

        r = requests.get(file_link, stream=True)
        if r.ok:
            print("saving to", os.path.abspath(filepath))
            with open(filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        f.write(chunk)
                        f.flush()
                        os.fsync(f.fileno())
        else:  # HTTP status code 4XX/5XX
            print("Download failed: status code {}\n{}".format(r.status_code, r.text))


download(dest_folder="discussions")


# Initial Preprocessing: Perl Scripts
I ran these two scripts in the same directory as the downloaded text files. The first one (`remove_issuer.pl`) removes an extra line in the header that was present on about 10% of the files. The second one (`cleanup.pl`) inserts custom delimiters and corrects some errors in desired data fields.

`remove_issuer.pl`:

`cleanup.pl`:

# Creating Corpus Data Frame

* Could not read discussions/clean_output/al182020_discus_001 - needed to remove newlines and change 'CVT' to 'CDT' manually
* Also needed to manually move the separators around the date field in ep172014_discus_018

In [5]:
import glob

from pandas._libs.parsers import ParserError

discussion_list = []
discussions = glob.glob("discussions/clean_output/*")

for d in discussions:
    try:
        dis_df = pd.read_csv(d, header=None, sep='|', on_bad_lines='skip', names=['info', 'date', 'body', 'positions', 'author']).assign(tag=d)
        discussion_list.append(dis_df)
    except ParserError:
        raise Exception('Could not read {}'.format(d))
    
corpus = pd.concat(discussion_list, axis=0, ignore_index=True)
corpus['tag'] = corpus['tag'].str.split('\/').str[-1].str.strip()
corpus['storm'] = corpus['tag'].str.split('_').str[0].str.strip()
corpus

Unnamed: 0,info,date,body,positions,author,tag,storm
0,ZCZC MIATCDAT3 ALL TTAA00 KNHC DDHHMM Tropica...,500 PM AST Sun Oct 07 2018,Late-arriving ASCAT data from shortly after 1...,INIT 07/2100Z 35.9N 49.9W 50 KT 60 MPH ...,Forecaster Zelinsky,al132018_discus_046,al132018
1,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurrica...,500 PM AST Mon Sep 10 2018,"Unfortunately, the models were right. Florenc...",INIT 10/2100Z 25.4N 61.1W 120 KT 140 MPH ...,Forecaster Blake,al062018_discus_046,al062018
2,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropica...,400 PM CDT Tue Aug 08 2017,Visible satellite images and surface synoptic...,INIT 08/2100Z 20.2N 90.3W 35 KT 40 MPH ...,Forecaster Pasch,al072017_discus_009,al072017
3,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Hurrica...,1100 AM AST Thu Sep 07 2017,Jose is developing an eye. Convection remain...,INIT 07/1500Z 14.9N 50.6W 80 KT 90 MPH ...,Forecaster Zelinsky,al122017_discus_009,al122017
4,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurric...,400 PM CDT Fri Oct 09 2020,Reports from an Air Force Reserve Hurricane ...,INIT 09/2100Z 29.3N 93.2W 90 KT 105 MPH...,Forecaster Beven,al262020_discus_021,al262020
...,...,...,...,...,...,...,...
7613,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM TROPICA...,1100 AM AST SAT OCT 11 2014,Tropical Storm Fay made the transition to a t...,INIT 11/1500Z 27.9N 65.3W 60 KT 70 MPH ...,Forecaster Stewart,al072014_discus_006,al072014
7614,ZCZC MIATCDEP1 ALL TTAA00 KNHC DDHHMM TROPICA...,800 AM PDT WED JUL 15 2015,Satellite images indicate that the cloud patt...,INIT 15/1500Z 19.0N 132.9W 40 KT 45 MPH ...,Forecaster Avila,ep062015_discus_013,ep062015
7615,ZCZC MIATCDEP3 ALL TTAA00 KNHC DDHHMM HURRICA...,200 PM PDT SAT AUG 29 2015,Recent microwave imagery continued to show Ji...,INIT 29/2100Z 12.8N 126.4W 120 KT 140 MPH ...,Forecaster Berg,ep132015_discus_013,ep132015
7616,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropic...,500 AM AST Mon Sep 07 2020,Tropical Depression Seventeen is a large tro...,INIT 07/0900Z 17.3N 42.1W 30 KT 35 MPH...,Forecaster Latto,al172020_discus_002,al172020


# Example NHC Discussion Text

In [6]:
corpus.body[1]

" Unfortunately, the models were right. Florence has rapidly intensified into an extremely dangerous hurricane, with 30-second GOES-16 visible imagery showing well-defined eyewall mesovortices rotating inside of the eye. A NOAA Hurricane Hunter aircraft found peak SFMR winds of about 120 kt, with flight-level winds and dropsonde measurements also supporting that value for the initial wind speed estimate. Notably, the aircraft data also show the size of the hurricane-force winds has doubled in the past 12 hours.  None of the guidance suggest that Florence has peaked in intensity, and this is supported by a continuation of a low-shear environment, and even warmer waters over the next 36 hours.  Thus, the intensity forecast is raised from the previous one, bringing Florence close to category 5 strength tomorrow. Near landfall, the vertical wind shear could increase, along with the increasing likelihood of eyewall cycles.  While the intensity forecast shows some weakening of the maximum wi

# Preprocessing: Pandas
### Creating a datetime column
Despite claiming to handle timezone letter codes with `%Z`, pandas refused to convert. Replaced with time offset instead.

In [7]:
# Replace time zone codes with time offsets
corpus = corpus.replace({'date':{'AST':'-0400',
                                 'EST':'-0500',
                                 'EDT':'-0400',
                                 'CST':'-0600',
                                 'CDT':'-0500',
                                 'MST':'-0700',
                                 'MDT':'-0600',
                                 'PST':'-0800',
                                 'PDT':'-0700',
                                 'HST':'-1000',
                                 'GMT':'-0000'}}, regex=True)

In [8]:
corpus['datetime'] = pd.to_datetime(corpus['date'].str.strip(), format="%I%M %p %z %a %b %d %Y", utc=True)
corpus

Unnamed: 0,info,date,body,positions,author,tag,storm,datetime
0,ZCZC MIATCDAT3 ALL TTAA00 KNHC DDHHMM Tropica...,500 PM -0400 Sun Oct 07 2018,Late-arriving ASCAT data from shortly after 1...,INIT 07/2100Z 35.9N 49.9W 50 KT 60 MPH ...,Forecaster Zelinsky,al132018_discus_046,al132018,2018-10-07 21:00:00+00:00
1,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurrica...,500 PM -0400 Mon Sep 10 2018,"Unfortunately, the models were right. Florenc...",INIT 10/2100Z 25.4N 61.1W 120 KT 140 MPH ...,Forecaster Blake,al062018_discus_046,al062018,2018-09-10 21:00:00+00:00
2,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropica...,400 PM -0500 Tue Aug 08 2017,Visible satellite images and surface synoptic...,INIT 08/2100Z 20.2N 90.3W 35 KT 40 MPH ...,Forecaster Pasch,al072017_discus_009,al072017,2017-08-08 21:00:00+00:00
3,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Hurrica...,1100 AM -0400 Thu Sep 07 2017,Jose is developing an eye. Convection remain...,INIT 07/1500Z 14.9N 50.6W 80 KT 90 MPH ...,Forecaster Zelinsky,al122017_discus_009,al122017,2017-09-07 15:00:00+00:00
4,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurric...,400 PM -0500 Fri Oct 09 2020,Reports from an Air Force Reserve Hurricane ...,INIT 09/2100Z 29.3N 93.2W 90 KT 105 MPH...,Forecaster Beven,al262020_discus_021,al262020,2020-10-09 21:00:00+00:00
...,...,...,...,...,...,...,...,...
7613,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM TROPICA...,1100 AM -0400 SAT OCT 11 2014,Tropical Storm Fay made the transition to a t...,INIT 11/1500Z 27.9N 65.3W 60 KT 70 MPH ...,Forecaster Stewart,al072014_discus_006,al072014,2014-10-11 15:00:00+00:00
7614,ZCZC MIATCDEP1 ALL TTAA00 KNHC DDHHMM TROPICA...,800 AM -0700 WED JUL 15 2015,Satellite images indicate that the cloud patt...,INIT 15/1500Z 19.0N 132.9W 40 KT 45 MPH ...,Forecaster Avila,ep062015_discus_013,ep062015,2015-07-15 15:00:00+00:00
7615,ZCZC MIATCDEP3 ALL TTAA00 KNHC DDHHMM HURRICA...,200 PM -0700 SAT AUG 29 2015,Recent microwave imagery continued to show Ji...,INIT 29/2100Z 12.8N 126.4W 120 KT 140 MPH ...,Forecaster Berg,ep132015_discus_013,ep132015,2015-08-29 21:00:00+00:00
7616,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropic...,500 AM -0400 Mon Sep 07 2020,Tropical Depression Seventeen is a large tro...,INIT 07/0900Z 17.3N 42.1W 30 KT 35 MPH...,Forecaster Latto,al172020_discus_002,al172020,2020-09-07 09:00:00+00:00


### Lowercase all text, strip extra whitespace and "forecaster" from author column

In [9]:
# document text
corpus['body'] = corpus['body'].str.strip()
corpus['body'] = corpus['body'].str.lower()


# author info
corpus['author'] = corpus['author'].str.lower()
corpus = corpus.replace({'author':{'forecaster':''}}, regex=True)
corpus['author'] = corpus['author'].str.strip()

# remove weather forecast links
corpus['body'] = corpus['body'].str.replace('http\S+|www.\S+', '', case=False)
corpus['body'] = corpus['body'].str.replace('awips header', '')
corpus['body'] = corpus['body'].str.replace('wmo header', '')


### Create a column of storm strength at time of writing

In [10]:
corpus['positions'] = corpus['positions'].str.strip().str.replace('  ', ' ')
corpus['mph'] = corpus['positions'].str.split().str[6]
corpus['category'] = (['TD' if x<=38
                     else 'TS' if 39<=x<=73
                     else '1' if 74<=x<=95
                     else '2' if 96<=x<=110
                     else '3' if 111<=x<=129
                     else '4' if 130<=x<=156
                     else '5'
                     for x in corpus['mph'].astype('int')])

corpus

Unnamed: 0,info,date,body,positions,author,tag,storm,datetime,mph,category
0,ZCZC MIATCDAT3 ALL TTAA00 KNHC DDHHMM Tropica...,500 PM -0400 Sun Oct 07 2018,late-arriving ascat data from shortly after 12...,INIT 07/2100Z 35.9N 49.9W 50 KT 60 MPH 12H 08...,zelinsky,al132018_discus_046,al132018,2018-10-07 21:00:00+00:00,60,TS
1,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurrica...,500 PM -0400 Mon Sep 10 2018,"unfortunately, the models were right. florence...",INIT 10/2100Z 25.4N 61.1W 120 KT 140 MPH 12H 1...,blake,al062018_discus_046,al062018,2018-09-10 21:00:00+00:00,140,4
2,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropica...,400 PM -0500 Tue Aug 08 2017,visible satellite images and surface synoptic ...,INIT 08/2100Z 20.2N 90.3W 35 KT 40 MPH 12H 09...,pasch,al072017_discus_009,al072017,2017-08-08 21:00:00+00:00,40,TS
3,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Hurrica...,1100 AM -0400 Thu Sep 07 2017,jose is developing an eye. convection remains...,INIT 07/1500Z 14.9N 50.6W 80 KT 90 MPH 12H 08...,zelinsky,al122017_discus_009,al122017,2017-09-07 15:00:00+00:00,90,1
4,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurric...,400 PM -0500 Fri Oct 09 2020,reports from an air force reserve hurricane hu...,INIT 09/2100Z 29.3N 93.2W 90 KT 105 MPH 12H 1...,beven,al262020_discus_021,al262020,2020-10-09 21:00:00+00:00,105,2
...,...,...,...,...,...,...,...,...,...,...
7613,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM TROPICA...,1100 AM -0400 SAT OCT 11 2014,tropical storm fay made the transition to a tr...,INIT 11/1500Z 27.9N 65.3W 60 KT 70 MPH 12H 12...,stewart,al072014_discus_006,al072014,2014-10-11 15:00:00+00:00,70,TS
7614,ZCZC MIATCDEP1 ALL TTAA00 KNHC DDHHMM TROPICA...,800 AM -0700 WED JUL 15 2015,satellite images indicate that the cloud patte...,INIT 15/1500Z 19.0N 132.9W 40 KT 45 MPH 12H 1...,avila,ep062015_discus_013,ep062015,2015-07-15 15:00:00+00:00,45,TS
7615,ZCZC MIATCDEP3 ALL TTAA00 KNHC DDHHMM HURRICA...,200 PM -0700 SAT AUG 29 2015,recent microwave imagery continued to show jim...,INIT 29/2100Z 12.8N 126.4W 120 KT 140 MPH 12H ...,berg,ep132015_discus_013,ep132015,2015-08-29 21:00:00+00:00,140,4
7616,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropic...,500 AM -0400 Mon Sep 07 2020,tropical depression seventeen is a large tropi...,INIT 07/0900Z 17.3N 42.1W 30 KT 35 MPH 12H 07...,latto,al172020_discus_002,al172020,2020-09-07 09:00:00+00:00,35,TD


### Adding a geometry column for storm position

In [11]:
corpus['lat'] = corpus['positions'].str.split(' ').str[2].str.strip().str.replace('N','')
corpus['lon'] = corpus['positions'].str.split(' ').str[3].str.strip().str.replace('W','')

# manual edits due to odd errors
corpus.at[4460,'lon']=9.3
corpus.at[916,'lon']=8.6
corpus.at[261,'lon']=6.9
corpus.at[210,'lon']=9.5

corpus_geo = geopandas.GeoDataFrame(
    corpus, crs = CRS("WGS84"), geometry = geopandas.points_from_xy(corpus.lon, corpus.lat))
corpus

Unnamed: 0,info,date,body,positions,author,tag,storm,datetime,mph,category,lat,lon,geometry
0,ZCZC MIATCDAT3 ALL TTAA00 KNHC DDHHMM Tropica...,500 PM -0400 Sun Oct 07 2018,late-arriving ascat data from shortly after 12...,INIT 07/2100Z 35.9N 49.9W 50 KT 60 MPH 12H 08...,zelinsky,al132018_discus_046,al132018,2018-10-07 21:00:00+00:00,60,TS,35.9,49.9,POINT (49.90000 35.90000)
1,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurrica...,500 PM -0400 Mon Sep 10 2018,"unfortunately, the models were right. florence...",INIT 10/2100Z 25.4N 61.1W 120 KT 140 MPH 12H 1...,blake,al062018_discus_046,al062018,2018-09-10 21:00:00+00:00,140,4,25.4,61.1,POINT (61.10000 25.40000)
2,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropica...,400 PM -0500 Tue Aug 08 2017,visible satellite images and surface synoptic ...,INIT 08/2100Z 20.2N 90.3W 35 KT 40 MPH 12H 09...,pasch,al072017_discus_009,al072017,2017-08-08 21:00:00+00:00,40,TS,20.2,90.3,POINT (90.30000 20.20000)
3,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Hurrica...,1100 AM -0400 Thu Sep 07 2017,jose is developing an eye. convection remains...,INIT 07/1500Z 14.9N 50.6W 80 KT 90 MPH 12H 08...,zelinsky,al122017_discus_009,al122017,2017-09-07 15:00:00+00:00,90,1,14.9,50.6,POINT (50.60000 14.90000)
4,ZCZC MIATCDAT1 ALL TTAA00 KNHC DDHHMM Hurric...,400 PM -0500 Fri Oct 09 2020,reports from an air force reserve hurricane hu...,INIT 09/2100Z 29.3N 93.2W 90 KT 105 MPH 12H 1...,beven,al262020_discus_021,al262020,2020-10-09 21:00:00+00:00,105,2,29.3,93.2,POINT (93.20000 29.30000)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7613,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM TROPICA...,1100 AM -0400 SAT OCT 11 2014,tropical storm fay made the transition to a tr...,INIT 11/1500Z 27.9N 65.3W 60 KT 70 MPH 12H 12...,stewart,al072014_discus_006,al072014,2014-10-11 15:00:00+00:00,70,TS,27.9,65.3,POINT (65.30000 27.90000)
7614,ZCZC MIATCDEP1 ALL TTAA00 KNHC DDHHMM TROPICA...,800 AM -0700 WED JUL 15 2015,satellite images indicate that the cloud patte...,INIT 15/1500Z 19.0N 132.9W 40 KT 45 MPH 12H 1...,avila,ep062015_discus_013,ep062015,2015-07-15 15:00:00+00:00,45,TS,19.0,132.9,POINT (132.90000 19.00000)
7615,ZCZC MIATCDEP3 ALL TTAA00 KNHC DDHHMM HURRICA...,200 PM -0700 SAT AUG 29 2015,recent microwave imagery continued to show jim...,INIT 29/2100Z 12.8N 126.4W 120 KT 140 MPH 12H ...,berg,ep132015_discus_013,ep132015,2015-08-29 21:00:00+00:00,140,4,12.8,126.4,POINT (126.40000 12.80000)
7616,ZCZC MIATCDAT2 ALL TTAA00 KNHC DDHHMM Tropic...,500 AM -0400 Mon Sep 07 2020,tropical depression seventeen is a large tropi...,INIT 07/0900Z 17.3N 42.1W 30 KT 35 MPH 12H 07...,latto,al172020_discus_002,al172020,2020-09-07 09:00:00+00:00,35,TD,17.3,42.1,POINT (42.10000 17.30000)


In [12]:
corpus.to_csv('corpus_clean.csv', index=False)