In [1]:
reset -fs

In [2]:
import os, re, glob
import shutil

import urllib.error
import urllib.request
from bs4 import BeautifulSoup 

import matplotlib.pyplot as plt
import matplotlib.style as style
import multiprocessing

import numpy as np
import pandas as pd
from pandas._libs.parsers import ParserError
import tensorflow as tf

from joblib import Parallel, delayed
from time import time
from tqdm import tqdm

# Downloading ADT calculations

In [3]:
years = ['2021','2020','2019','2018','2017','2016','2015','2014','2013','2012']
for year in years:
    dest_folder= 'noaa_adt/raw'
    url = 'https://www.ssd.noaa.gov/PS/TROP/DATA/{0}/adt/text/'.format(year)

    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    files = soup.find_all("a", href=re.compile('L-list|E-list'))
    #print(files)

    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)  # create folder if it does not exist

    for file in files:
        file_link = url + file.get('href')
        filename = year + '_' + file_link.split('/')[-1]
        print(filename)
        filepath = os.path.join(dest_folder, filename)

        r = requests.get(file_link, stream=True)
        if r.ok:
            print("saving to", os.path.abspath(filepath))
            with open(filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        f.write(chunk)
                        f.flush()
                        os.fsync(f.fileno())
        else:  # HTTP status code 4XX/5XX
            print("Download failed: {}".format(filename))     

# Perl script for initial processing

This requires the creation of a `clean_output` folder in the same directory as the `noaa_adt/raw` folder before it will run.  

# Create image metadata dataframe from cleaned ADT output files 

In [4]:
column_names_old = ['date',
                    'time',
                    'ci',
                    'mslp',
                    'bias',
                    'vmax',
                    'fnl_tno',
                    'adj_raw',
                    'ini_raw',
                    'limit',
                    'weaken',
                    'rpd_weaken',
                    'ctr_temp',
                    'mean_cloud_temp',
                    'scene',
                    'est_rmw',
                    'lat',
                    'lon',
                    'fix_method'] 

In [5]:
column_names_new = ['date',
                    'time',
                    'ci',
                    'mslp',
                    'vmax',
                    'fnl_tno',
                    'adj_raw',
                    'ini_raw',
                    'limit',
                    'weaken',
                    'rpd_weaken',
                    'ctr_temp',
                    'mean_cloud_temp',
                    'scene',
                    'est_rmw',
                    'mw_score',
                    'lat',
                    'lon',
                    'fix_method',
                    'sat',
                    'vza',
                    'comment1',
                    'comment2',
                    'comment3',
                    'comment4',
                    'comment5',
                    'comment6'] 

In [6]:
metadata = []

adts = glob.glob("noaa_adt/clean_output/old/*")

for a in adts:
    try:
        adt_df = pd.read_csv(a, names=column_names_old, skiprows=3, header=None, delim_whitespace=True, dtype=str)
        adt_df['storm'] = os.path.basename(a)[0:8]
        if os.path.basename(a)[7]=='L':
            adt_df['storm'] = adt_df['storm'].str.replace('_','al').str.replace('L','')
        else:
            adt_df['storm'] = adt_df['storm'].str.replace('_','ep').str.replace('E','')
            
        metadata.append(adt_df)
    
    except ParserError:
        raise Exception('Could not read {}'.format(a))
    
adt_old = pd.concat(metadata, axis=0, ignore_index=True)
adt_old.head(3)

Unnamed: 0,date,time,ci,mslp,bias,vmax,fnl_tno,adj_raw,ini_raw,limit,weaken,rpd_weaken,ctr_temp,mean_cloud_temp,scene,est_rmw,lat,lon,fix_method,storm
0,2012AUG21,91500,2.0,1009.0,0.0,30.0,2.0,2.0,2.0,none,OFF,OFF,-63.46,-51.5,RCDO,,15.2,51.27,FCST,2012al09
1,2012AUG21,94500,2.0,1009.0,0.0,30.0,1.9,1.9,1.6,0.1Thour,ON,OFF,-61.86,-51.24,UNIFRM,,15.2,51.42,FCST,2012al09
2,2012AUG21,101500,2.0,1009.0,0.0,30.0,1.9,1.9,1.6,0.1Thour,ON,OFF,-59.36,-49.47,UNIFRM,,15.2,51.56,FCST,2012al09


In [7]:
metadata = []

adts = glob.glob("noaa_adt/clean_output/new/*")

for a in adts:
    try:
        adt_new_df = pd.read_csv(a, names=column_names_new, skiprows=5, header=None, delim_whitespace=True, dtype=str)
        adt_new_df['storm'] = os.path.basename(a)[0:8]
        if os.path.basename(a)[7]=='L':
            adt_new_df['storm'] = adt_new_df['storm'].str.replace('_','al').str.replace('L','')
        else:
            adt_new_df['storm'] = adt_new_df['storm'].str.replace('_','ep').str.replace('E','')
        metadata.append(adt_new_df)
    except ParserError:
        raise Exception('Could not read {}'.format(a))
    
adt_new = pd.concat(metadata, axis=0, ignore_index=True)
adt_new.head(3)

Unnamed: 0,date,time,ci,mslp,vmax,fnl_tno,adj_raw,ini_raw,limit,weaken,...,fix_method,sat,vza,comment1,comment2,comment3,comment4,comment5,comment6,storm
0,2020JUN24,152000,2.0,1010.0,30.0,2.0,2.1,2.1,none,OFF,...,FCST,GOES17,13.3,,,,,,,2020ep03
1,2020JUN24,155000,2.1,1010.0,31.0,2.1,2.2,2.5,0.2Thour,OFF,...,FCST,GOES17,13.3,,,,,,,2020ep03
2,2020JUN24,161000,2.1,1010.0,31.0,2.1,2.3,2.5,0.2Thour,OFF,...,FCST,GOES17,13.4,,,,,,,2020ep03


In [8]:
# retain only needed columns
metadata_old = adt_old[['date','time','vmax','scene','lat','lon','storm']]
metadata_new = adt_new[['date','time','vmax','scene','lat','lon','storm']]

# combine into one df
frames = [metadata_old, metadata_new]
storm_metadata = pd.concat(frames).reset_index(drop=True)
storm_metadata.shape

(94402, 7)

# Data cleaning and prep

In [9]:
# change date column to match image file date format
month_dict = {'JAN': '01',
              'FEB': '02',
              'MAR': '03',
              'APR': '04',
              'MAY': '05',
              'JUN': '06',
              'JUL': '07',
              'AUG': '08',
              'SEP': '09',
              'OCT': '10',
              'NOV': '11',
              'DEC': '12'
             }
storm_metadata.date = storm_metadata.date.replace(month_dict, regex=True)
storm_metadata.time = storm_metadata.time.astype('str').str[:-2]
storm_metadata.head(3)

Unnamed: 0,date,time,vmax,scene,lat,lon,storm
0,20120821,915,30.0,RCDO,15.2,51.27,2012al09
1,20120821,945,30.0,UNIFRM,15.2,51.42,2012al09
2,20120821,1015,30.0,UNIFRM,15.2,51.56,2012al09


In [13]:
# add storm categories
image_metadata = storm_metadata[['storm','vmax','scene']].copy()
image_metadata['cat'] = (['TD' if x<=33
                     else 'TS' if 34<=x<=63
                     else 'CAT1' if 64<=x<=82
                     else 'CAT2' if 83<=x<=95
                     else 'CAT3' if 96<=x<=112
                     else 'CAT4' if 113<=x<=136
                     else 'CAT5'
                     for x in image_metadata['vmax'].astype('float')])

# add year
image_metadata['year'] = image_metadata.storm.str[:4]
image_metadata.head(3)
image_metadata.year.value_counts()

2018    12306
2016    10927
2012    10854
2020    10464
2017     9499
2015     8870
2021     8746
2014     8499
2019     8124
2013     6113
Name: year, dtype: int64

In [11]:
# add storm image id 
image_metadata['id'] = storm_metadata.storm + '_4kmsrbdc_' + storm_metadata.date + storm_metadata.time

# add image url
url_start = ['https://rammb-data.cira.colostate.edu/tc_realtime/products/storms/{0}/4kmsrbdc/'.format(storm) for storm in storm_metadata.storm]
image_metadata['image'] = url_start + storm_metadata.storm + '_4kmsrbdc_' + storm_metadata.date + storm_metadata.time + '.jpg'
image_metadata.image[4000]


'https://rammb-data.cira.colostate.edu/tc_realtime/products/storms/2012al17/4kmsrbdc/2012al17_4kmsrbdc_201210171415.jpg'

In [22]:
test_metadata = image_metadata

# Use the ADT data to download a matching image dataset

In [14]:
# Create a multithread download function for faster downloading of images
def download_parallel(df, image_dir):
    
    filenames = df['id'].apply(lambda x : os.path.join(image_dir, str(x)+'.jpg'))
    urls = df['image']

    # Create destination directory
    if os.path.exists(image_dir):
        print("Directory '{}' already exists and will be deleted.".format(image_dir))
        shutil.rmtree(image_dir)
    print("Created new directory '{}'".format(image_dir))
    os.makedirs(image_dir)
    
    # Define function to download one single image
    def download_image(url, filename):
        try:
            urllib.request.urlretrieve(url, filename)
            return 0
        except:
            return os.path.basename(filename).split('.')[0]
    
    # Download images in parallel
    start = time()
    print("\nDownloading...")
    num_cores = multiprocessing.cpu_count()
    ko_list = Parallel(n_jobs=num_cores)(delayed(download_image)(u, f) for f, u in zip(filenames, urls))
    
    print("\nDownload in parallel mode took %d seconds." %(time()-start))
    print("Success:", len([i for i in ko_list if i==0]))
    print("Errors:", len([i for i in ko_list if i!=0]))
    
    # Remove not downloaded posters from the dataframe
    ko_index = df[df['id'].isin(ko_list)].index
    df = df.drop(ko_index)
    
    return df
    


# Define destination folder
destination = 'images/2km_relative_infrared/raw/'
# Download in parallel and return the successful subset of the movies dataframe
test_metadata = download_parallel(test_metadata, destination)

Directory 'images/2km_relative_infrared' already exists and will be deleted.
Created new directory 'images/2km_relative_infrared'

Downloading...

Download in parallel mode took 11250 seconds.
Success: 80052
Errors: 14350


In [24]:
# check that images exist for each observation
test_metadata['exists'] = test_metadata['id'].apply(lambda x: os.path.isfile('images/2km_relative_infrared/raw/' + x + '.jpg'))
test_metadata = test_metadata[test_metadata.exists == True].reset_index(drop=True)

In [18]:
test_metadata.to_csv('csv/2km_metadata.csv', index=False)

# Image resizing

I did several smaller versions of the dataset due to memory issues. (`large`, `medium`, `small`, `tiny`)

In [19]:
from PIL import Image
import os

path = "images/2km_relative_infrared/raw/"
resize_ratio = 0.25  # where 0.5 is half size, 2 is double size

def resize_aspect_fit():
    dirs = os.listdir(path)
    for item in dirs:
        if not item.startswith('.'):
            try:
                image = Image.open(path+item)
                file_path, extension = os.path.splitext(path+item)

                new_image_height = int(image.size[0] / (1/resize_ratio))
                new_image_length = int(image.size[1] / (1/resize_ratio))
                new_file_path = 'images/2km_relative_infrared/small/'

                image = image.resize((new_image_height, new_image_length), Image.ANTIALIAS)
                image.save(new_file_path + item, 'JPEG', quality=50)
            except OSError:
                raise Exception('Could not read {}'.format(item))

resize_aspect_fit()