In [1]:
%matplotlib inline

import os

import geopandas
import numpy as np
import pandas as pd
from shapely.geometry import Point
from shapely.geometry import LineString
import missingno as msn
import seaborn as sns
import matplotlib.pyplot as plt
import geohash2
from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers.core import Dense, Dropout, Activation, Masking
from keras.layers.recurrent import LSTM
from keras.layers import Flatten, Embedding

Using TensorFlow backend.


In [2]:
# Read in all Atlantic hurricane advisories, then keep named storms from 2000 onward.
hurricanes = pd.read_csv('../data/atlantic_hurricanes.csv')
hurricanes['Date'] = pd.to_datetime(hurricanes['Date'], errors='coerce')
# Drops every row with a missing value, including dates that failed to parse above.
hurricanes = hurricanes.dropna()
hurricanes['year'] = hurricanes['Date'].dt.year.astype(int)
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the unfiltered data.
hurricanes = hurricanes[
    (hurricanes['year'] >= 2000) & (hurricanes['Name'] != 'Unnamed')
].copy()
# One identifier per storm, e.g. 'Katrina-2005'.
hurricanes['slug'] = hurricanes['Name'] + '-' + hurricanes['year'].astype(str)
# Flip the sign of the longitude column.
# NOTE(review): presumably the CSV stores degrees west as positive numbers -- confirm.
hurricanes['Long'] = 0 - hurricanes['Long']
# Points are built as (x=Long, y=Lat).
hurricanes['coordinates'] = hurricanes[['Long', 'Lat']].values.tolist()
hurricanes['coordinates'] = hurricanes['coordinates'].apply(Point)
# Pull the number that precedes "mph"/"MPH" out of the free-text movement field.
# The previous pattern used the character class [mph|MPH], which matches any
# single one of those letters (or '|') rather than the unit itself.
# NOTE(review): rows without a match become NaN here, after the dropna above.
hurricanes['movement_speed'] = hurricanes['Movement'].str.extract(
    r'(\d+)\s?(?:mph|MPH)',
    expand=False
)

# Because each Point is (Long, Lat), latitude is .y and longitude is .x.
# The previous loop passed them to the encoder the other way round.
geohashes = [
    geohash2.encode(latitude=point.y, longitude=point.x, precision=5)
    for point in hurricanes['coordinates']
]

hurricanes['geohash'] = geohashes
# Number of distinct geohash cells == number of target classes for the model.
n_classes = hurricanes['geohash'].nunique()

label_encoder = preprocessing.LabelEncoder()
min_max_scaler = preprocessing.MinMaxScaler()

# Integer class id for each geohash string.
hurricanes['encoded_label'] = label_encoder.fit_transform(hurricanes['geohash'])

In [5]:
# Element-wise missing-value mask (True where a value is missing) for every column.
hurricanes.isna()

Unnamed: 0,AdvisoryNumber,Date,Lat,Long,Wind,Pres,Movement,stormtypeName,Name,Received,Forecaster,year,slug,coordinates,movement_speed,geohash,encoded_label
2367,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2368,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2369,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2370,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2371,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35018,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
35019,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
35020,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
35021,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [81]:
features = ['Lat', 'Long', 'Wind', 'Pres', 'movement_speed']
label = 'encoded_label'
test_slug = 'Katrina-2005'

# Min-max scale each feature column on its own, overwriting the raw values.
# The shared scaler is refit per column, so afterwards it only holds the
# parameters of the last feature.
for feature in features:
    values = hurricanes[feature].values
    hurricanes[feature] = min_max_scaler.fit_transform(
        values.reshape(-1, 1)
    ).ravel()  # flatten the (n, 1) result back to one value per row

pre_train_x, pre_train_y = [], []
pre_test_x, pre_test_y = [], []

# One variable-length sequence per storm. `group` already holds exactly the
# rows for `name`, so there is no need to re-filter the full frame each time.
for name, group in hurricanes.groupby('slug'):
    storm_x = group[features].to_numpy()
    storm_y = group[label].to_numpy()
    if name == test_slug:
        pre_test_x.append(storm_x)
        pre_test_y.append(storm_y)
    # NOTE(review): this is a plain `if`, not `elif`, so the test storm is also
    # added to the training set whenever it has 40 or more rows -- confirm
    # whether that overlap is intended.
    if len(group) >= 40:
        pre_train_x.append(storm_x)
        pre_train_y.append(storm_y)

In [82]:
def pad_sequence(data, maxlen=50):
    """Pad or truncate a list of variable-length sequences to a common length.

    Sequences shorter than ``maxlen`` are extended at the end with ``0.0``;
    longer sequences lose their earliest steps so that the last ``maxlen``
    steps are kept.

    Parameters
    ----------
    data : list of array-like
        Sequences to pad; in this notebook, one array per storm.
    maxlen : int, default 50
        Length of every returned sequence.

    Returns
    -------
    numpy.ndarray
        Padded array with ``dtype=object``.
    """
    # NOTE(review): dtype='object' is kept from the original; a numeric dtype
    # such as 'float32' is probably what the model wants -- confirm.
    return pad_sequences(
        sequences=data,
        maxlen=maxlen,
        dtype='object',
        padding='post',
        truncating='pre',
        value=0.0
    )

def build_structure(timesteps=50, n_features=5):
    """Build and compile the LSTM classifier.

    The network is a single LSTM layer that returns its full output sequence,
    followed by a softmax Dense layer with ``n_classes`` units (``n_classes``
    is read from the notebook's global scope).

    Parameters
    ----------
    timesteps : int, default 50
        Length of each input sequence.
    n_features : int, default 5
        Number of features per time step.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    model = Sequential()

    model.add(LSTM(
        units=100,
        input_shape=(timesteps, n_features),
        activation='relu',
        recurrent_activation='hard_sigmoid',
        return_sequences=True,
        dropout=0.1,
        recurrent_dropout=0.1
    ))

    # Output layer
    model.add(Dense(
        units=n_classes,
        activation='softmax'
    ))

    # The loss takes integer class ids, so the metric must be the sparse
    # variant too; 'categorical_accuracy' expects one-hot targets and would
    # report a meaningless number against integer labels.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='Adagrad',
        metrics=['sparse_categorical_accuracy']
    )

    return model

In [1]:
# Pad every storm to the fixed sequence length used by the model.
post_train_x = pad_sequence(data=pre_train_x)
post_train_y = pad_sequence(data=pre_train_y)
# Add a trailing axis so each time step carries one label. The number of
# sequences is inferred (-1) instead of hard-coded, so the cell keeps working
# when the set of qualifying storms changes.
post_train_y = post_train_y.reshape(-1, 50, 1)
post_test_x = pad_sequence(data=pre_test_x)
post_test_y = pad_sequence(data=pre_test_y)
post_test_y = post_test_y.reshape(-1, 50, 1)

NameError: name 'pad_sequence' is not defined

In [None]:
# Build a fresh model and train it on the padded training sequences.
model = build_structure()
# NOTE(review): validation_split=0.6 holds out 60% of the sequences for
# validation; the recorded run trained on 28 samples and validated on 43.
# Confirm this ratio is intended.
model.fit(
    x=post_train_x,
    y=post_train_y,
    epochs=10,
    verbose=2,
    validation_split=0.6,
    shuffle=True
)

Train on 28 samples, validate on 43 samples
Epoch 1/10


In [71]:
# Most likely class id per time step. Taking argmax over the softmax output is
# what Sequential.predict_classes did for multi-class models, without relying
# on that deprecated method.
# NOTE(review): this predicts on the training sequences; post_test_x is never used.
np.argmax(model.predict(post_train_x), axis=-1)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])