In [115]:
import os
import random
import pandas as pd

In [116]:
# Short identifier for this dataset; it is interpolated into the output
# directory and every output file name built in the next cell.
dataset_name = "daily_weather"

In [117]:
# Raw input CSV (relative path).
input_fname = "weather_data.csv"

# Every processed artifact for this dataset is written under one directory,
# which is created up front if it does not exist yet.
output_dir = f'./../../processed/{dataset_name}/'
os.makedirs(output_dir, exist_ok=True)


def _output_path(suffix):
    """Return the path of the file `<dataset_name><suffix>` inside `output_dir`."""
    return os.path.join(output_dir, f'{dataset_name}{suffix}')


outp_fname = _output_path('.csv')               # full dataset
train_fname = _output_path('_train.csv')        # training split
test_fname = _output_path('_test.csv')          # test split without the target
test_key_fname = _output_path('_test_key.csv')  # test split targets
outp_fig_fname = _output_path('.png')           # figure output path

# Read Data

In [118]:
# Load the raw weather CSV, parsing the 'date' column as datetimes.
data = pd.read_csv(input_fname, parse_dates=['date'])
# Preview the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0,date,date_epoch,mintemp,maxtemp,avgtemp,totalsnow,sunhour,uv_index,time,temperature,...,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination,query,location,country,region
0,2020-01-01,1577836800,1,4,2,0.0,8.7,2,0,4,...,7:20 AM,4:39 PM,11:25 AM,11:02 PM,Waxing Crescent,37,"New York City, United States of America",New York City,United States of America,New York
1,2020-01-02,1577923200,0,8,3,0.0,8.7,2,0,8,...,7:20 AM,4:40 PM,11:49 AM,12:00 AM,First Quarter,44,"New York City, United States of America",New York City,United States of America,New York
2,2020-01-03,1578009600,5,9,7,0.0,3.5,2,0,9,...,7:20 AM,4:41 PM,12:13 PM,No moonset,First Quarter,51,"New York City, United States of America",New York City,United States of America,New York
3,2020-01-04,1578096000,4,11,8,0.0,3.5,2,0,11,...,7:20 AM,4:41 PM,12:38 PM,12:58 AM,First Quarter,59,"New York City, United States of America",New York City,United States of America,New York
4,2020-01-05,1578182400,1,5,3,0.0,8.7,2,0,5,...,7:20 AM,4:42 PM,1:05 PM,1:58 AM,First Quarter,66,"New York City, United States of America",New York City,United States of America,New York


In [119]:
# List every column available in the raw frame.
data.columns

Index(['date', 'date_epoch', 'mintemp', 'maxtemp', 'avgtemp', 'totalsnow',
       'sunhour', 'uv_index', 'time', 'temperature', 'wind_speed',
       'wind_degree', 'wind_dir', 'weather_code', 'weather_icons',
       'weather_descriptions', 'precip', 'humidity', 'visibility', 'pressure',
       'cloudcover', 'heatindex', 'dewpoint', 'windchill', 'windgust',
       'feelslike', 'sunrise', 'sunset', 'moonrise', 'moonset', 'moon_phase',
       'moon_illumination', 'query', 'location', 'country', 'region'],
      dtype='object')

In [120]:
# Count rows per (location, country) pair to see how many records each city has.
data[['location', 'country']].value_counts()

location        country                 
Bangkok         Thailand                    1096
Barcelona       Spain                       1096
Toronto         Canada                      1096
Tokyo           Japan                       1096
Sydney          Australia                   1096
Santiago        Chile                       1096
Rome            Italy                       1096
Rio De Janeiro  Brazil                      1096
Paris           France                      1096
New York City   United States of America    1096
Nairobi         Kenya                       1096
Mumbai          India                       1096
Mexico City     Mexico                      1096
Marrakesh       Morocco                     1096
Los Angeles     United States of America    1096
London          United Kingdom              1096
Lima            Peru                        1096
Lagos           Nigeria                     1096
Dubai           United Arab Emirates        1096
Cape Town       South Africa

# Prepare Data

## Update Location Field

In [121]:
# Overwrite 'location' with the 'query' string, which carries the city and the
# country together (e.g. "New York City, United States of America").
data['location'] = data['query']

## Add Hemisphere as Static Covariate

In [122]:
# Show the distinct location labels; the hemisphere lists in the next cell
# are written against these exact strings.
data['location'].unique()

array(['New York City, United States of America',
       'Los Angeles, United States of America', 'Toronto, Canada',
       'Mexico City, Mexico', 'Vancouver, Canada',
       'Rio De Janeiro, Brazil', 'Buenos Aires, Argentina', 'Lima, Peru',
       'Santiago, Chile', 'Bogota, Colombia', 'London, United Kingdom',
       'Paris, France', 'Rome, Italy', 'Berlin, Germany',
       'Barcelona, Spain', 'Tokyo, Japan', 'Beijing, China',
       'Bangkok, Thailand', 'Mumbai, India',
       'Dubai, United Arab Emirates', 'Cairo, Egypt',
       'Cape Town, South Africa', 'Nairobi, Kenya', 'Marrakesh, Morocco',
       'Lagos, Nigeria', 'Sydney, Australia'], dtype=object)

In [123]:
# Static hemisphere covariates, keyed on the full "City, Country" label held
# in data['location'].
northern_hemisphere_cities = [
    'New York City, United States of America',
    'Los Angeles, United States of America',
    'Toronto, Canada',
    'Mexico City, Mexico',
    'Vancouver, Canada',
    'Bogota, Colombia',
    'London, United Kingdom',
    'Paris, France',
    'Rome, Italy',
    'Berlin, Germany',
    'Barcelona, Spain',
    'Tokyo, Japan',
    'Beijing, China',
    'Bangkok, Thailand',
    'Mumbai, India',
    'Dubai, United Arab Emirates',
    'Cairo, Egypt',
    'Marrakesh, Morocco',
    'Lagos, Nigeria',
]

southern_hemisphere_cities = [
    'Rio De Janeiro, Brazil',
    'Buenos Aires, Argentina',
    'Lima, Peru',
    'Santiago, Chile',
    'Cape Town, South Africa',
    'Nairobi, Kenya',
    'Sydney, Australia',
]

# Fail fast when the lists and the data disagree. The southern flag used to be
# derived as `1 - northern`, so any new, missing or misspelled location was
# silently tagged as southern-hemisphere.
doubly_listed = sorted(set(northern_hemisphere_cities) & set(southern_hemisphere_cities))
if doubly_listed:
    raise ValueError(f"Locations listed in both hemispheres: {doubly_listed}")

is_northern = data['location'].isin(northern_hemisphere_cities)
is_southern = data['location'].isin(southern_hemisphere_cities)
unclassified = data.loc[~(is_northern | is_southern), 'location'].unique().tolist()
if unclassified:
    raise ValueError(f"Locations without a hemisphere assignment: {unclassified}")

# 0/1 integer indicators, one per hemisphere (explicit int64 keeps the dtype
# the same on every platform).
data["in_northern_hemisphere"] = is_northern.astype("int64")
data["in_southern_hemisphere"] = is_southern.astype("int64")

# Display one row per distinct (location, flags) combination for a visual check.
data[['location', 'in_northern_hemisphere', 'in_southern_hemisphere']].drop_duplicates()

Unnamed: 0,location,in_northern_hemisphere,in_southern_hemisphere
0,"New York City, United States of America",1,0
1096,"Los Angeles, United States of America",1,0
2192,"Toronto, Canada",1,0
3288,"Mexico City, Mexico",1,0
4384,"Vancouver, Canada",1,0
5480,"Rio De Janeiro, Brazil",0,1
6576,"Buenos Aires, Argentina",0,1
7672,"Lima, Peru",0,1
8768,"Santiago, Chile",0,1
9864,"Bogota, Colombia",1,0


In [124]:
# Series identifier and time-axis columns.
series_col = "location"
time_col = epoch_col = 'date'
epoch_label = "day"

# Column to be predicted.
target_col = 'weather_descriptions'

# Exogenous feature columns, in the order they are written to the output files.
exog_cols = [
    'mintemp',
    'avgtemp',
    'sunhour',
    'uv_index',
    'wind_speed',
    'wind_degree',
    'precip',
    'humidity',
    'visibility',
    'pressure',
    'cloudcover',
    'heatindex',
    'dewpoint',
    'windchill',
    'windgust',
    'feelslike',
    'in_northern_hemisphere',
    'in_southern_hemisphere',
    'maxtemp',
]

In [125]:
# Strip the list wrapper from the target labels. The raw values are presumably
# stringified one-element lists such as "['Sunny']" -- TODO confirm against the
# raw CSV. The pattern is anchored on the wrapper, so re-running this cell is a
# no-op; blind slicing with x[2:-2] would eat four more characters of every
# label on each re-run. Missing values pass through unchanged instead of raising.
data[target_col] = data[target_col].str.replace(
    r"^\[['\"](.*)['\"]\]$", r"\1", regex=True
)

# Train/Test Split

In [127]:
# Fraction of each series held out, taken from the end of its timeline.
test_size = 0.2

# Chronological hold-out per series: the last `test_size` share of each
# location's rows becomes the test set and everything before it is training data.
train_parts, test_parts = [], []
for _, group in data.groupby(series_col):
    # Order by time explicitly so "last rows" really means "most recent rows",
    # whatever the row order of the input file was.
    group = group.sort_values(by=time_col, kind="stable")

    num_test_samples = int(len(group) * test_size)
    # Split on a positive index: with negative slicing a series too short to
    # yield any test row (num_test_samples == 0) would put *every* row in the
    # test set, because `iloc[-0:]` selects the whole frame.
    split_at = len(group) - num_test_samples

    train_parts.append(group.iloc[:split_at])
    test_parts.append(group.iloc[split_at:])

train_df = pd.concat(train_parts)
test_df = pd.concat(test_parts)

# The key file keeps the held-out targets; the test file itself must not
# contain the target column.
test_key = test_df[[series_col, time_col, target_col]]
test_df = test_df.drop(columns=[target_col])

# Save Data Files

In [128]:
# Columns shared by every feature file: series id, timestamp, then exogenous features.
all_cols = [series_col, time_col, *exog_cols]
labelled_cols = all_cols + [target_col]

# The full dataset is written ordered by series and time (sorted in place).
data.sort_values(by=[series_col, time_col], inplace=True)

# Destination path -> frame to write. Only the full and training files carry
# the target; the test targets go to the separate key file.
csv_exports = {
    outp_fname: data[labelled_cols],
    train_fname: train_df[labelled_cols],
    test_fname: test_df[all_cols],
    test_key_fname: test_key,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False, float_format="%.1f")