In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import os 
import random
import warnings

import torch

warnings.filterwarnings("ignore")

SEED_VALUE = 100
os.environ['PYTHONHASHSEED'] = str(SEED_VALUE)
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)
torch.manual_seed(SEED_VALUE)
torch.cuda.manual_seed(SEED_VALUE)
torch.cuda.manual_seed_all(SEED_VALUE)


torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
# df = pd.read_csv("./data/city_pollution_data.csv")
df = pd.read_csv("./data/US2019to2023.csv")


DROP_ONEHOT = True
SEQ_LENGTH = 5


if DROP_ONEHOT:
  INPUT_DIM = 8
else:
  INPUT_DIM = 29

HIDDEN_DIM = 32
LAYER_DIM = 3


normalization_type = 'mean_std' # 'max', mean_std

In [3]:
import datetime

def get_train_test_data(df):
  # we'll mostly need median and variance values of features for most of our needs

  for col in df.columns:
    for x in ["Country", "min", "max", "count", "County", "past_week", "latitude", "longitude", "State", "variance"]:
      if x in col:
        df.drop([col], axis=1, inplace=True)

  # df["Population Staying at Home"] = df["Population Staying at Home"].apply(lambda x: x.replace(",", ""))
  # df["Population Not Staying at Home"] = df["Population Not Staying at Home"].apply(lambda x: x.replace(",", ""))

  # Now we want 2 more features. Which day of week it is and which month it is.
  # Both of these will be one-hot and hence we'll add 7+12 = 19 more columns.
  # Getting month id is easy from the datetime column. 
  # For day of week, we'll use datetime library.
  
  df['weekday'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").weekday())
  df['month'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").month - 1)

  # using one-hot on month and weekday
  weekday_onehot = pd.get_dummies(df['weekday'])
  weekday_onehot.columns = ["day_"+str(x) for x in weekday_onehot]
  month_onehot = pd.get_dummies(df['month'])
  month_onehot.columns = ["month_"+str(x) for x in month_onehot]

  df.drop(['weekday', 'month'], axis=1, inplace=True)
  df = df.join([weekday_onehot, month_onehot])

  cities_list = list(set(df['City']))
  print(cities_list)
  cities_list.sort()
  print(cities_list)
  city_df = {}
  test_indices_of_cities = {}
  train_set = {}
  test_set = {}
  TEST_SET_SIZE = 60                                        

  for city in cities_list:
    city_df[city] = df[df['City'] == city].sort_values('Date').reset_index()
    for col in city_df[city].columns:
      if col in ["median_pm25", "median_o3", "median_so2", "median_no2", "median_pm10", "median_co"]:
        continue
      try:  
        _mean = np.nanmean(city_df[city][col])
        if np.isnan(_mean) == True:
          _mean = 0
        city_df[city][col] = city_df[city][col].fillna(_mean)
      except:
        pass
    if city_df[city].shape[0] < 600 :
      print("City with less than 600 data : {} {}".format(city_df[city].shape[0], city))
      del city_df[city]
      continue
    
    test_index_start = random.randint(0, city_df[city].shape[0] - TEST_SET_SIZE)
    test_indices_of_cities[city] = [test_index_start, test_index_start + TEST_SET_SIZE]

    test_set[city] = city_df[city].iloc[test_index_start:test_index_start + TEST_SET_SIZE]
    train_set[city] = city_df[city].drop(index=list(range(test_index_start, test_index_start + TEST_SET_SIZE)))

  return train_set, test_set

In [4]:
train_set, test_set = get_train_test_data(df)

cities_list = list(train_set.keys())

all_train = pd.DataFrame()
for city in cities_list:
  all_train = all_train.append(train_set[city], ignore_index=True)

all_test = pd.DataFrame({})
for city in test_set:
  all_test = all_test.append(test_set[city], ignore_index=True)

concat_df = pd.concat([all_train,all_test],axis=0)

['Queens', 'Little Rock', 'El Paso', 'Manhattan', 'Saint Paul', 'Salem', 'Austin', 'Las Vegas', 'Portland', 'Phoenix', 'Miami', 'Springfield', 'Staten Island', 'Richmond', 'Raleigh', 'San Diego', 'Indianapolis', 'Albuquerque', 'Charlotte', 'Omaha', 'Boise', 'Seattle', 'The Bronx', 'Fresno', 'Nashville', 'Denver', 'San Jose', 'Chicago', 'Los Angeles', 'Madison', 'Dallas', 'Columbus', 'Sacramento', 'Washington D.C.', 'Boston', 'Jackson', 'Jacksonville', 'Memphis', 'Oklahoma City', 'San Francisco', 'Tucson', 'Providence', 'Detroit', 'Baltimore', 'Oakland', 'Philadelphia', 'Fort Worth', 'San Antonio', 'Columbia', 'Salt Lake City', 'Brooklyn', 'Honolulu', 'Atlanta', 'Houston', 'Milwaukee', 'Hartford', 'Tallahassee']
['Albuquerque', 'Atlanta', 'Austin', 'Baltimore', 'Boise', 'Boston', 'Brooklyn', 'Charlotte', 'Chicago', 'Columbia', 'Columbus', 'Dallas', 'Denver', 'Detroit', 'El Paso', 'Fort Worth', 'Fresno', 'Hartford', 'Honolulu', 'Houston', 'Indianapolis', 'Jackson', 'Jacksonville', 'Las V

In [5]:
col_max = {}
col_mean = {}
col_mean2 = {}
col_std = {}

for city in cities_list:
  col_mean[city] = {}
  for col in train_set[city]:
    if col in ["index", "Date", "City"]:
      continue

    train_set[city][col] = train_set[city][col].astype("float")
    test_set[city][col] = test_set[city][col].astype("float")

    if col in ["median_pm25", "median_o3", "median_so2", "median_no2", "median_pm10", "median_co"]:
      _mean = np.nanmean(train_set[city][col])
      if np.isnan(_mean) == True:
        _mean = 0
      
      col_mean[city][col] = _mean
      train_set[city][col] = train_set[city][col].fillna(_mean)

    if normalization_type == 'mean_std':
      col_mean2[col] = np.nanmean(concat_df[col].astype("float"))
      col_std[col] = np.nanstd(concat_df[col].astype("float"))
      train_set[city][col] = (train_set[city][col] - col_mean2[col]) / (col_std[col] + 0.001)
      test_set[city][col] = (test_set[city][col] - col_mean2[col]) / (col_std[col] + 0.001)

    else:
      col_max[col] = concat_df[col].astype("float").max()
      train_set[city][col] = train_set[city][col] / (col_max[col] + 0.001)
      test_set[city][col] = test_set[city][col] / (col_max[col] + 0.001)

  if DROP_ONEHOT:
    train_set[city].drop(train_set[city].columns[-19:], axis=1, inplace=True)
    test_set[city].drop(test_set[city].columns[-19:], axis=1, inplace=True)


In [6]:
# number of data per city

print("num of cities : ", len(cities_list))
for city in cities_list:
    print(city+"({})".format(len(train_set[city])), end=", ")

num of cities :  57
Albuquerque(1613), Atlanta(1613), Austin(1613), Baltimore(1613), Boise(1613), Boston(1611), Brooklyn(1613), Charlotte(1613), Chicago(1613), Columbia(545), Columbus(1613), Dallas(1613), Denver(1612), Detroit(1545), El Paso(1613), Fort Worth(1610), Fresno(1613), Hartford(1613), Honolulu(1612), Houston(1613), Indianapolis(1593), Jackson(1613), Jacksonville(1613), Las Vegas(833), Little Rock(1613), Los Angeles(1613), Madison(1613), Manhattan(1613), Memphis(1613), Miami(1613), Milwaukee(1613), Nashville(1613), Oakland(1613), Oklahoma City(1613), Omaha(1613), Philadelphia(1613), Phoenix(1613), Portland(1613), Providence(1613), Queens(1613), Raleigh(1613), Richmond(1596), Sacramento(1613), Saint Paul(1613), Salem(1613), Salt Lake City(1613), San Antonio(1612), San Diego(1455), San Francisco(1613), San Jose(1613), Seattle(1613), Springfield(1605), Staten Island(1613), Tallahassee(1613), The Bronx(1613), Tucson(1583), Washington D.C.(1613), 

In [None]:
city = 'The Bronx'

In [29]:
test_set[city]['median_co']

53    -0.188060
54          NaN
55    -0.193684
56    -0.188060
57    -0.182436
58    -0.176812
59    -0.047459
60    -0.030587
61    -0.013715
62    -0.182436
63    -0.047459
64    -0.058707
65    -0.030587
66     0.008781
67    -0.008091
68    -0.075580
69    -0.041835
70    -0.188060
71    -0.047459
72    -0.058707
73    -0.013715
74    -0.081204
75    -0.188060
76    -0.176812
77    -0.069956
78    -0.030587
79    -0.075580
80    -0.064331
81    -0.188060
82    -0.182436
83    -0.064331
84    -0.064331
85    -0.188060
86    -0.081204
87     0.065021
88    -0.047459
89    -0.188060
90    -0.182436
91    -0.081204
92    -0.086828
93    -0.081204
94    -0.182436
95    -0.047459
96    -0.024963
97    -0.036211
98    -0.047459
99    -0.188060
100   -0.086828
101   -0.069956
102   -0.024963
103   -0.064331
104   -0.086828
105   -0.086828
106   -0.086828
107   -0.092452
108   -0.064331
109   -0.103700
110         NaN
111   -0.041835
112   -0.047459
Name: median_co, dtype: float64

In [40]:
nan = test_set[city]['median_co'][110]

In [43]:
# df['City']['Portland']
count = 0
for co in test_set[city]['median_co']:
    if np.isnan(co):
        count += 1
print(count)

2
