In [19]:
import numpy as np
import pandas as pd
import pdb

In [20]:
#To run this... download data files how they're stored in Google Drive (i.e. separate folders for Arizona, California, Florida, and Other States) 
# as well as summary files (AZ_states.csv and Other_states.csv)

#Output: 3 dataframes (train_data, val_data, test_data)

# First get file names organized 
# AZ_counties and Other_states are formatted differently
# Start with Other_states
names=pd.read_csv("Data/Other_states.csv");
#Header is: County, Lat, Long, Subfolder, Val, Actual State
names=names.values[:];

# Train and validate filenames will be the same
train_filenames=[]
test_filenames=[]

train_city_names=[]
test_city_names=[]


for i in range(0,len(names)):
    #California and Florida have their own subfolder, all other counties are in "Other States" folder
    name="Data/"+str(names[i,3])+"/2011-2020_rcp45_CanESM2_"+str(names[i,0])
    #testing data has val=1
    if names[i,4]==1:
        try:
            #does rcp45 exist for all counties? Not sure, so take rcp85 if it doesn't
            pd.read_csv(name+".csv");
            #file name we'll open later to concatenate data
            test_filenames.append(name)
            #keep track of city/state now to make it easier later
            test_city_names.append(str(names[i,0])+","+str(names[i,5]))
        except:
            name="Data/"+str(names[i,3])+"/2011-2020_rcp85_CanESM2_"+str(names[i,0])
            test_filenames.append(name)
            test_city_names.append(str(names[i,0])+","+str(names[i,5]))
    #training data has val=0
    #repeat same process as above for training/validation subsets
    else:
        try:
            pd.read_csv(name+".csv");
            train_filenames.append(name)
            train_city_names.append(str(names[i,0])+","+str(names[i,5]))
        except:
            name="Data/"+str(names[i,3])+"/2011-2020_rcp85_CanESM2_"+str(names[i,0])
            train_filenames.append(name)
            train_city_names.append(str(names[i,0])+","+str(names[i,5]))
            

#Same process as above for AZ counties. Only change is naming convention
names=pd.read_csv("Data/AZ_states.csv");
names=names.values[:];

for i in range(0,len(names)):
    name="Data/Arizona/historical_rcp45_"+str(names[i,1])+"_"+str(names[i,2])
    #testing data has val=1
    if names[i,4]==1:
        try:
            pd.read_csv(name+".csv");
            test_filenames.append(name)
            test_city_names.append(str(names[i,0])+","+str(names[i,3]))
        except:
            name="Data/Arizona/historical_rcp85_"+str(names[i,1])+"_"+str(names[i,2])
            test_filenames.append(name)
            test_city_names.append(str(names[i,0])+","+str(names[i,3]))
    #training data has val=0
    else:
        try:
            pd.read_csv(name);
            train_filenames.append(name)
            train_city_names.append(str(names[i,0])+","+str(names[i,3]))
        except:
            name="Data/Arizona/historical_rcp85_"+str(names[i,1])+"_"+str(names[i,2])
            train_filenames.append(name)
            train_city_names.append(str(names[i,0])+","+str(names[i,3]))

#Save all of these as arrays for reference
np.save('train_city_names',np.asarray(train_city_names))
np.save('test_city_names',np.asarray(test_city_names))
np.save('train_names',np.asarray(train_filenames))
np.save('test_names',np.asarray(test_filenames))

In [21]:
#function to get all of the indexes of the years I want
get_indexes = lambda x, xs: [i for (y, i) in zip(xs, range(len(xs))) if x == y]

#load in all those arrays we saved last time
#filenames
train_names=np.load('train_names.npy', allow_pickle=True)
test_names=np.load('test_names.npy', allow_pickle=True)
#city names
tr_cities=np.load('train_city_names.npy', allow_pickle=True)
te_cities=np.load('test_city_names.npy', allow_pickle=True)

train_cities=[]
test_cities=[]
val_cities=[]

#indexes city names
index=0

#initialize dataframes so we can concatenate later
train_fin=np.zeros((1,10))
train_fin=pd.DataFrame(train_fin)
val_fin=np.zeros((1,10))
val_fin=pd.DataFrame(val_fin)
for name in train_names:
    #training data from 2011-2018, validation data from 2019-2020
    tr_yr=2011
    val_yr=2019
    data=pd.read_csv(name+".csv")
    data=data.values[:]
    #Training data
    while tr_yr<2019:
        #AZ data doesn't have same years as other states, so check to see where beginning of the year is for each file
        start=(get_indexes(tr_yr, data[:,0]))
        #Concatenate training dataframe with 365 days for each year (ignore leap years)
        if not np.isnan(data[start[0]:start[0]+365,:]).any():
            train_fin=pd.concat([train_fin,pd.DataFrame(data[start[0]:start[0]+365,:])])
            #Add associated city for each of rows corresponding to a year
            for i in range(0,365):
                train_cities.append(tr_cities[index])   
        tr_yr+=1
        
    #Validation data
    #Same process for validation data, just different years
    while val_yr<2021:
        start=(get_indexes(val_yr, data[:,0]))
        if not np.isnan(data[start[0]:start[0]+365,:]).any():
            val_fin=pd.concat([val_fin,pd.DataFrame(data[start[0]:start[0]+365,:])])
            for i in range(0,365):
                val_cities.append(tr_cities[index])   
        val_yr+=1
    index+=1

#Testing counties are in different csv files, so handle those now. Same process as training/validation
index=0
test_fin=np.zeros((1,10))
test_fin=pd.DataFrame(test_fin)
for name in test_names:
    #get all years (2011-2020) for testing counties. Narrow down later as needed
    te_yr=2011
    data=pd.read_csv(name+".csv")
    data=data.values[:]
        
    #Testing data
    while te_yr<2021:
        start=(get_indexes(te_yr, data[:,0]))
        if not np.isnan(data[start[0]:start[0]+365,:]).any():
            test_fin=pd.concat([test_fin,pd.DataFrame(data[start[0]:start[0]+365,:])])
            for i in range(0,365):
                test_cities.append(te_cities[index])   
        te_yr+=1
    index+=1

#Erase the junk first row used to initialize dataframes
train_fin = train_fin.iloc[1:]
test_fin = test_fin.iloc[1:]
val_fin = val_fin.iloc[1:]

#Convert city arrays to dataframes
train_cities = pd.DataFrame(train_cities)
test_cities = pd.DataFrame(test_cities)
val_cities = pd.DataFrame(val_cities)

#Make sure indexes align so dataframes can be concatenated
train_fin.reset_index(drop=True, inplace=True)
train_cities.reset_index(drop=True, inplace=True)
test_fin.reset_index(drop=True, inplace=True)
test_cities.reset_index(drop=True, inplace=True)
val_fin.reset_index(drop=True, inplace=True)
val_cities.reset_index(drop=True, inplace=True)

#concatenate and save :)
train_data=pd.concat([train_cities, train_fin], axis=1)
test_data=pd.concat([test_cities, test_fin], axis=1)
validate_data=pd.concat([val_cities, val_fin], axis=1)

short=train_data.values
short=short[:,4:]
short=short.astype("float32")
print(np.isnan(short).any())

short=test_data.values
short=short[:,4:]
short=short.astype("float32")
print(np.isnan(short).any())


short=validate_data.values
short=short[:,4:]
short=short.astype("float32")
print(np.isnan(short).any())


pdb.set_trace()

train_data.to_pickle("train_data")
test_data.to_pickle("test_data")
validate_data.to_pickle("val_data")

False
False
False
--Return--
> <ipython-input-21-d855c50884aa>(113)<module>()->None
-> pdb.set_trace()


(Pdb)  c
