# Build A Matrices - with New York City

According to the website below:
"In some instances, we report data from multiple counties or other non-county geographies as a single county. For instance, we report a single value for New York City, comprising the cases for New York, Kings, Queens, Bronx and Richmond Counties."

https://github.com/nytimes/covid-19-data

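Because the case data treats these 5 counties as one unit, I combine their flows into a single "New York City" county, which is given the placeholder FIPS code 36998 in the final A matrices.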

## Where the data is from
**Daily Flows:** 

URL: https://github.com/GeoDS/COVID19USFlows-DailyFlows/tree/master/daily_flows/county2county

Date Range: 01/21/2020 to 04/15/2021 

In [2]:
#below are the packages I will be using
import pandas as pd
import numpy as np
import datetime as dt

# Path Variables
The default paths below are relative paths intended for use inside the GitHub descon-uccs/pandemic-data repository.

In [34]:
#Paper input file path: CSV with one row per date and FIPS code; it supplies the
    #'fips', 'date' and 'population' columns that the rest of this notebook reads
paperFilepath = '../Data/Jupyter Notebook Input/Final_Paper_Data_avg_cases.csv'

#file path for where the A matrices that are created will be stored; each output file name is
    #this prefix followed by the flow date and "_A_matrix.csv"
AMatrixFilepath = '../Data/A Matrices/'

In [8]:
def create_matrix_indices_dictionary(matrix_data):
    """Map each FIPS code to its row/column index in the pre-merge flow matrix.

    The unique values of ``matrix_data['fips']`` are sorted ascending and
    numbered from 0. The five counties that make up New York City are then
    numbered after them, in the fixed order listed below, so they are always
    the last five keys of the returned dictionary.

    Args:
        matrix_data: DataFrame with a 'fips' column.

    Returns:
        dict mapping FIPS code -> consecutive integer index (0 .. len - 1).
    """
    #FIPS of the counties that make up New York City
    nyc_county_fips = [36005, 36047, 36061, 36081, 36085]

    #remove the NYC counties if the data already lists them; otherwise they would appear twice,
        #leaving gaps in the indices and breaking the "NYC counties come last" ordering
    other_fips = sorted(set(matrix_data['fips'].tolist()) - set(nyc_county_fips))

    return {fips: index for index, fips in enumerate(other_fips + nyc_county_fips)}

In [9]:
def create_final_CSV_matrix_indices_dictionary(matrix_data):
    """Map each FIPS code to its row/column index in the final (merged) A matrix.

    Unlike the pre-merge dictionary, this one has no entries for the five
    individual New York City counties. It contains the unique values of
    ``matrix_data['fips']`` plus the single code 36998, all sorted ascending
    and numbered from 0.

    Args:
        matrix_data: DataFrame with a 'fips' column.

    Returns:
        dict mapping FIPS code -> consecutive integer index (0 .. len - 1).
    """
    #FIPS code used for the combined New York City county
    nyc_combined_fips = 36998

    #the set union keeps 36998 from being counted twice if the data already contains it,
        #which would otherwise leave a gap in the indices
    fips_codes = sorted(set(matrix_data['fips'].tolist()) | {nyc_combined_fips})

    return {fips: index for index, fips in enumerate(fips_codes)}

In [10]:
def get_population_vector(dataset, FIPS_dictionary):
    """Return the populations that match the rows of the pre-merge flow matrix.

    The vector is used to divide each row of the flow matrix by the
    population of the county that the row corresponds to.

    Every key of ``FIPS_dictionary`` except the last five is looked up in
    ``dataset``; the population from the first row with that FIPS code is
    used. The last five keys are assumed to be the New York City counties
    [36005, 36047, 36061, 36081, 36085] in that order, and their populations
    are hard-coded below.

    Args:
        dataset: DataFrame with 'fips' and 'population' columns.
        FIPS_dictionary: dict of FIPS code -> matrix index whose key order is
            the matrix row order and whose last five keys are the NYC counties.

    Returns:
        list of populations, one per key of ``FIPS_dictionary``, in key order.

    Raises:
        IndexError: if a non-NYC FIPS code has no row in ``dataset`` (same
            exception type as the previous per-FIPS ``.iloc[0]`` lookup).
    """
    #populations for counties [36005, 36047, 36061, 36081, 36085], which make up New York City
    nyc_county_populations = [1418207, 2559903, 1628706, 2253858, 476143]

    regular_fips = list(FIPS_dictionary)[:-5]

    #build the FIPS -> population lookup once instead of scanning the whole dataset for every
        #FIPS code; drop_duplicates keeps the first row per FIPS, matching the old .iloc[0]
    first_rows = dataset.drop_duplicates(subset='fips')
    population_by_fips = dict(zip(first_rows['fips'].tolist(),
                                  first_rows['population'].tolist()))

    missing_fips = [fips for fips in regular_fips if fips not in population_by_fips]
    if missing_fips:
        raise IndexError("no population found in dataset for FIPS: " + str(missing_fips))

    return [population_by_fips[fips] for fips in regular_fips] + nyc_county_populations

In [11]:
def create_A_matrix(matrix_data, dataset, FIPS_dictionary):
    """Build the population-normalised flow matrix with New York City merged.

    Steps:
      1. Fill a square matrix (rows = origin, columns = destination) with
         ``pop_flows`` using the indices in ``FIPS_dictionary``.
      2. Divide each row by the population of the county for that row.
      3. Combine the five New York City counties (the last five rows/columns)
         into a single row/column placed where FIPS 36998 falls in sorted
         FIPS order, and remove the five individual rows/columns.

    Args:
        matrix_data: DataFrame with 'geoid_o', 'geoid_d' and 'pop_flows' columns.
        dataset: DataFrame passed through to ``get_population_vector``.
        FIPS_dictionary: dict of FIPS code -> matrix index whose last five
            indices belong to the NYC counties.

    Returns:
        (temp_df, transposed): a DataFrame of the merged matrix with origins
        as rows, and a numpy array of its transpose (columns are the origin,
        rows are the destination).

    Raises:
        KeyError: if a 'geoid_o' / 'geoid_d' value is not in ``FIPS_dictionary``.
    """
    nyc_county_count = 5
    nyc_combined_fips = 36998

    matrix_size = len(FIPS_dictionary)
    regular_count = matrix_size - nyc_county_count

    #row/column where the combined New York City county belongs: the number of non-NYC FIPS that
        #sort before 36998 (this was the hard-coded 1880 for the 3137-FIPS dictionary)
    nyc_position = sum(
        1
        for fips, index in FIPS_dictionary.items()
        if index < regular_count and fips < nyc_combined_fips
    )

    #below is a matrix where origin are rows and destination are columns
    matrix = np.zeros((matrix_size, matrix_size))

    #if an origin/destination pair is repeated, keep the last occurrence, which is what assigning
        #the flows one row at a time would leave in the matrix
    unique_pairs = matrix_data.drop_duplicates(subset=['geoid_o', 'geoid_d'], keep='last')
    origins = np.array([FIPS_dictionary[fips] for fips in unique_pairs['geoid_o'].tolist()],
                       dtype=np.intp)
    destinations = np.array([FIPS_dictionary[fips] for fips in unique_pairs['geoid_d'].tolist()],
                            dtype=np.intp)
    matrix[origins, destinations] = unique_pairs['pop_flows'].to_numpy()

    #below I create a vector of each of the populations
    population_vector = np.array(get_population_vector(dataset, FIPS_dictionary))

    #below I divide each row of the matrix by the population of the county that corresponds to that row
    new_matrix = matrix / population_vector[:, np.newaxis]

    #below I combine the 5 New York counties into a single county called New York City county;
        #nansum treats NaN as 0, which is what the pandas sum used previously did by default

    #NOTE(review): the 5 NYC rows are summed AFTER each was divided by its own county population,
        #so the combined row is a sum of per-county rates rather than (total NYC flow) / (total NYC
        #population) - confirm this is the intended normalisation
    nyc_row = np.nansum(new_matrix[regular_count:, :], axis=0)
    with_nyc_row = np.insert(new_matrix, nyc_position, nyc_row, axis=0)

    #the 5 NYC columns are still the last 5 columns here; summing them per row (including the new
        #combined row) gives the combined New York City column
    nyc_column = np.nansum(with_nyc_row[:, regular_count:], axis=1)
    with_nyc = np.insert(with_nyc_row, nyc_position, nyc_column, axis=1)

    #the 5 individual NYC rows/columns are now the last 5 of each axis, so slicing removes them
    merged_size = regular_count + 1
    final_matrix = with_nyc[:merged_size, :merged_size]

    temp_df = pd.DataFrame(final_matrix)

    #below I return the transpose, which means columns are now the origin while rows are the destination
    return temp_df, np.transpose(final_matrix)

## Begin to build the A Matrices

In [12]:
#load the paper's CSV; its 'fips' column decides which FIPS codes are included in the A matrices
dataset = pd.read_csv(paperFilepath)

#parse the 'date' column into pandas datetimes
dataset = dataset.assign(date=pd.to_datetime(dataset['date']))
dataset

Unnamed: 0,date,fips,state,county,cases,vaccinations,population
0,2020-03-24,1001,Alabama,Autauga,0.14,0,55869
1,2020-03-25,1001,Alabama,Autauga,0.57,0,55869
2,2020-03-26,1001,Alabama,Autauga,0.86,0,55869
3,2020-03-27,1001,Alabama,Autauga,0.86,0,55869
4,2020-03-28,1001,Alabama,Autauga,0.86,0,55869
...,...,...,...,...,...,...,...
1185072,2021-04-11,56045,Wyoming,Weston,0.75,1379,6927
1185073,2021-04-12,56045,Wyoming,Weston,0.86,1379,6927
1185074,2021-04-13,56045,Wyoming,Weston,0.88,1380,6927
1185075,2021-04-14,56045,Wyoming,Weston,1.00,1392,6927


In [30]:
#below I create a dictionary that is used to build the matrices where keys are the FIPS we decided to look at while 
    #values are the row/column that correspond to each FIPS code; this version still has the 5 individual New York
    #City counties, which are merged later inside create_A_matrix
FIPS_dictionary = create_matrix_indices_dictionary(dataset) 

In [31]:
#size check: number of FIPS codes from the dataset plus the 5 New York City counties
len(FIPS_dictionary)

3137

In [53]:
#FIPS codes used to filter the daily flow files down to the counties we keep
desired_fips = [*FIPS_dictionary]
len(desired_fips)

3137

In [51]:
#below I create another FIPS dictionary that does not have the 5 New York Counties; instead it has the single
    #code 36998, and it is used to label the rows/columns of the A matrices that are saved
FIPS_dictionary2 = create_final_CSV_matrix_indices_dictionary(dataset)
len(FIPS_dictionary2)

3133

In [16]:
#row/column index that the combined New York City county (36998) gets in the final A matrices
FIPS_dictionary2[36998]

1880

In [17]:
#distinct dates present in the paper dataset, used to check the first and last date it covers
paper_dates1 = dataset["date"]
result1 = paper_dates1.drop_duplicates().to_list()

earliest_date, latest_date = min(result1), max(result1)
print(earliest_date, latest_date)

2020-01-21 00:00:00 2021-04-15 00:00:00


In [18]:
#first and last day (inclusive) of the daily flows we will be using
start_flow_date = dt.date(2020, 1, 21)
end_flow_date = dt.date(2021, 4, 15)

delta = end_flow_date - start_flow_date

#one "YYYY_MM_DD" string per day, which is the date format used in the daily flow file names
daily_flow_dates = [
    (start_flow_date + dt.timedelta(days=offset)).strftime("%Y_%m_%d")
    for offset in range(delta.days + 1)
]

In [19]:
#check that the generated list starts and ends on the expected days
print("Start Date: ", daily_flow_dates[0])

print("End Date: ", daily_flow_dates[-1])

Start Date:  2020_01_21
End Date:  2021_04_15


In [20]:
#number of days (and therefore A matrices) in the date range
len(daily_flow_dates)

451

**Build A Matrices**

In [72]:
#NOTE: it takes a little over 23 minutes to create 10 A matrices and save them in a CSV
filename_template = 'https://raw.githubusercontent.com/GeoDS/COVID19USFlows-DailyFlows/master/daily_flows/county2county/daily_county2county_'

#some CSVs used column name date_range while others used date, due to this I decided to not include that column in
    #the dataframe
col_list = ["geoid_o", "geoid_d", "pop_flows"]

#FIPS codes of the final matrix (5 New York counties replaced by 36998), in row/column order; this does not
    #change between dates so it is built once before the loop
cols = list(FIPS_dictionary2.keys())

for date in daily_flow_dates:

    filename = filename_template + date + '.csv'

    temp_data = pd.read_csv(filename, usecols=col_list, dtype={"geoid_o": int, "geoid_d": int})

    #keep only the flows whose origin AND destination are FIPS codes we decided to look at
    temp_data = (temp_data[temp_data['geoid_o'].isin(desired_fips)]).reset_index(drop=True)
    data = (temp_data[temp_data['geoid_d'].isin(desired_fips)]).reset_index(drop=True)

    matrix, transpose_matrix = create_A_matrix(data, dataset, FIPS_dictionary)

    #below I place the A matrix in a dataframe and change the column names to the FIPS codes they correspond to, I also
        #add a column with the FIPS codes so that it is easy to see what index/row/column the FIPS correspond to
    transpose_matrix_df = pd.DataFrame(transpose_matrix)

    #assign the columns directly: set_axis(..., inplace=True) is not accepted by pandas 2.0 and later
    transpose_matrix_df.columns = cols
    transpose_matrix_df.insert(0, 'FIPS index', cols)

    csv_file_path = AMatrixFilepath + date + "_A_matrix.csv"
    transpose_matrix_df.to_csv(csv_file_path)

print("Complete!")

Complete!
