#### Objective:- Create table for generating race migration points.

In [5]:
import pandas as pd, numpy as np
import cudf, cupy as cp
import os

### DATA PREP

In [2]:
# df = cudf.read_csv('census_full/HI_DE_DC_mapped_data.csv').drop('Unnamed: 0',axis=1)
df = cudf.read_csv('data/mapped_data_with_race.csv',usecols=['ID20','R'],dtype={'ID20':'int64','STATE':'int32','COUNTY':'str','points':'int32'})
df.head()

Unnamed: 0,ID20,STATE,P_delta,points
0,10010201001000,1,-10,22
1,10010201001001,1,4,24
2,10010201001002,1,-23,23
3,10010201001003,1,4,8
4,10010201001005,1,-8,8


In [3]:
len(df)

6194258

#### Generate random points

In [10]:
df = df[df.STATE==12].reset_index(drop=True)

In [11]:
len(df)

150000

In [12]:
df = df.iloc[:150000]

In [13]:
df.head()

Unnamed: 0,index,ID20,STATE,P_delta,points
0,933607,120010002011000,12,-1,29
1,933608,120010002011001,12,-4,12
2,933609,120010002011002,12,-23,23
3,933610,120010002011003,12,-7,7
4,933611,120010002011004,12,2,16


In [14]:
df.tail()

Unnamed: 0,index,ID20,STATE,P_delta,points
149995,1083602,120830010101019,12,17,17
149996,1083603,120830010101020,12,0,8
149997,1083604,120830010101021,12,10,10
149998,1083605,120830010101022,12,6,6
149999,1083606,120830010101023,12,2,14


In [15]:
def random_points_in_polygon(number, polygon):
    # print(polygon)
    points_x = np.array([])
    points_y = np.array([])
    min_x, min_y, max_x, max_y = polygon.bounds
    i= 0
    while i < number:
        point_x = random.uniform(min_x, max_x)
        point_y = random.uniform(min_y, max_y)
        if polygon.contains(Point(point_x, point_y)):
            points_x = np.append(points_x, point_x)
            points_y = np.append(points_y, point_y)
            i += 1
    return points_x, points_y # returns list of points(lat), list of points(long)

def generate_data(state, df_temp, gpdf):
    t1 = datetime.datetime.now()
    geoid_index_df = df_temp.index.to_numpy()
    final_points_x = np.array([])
    final_points_y = np.array([])
    geoid = np.array([])
    # f=0
    for index, row in gpdf.iterrows():
        # f+=1
        points_x = np.array([])
        points_y = np.array([])
        geoid_temp = np.array([])
        
        if row['GEOID20'] in geoid_index_df and df_temp.loc[row['GEOID20']]>0:
            num_points = df_temp.loc[row['GEOID20']]
            polygon = row['geometry']
            
            if polygon is not None:
                points_x, points_y = random_points_in_polygon(num_points, polygon)
                # print(points_x,points_y)
                geoid_temp = np.array([row['GEOID20']]*len(points_x))
                geoid = np.append(geoid,geoid_temp)
                final_points_x = np.append(final_points_x, points_x)
                # print(final_points_x)
                final_points_y = np.append(final_points_y, points_y)
            print('Processing '+str(state)+' - Completed:', "{0:0.2f}".format((index/len(gpdf))*100), '%', end='')
            print('', end='\r')
        
        # if f==11:
        #     break

    print('Processing for '+str(state)+' complete \n total time', datetime.datetime.now() - t1)
    df_fin = cudf.DataFrame({'GEOID20': geoid,'x': final_points_x, 'y':final_points_y}) #,'COUNTY':county,'p_delta':p_delta,'p_net':p_net})
    df_fin.GEOID20 = df_fin.GEOID20.astype('int').astype('str')
    
    df_fin.to_csv('data/migration_population_with_race/population_FL1_%s'%str(state)+'.csv', index=False)
def exec_data(state_key_list):
    c=0
    for i in state_key_list:
        # print(i)
        c+=1
        if i< 10:
            i_str = '0'+str(i)
        else:
            i_str = str(i)
        # path = 'census_2020_data/nhgis0003_shape/nhgis0003_shapefile_tl2020_%s0_block_2020/%s_block_2020.shp'%(i_str,states[i])
        path ='data/tl_shapefiles/tl_2021_%s_tabblock20.shp'%(i_str)
        #print(path)
        print("started reading shape file for state ", states[i])
        if os.path.isfile(path):    
            gpdf = gpd.read_file(path)[['GEOID20', 'geometry']].sort_values('GEOID20').reset_index(drop=True)
            gpdf.GEOID20 = gpdf.GEOID20.astype('int64')
            gpdf = gpdf[gpdf.GEOID20<=120830010101023]
            print("completed reading shape file for state ", states[i])
            df_temp = df.query('STATE == @i')[['ID20', 'points']]
            df_temp.index = df_temp.ID20
            df_temp = df_temp['points']
            # print(gpdf.head(3))
            # print(df_temp)
            print("starting to generate data for "+str(states[i])+"... ")
            generate_data(states[i], df_temp, gpdf)
            del(df_temp)
        else:
            print("shape file does not exist")
            continue
        # if c==2:
        #     break 

In [16]:
# states = {1 :"AL",2 :"AK",4 :"AZ",5 :"AR",6 :"CA",8 :"CO",9 :"CT",10:"DE",11:"DC",12:"FL",13:"GA",15:"HI",
#           16:"ID",17:"IL",18:"IN",19:"IA",20:"KS",21:"KY",22:"LA",23:"ME",24:"MD",25:"MA",26:"MI",27:"MN",
#           28:"MS",29:"MO",30:"MT",31:"NE",32:"NV",33:"NH",34:"NJ",35:"NM",36:"NY",37:"NC",38:"ND",39:"OH",
#           40:"OK",41:"OR",42:"PA",44:"RI",45:"SC",46:"SD",47:"TN",48:"TX",49:"UT",50:"VT",51:"VA",53:"WA",
#           54:"WV",55:"WI",56:"WY",72:"PR"}

states = {12:"FL"} 
exec_data(states.keys())

started reading shape file for state  FL
completed reading shape file for state  FL
starting to generate data for FL... 
Processing for FL complete 100.00 %78.32 %
 total time 3:57:34.135479


#### Concat divided states

In [17]:
# part1 = cudf.read_csv('data/migration_population_with_race/population_FL1_FL.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})
# part2 = cudf.read_csv('data/migration_population_with_race/population_FL2_FL.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})
# part3 = cudf.read_csv('data/migration_population_with_race/population_FL3_FL.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})
# # part4 = cudf.read_csv('data/migration_population_with_race/population_NY4_NY.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})

In [24]:
# fl_df = cudf.concat([part1,part2,part3])

In [26]:
# df[df.STATE==12]['points'].sum()

In [13]:
# fl_df.to_csv('data/migration_population_with_race/population_FL.csv')

### Concat States

In [2]:
def merge_shape_and_states(state_key_list):
    concat_states = cudf.DataFrame()
    
    for i in state_key_list:
        if i< 10:
            i_str = '0'+str(i)
        else:
            i_str = str(i)
        path = 'data/migration_population_with_race//population_%s'%str(states[i])+'.csv'
        if os.path.isfile(path):    
            temp = cudf.read_csv(path,dtype={'ID20':'int64','x':'float32','y':'float32'})# Load shape files
            concat_states = cudf.concat([concat_states,temp])
        else:
            print(i,states[i])
            print("shape file does not exist")
            continue
    return concat_states

In [3]:
states = {1 :"AL",2 :"AK",4 :"AZ",5 :"AR",6 :"CA",8 :"CO",9 :"CT",10:"DE",11:"DC",12:"FL",13:"GA",15:"HI",
          16:"ID",17:"IL",18:"IN",19:"IA",20:"KS",21:"KY",22:"LA",23:"ME",24:"MD",25:"MA",26:"MI",27:"MN",
          28:"MS",29:"MO",30:"MT",31:"NE",32:"NV",33:"NH",34:"NJ",35:"NM",36:"NY",37:"NC",38:"ND",39:"OH",
          40:"OK",41:"OR",42:"PA",44:"RI",45:"SC",46:"SD",47:"TN",48:"TX",49:"UT",50:"VT",51:"VA",53:"WA",
          54:"WV",55:"WI",56:"WY",72:"PR"}

In [6]:
indv_df = merge_shape_and_states(states.keys()).drop('Unnamed: 0',axis=1)
indv_df.rename(columns={'GEOID20':'ID20'},inplace=True)
indv_df.head()

Unnamed: 0,ID20,x,y
0,10010201001000,-86.480591,32.469173
1,10010201001000,-86.478142,32.470337
2,10010201001000,-86.478485,32.471489
3,10010201001000,-86.479645,32.469475
4,10010201001000,-86.479912,32.471939


In [8]:
# cpu_df = indv_df.to_pandas()

In [9]:
# cpu_df.to_csv('data/final_data_with_race.csv')

In [10]:
len(cpu_df)

182532663

In [9]:
dataset = indv_df.merge(df,on='ID20',how='left').sort_values('ID20')
dataset.head()

Unnamed: 0,ID20,x,y,STATE,P_delta,points
4496,10010201001000,-86,32,1,-10,22
4497,10010201001000,-86,32,1,-10,22
4498,10010201001000,-86,32,1,-10,22
4499,10010201001000,-86,32,1,-10,22
4500,10010201001000,-86,32,1,-10,22


In [10]:
dataset['P_net'] = dataset['P_delta'].apply(lambda x: -1 if x < 0 else ( 1 if x>0 else 0))

In [14]:
cdf = dataset.to_pandas()
cdf.head()

Unnamed: 0,ID20,x,y,STATE,P_delta,points,P_net
4496,10010201001000,-86,32,1,-10,22,-1
4497,10010201001000,-86,32,1,-10,22,-1
4498,10010201001000,-86,32,1,-10,22,-1
4499,10010201001000,-86,32,1,-10,22,-1
4500,10010201001000,-86,32,1,-10,22,-1


In [15]:
# cdf.to_csv('data/final_data_with_race.csv')