#### Objective: Assign race, net and county columns to migration points - create final total population dataset

In [2]:
import cudf, cupy as cp
import pandas as pd,numpy as np
import random, pickle
pd.set_option('display.max_colwidth', 100000)

In [8]:
# Load the combined point table and renumber its rows 0..n-1.
points_path = 'data/total_parts_combined.parquet'
df = pd.read_parquet(points_path)
df = df.reset_index(drop=True)
# df = cudf.from_pandas(df)

In [3]:
# Per-block attribute table (ID20, COUNTY, P20, eq_P10, block_diff, block_net).
attr_path = 'data/total_attr_gen_df.parquet'
adf = pd.read_parquet(attr_path)
adf.head()

Unnamed: 0,ID20,COUNTY,P20,eq_P10,block_diff,block_net
0,10010201001000,Autauga County,21,31.0,-10.0,-1
1,10010201001001,Autauga County,34,30.0,4.0,1
2,10010201001002,Autauga County,29,52.0,-23.0,-1
3,10010201001003,Autauga County,17,13.0,4.0,1
4,10010201001004,Autauga County,0,0.0,0.0,0


In [4]:
# Number of points each block contributes:
#   block_net >= 0 -> P20            (one point per 2020 resident)
#   block_net <  0 -> P20 + eq_P10   (2020 residents plus 2010-equivalent residents)
# Vectorised with np.where instead of a row-wise apply: the original lambda
# indexed the label-indexed row by position (row[0], row[-1]), which pandas
# has deprecated, and it ran a Python call per block.
adf['points'] = np.where(
    adf['block_net'] >= 0,
    adf['P20'],
    adf['P20'] + adf['eq_P10'],
)
adf.head()

Unnamed: 0,ID20,COUNTY,P20,eq_P10,block_diff,block_net,points
0,10010201001000,Autauga County,21,31.0,-10.0,-1,52.0
1,10010201001001,Autauga County,34,30.0,4.0,1,34.0
2,10010201001002,Autauga County,29,52.0,-23.0,-1,81.0
3,10010201001003,Autauga County,17,13.0,4.0,1,17.0
4,10010201001004,Autauga County,0,0.0,0.0,0,0.0


In [5]:
# Total number of points across all blocks.
adf['points'].sum()

504475979.0

#### Compute races

In [6]:
# 2020 block-level NHGIS extract: read the block id (GEOCODE), U7B001 and the
# U7B003..U7B009 counts, then rename GEOCODE to ID20.
full_2020_path = 'data/nhgis0007_csv/nhgis0007_ds248_2020_block.csv'
race_count_cols = ['U7B003', 'U7B004', 'U7B005', 'U7B006', 'U7B007', 'U7B008', 'U7B009']
column_dtypes = {'GEOCODE': 'int64'}
column_dtypes.update({col: 'int32' for col in race_count_cols})
races = cudf.read_csv(
    full_2020_path,
    usecols=['GEOCODE', 'U7B001'] + race_count_cols,
    encoding='unicode_escape',
    dtype=column_dtypes,
)
races.rename(columns={'GEOCODE': 'ID20'}, inplace=True)

In [7]:
# Copy the migration attributes from adf onto the race table.
# NOTE(review): there is no join on ID20 here; this presumes races and adf
# list the same blocks in the same row order — confirm.
for race_col, attr_col in (('block_net', 'block_net'), ('P10', 'eq_P10'), ('points', 'points')):
    races[race_col] = adf[attr_col]

In [8]:
# Convert the cudf frame to a pandas DataFrame for the list-building steps.
craces = races.to_pandas()

In [9]:
# Shorter column names: U7B001 becomes P20 and U7B003..U7B009 become R1..R7.
race_renames = {f'U7B00{suffix}': f'R{suffix - 2}' for suffix in range(3, 10)}
craces.rename(columns={'U7B001': 'P20', **race_renames}, inplace=True)
craces.head(6)

Unnamed: 0,ID20,P20,R1,R2,R3,R4,R5,R6,R7,block_net,P10,points
0,10010201001000,21,12,4,0,0,0,0,5,-1,31.0,52.0
1,10010201001001,34,18,11,0,2,0,1,2,1,30.0,34.0
2,10010201001002,29,24,2,0,0,0,2,1,-1,52.0,81.0
3,10010201001003,17,16,0,0,0,0,0,1,1,13.0,17.0
4,10010201001004,0,0,0,0,0,0,0,0,0,0.0,0.0
5,10010201001005,0,0,0,0,0,0,0,0,-1,8.0,8.0


In [10]:
# Drop blocks that have no 2020 population and no 2010 population.
# .copy() makes the result an independent frame, so the column assignments
# in the following cells do not operate on a filtered slice
# (SettingWithCopyWarning).
craces = craces[(craces.P20 != 0) | (craces.P10 != 0)].copy()

In [11]:
# Sanity check: all P20 residents plus the P10 residents of blocks with
# block_net == -1 should equal the total point count computed from adf.
shrinking = craces['block_net'] == -1
craces['P20'].sum() + craces.loc[shrinking, 'P10'].sum()

504475979.0

In [12]:
# P10 came from the float column eq_P10; cast it to int32 so it can be used
# as a list repeat count below.
craces['P10'] = craces['P10'].astype('int32')

In [13]:
# Expand each race count into a list holding that race's code once per
# person, e.g. R2 == 3 becomes [2, 2, 2]. One loop replaces seven
# copy-pasted lines.
for race_code in range(1, 8):
    race_col = f'R{race_code}'
    craces[race_col] = craces[race_col].apply(lambda count, code=race_code: [code] * count)

# Blocks with negative net migration also get one placeholder entry (code 0)
# per P10 resident; every other block gets an empty list.
# Built by zipping the two columns rather than a row-wise apply that indexed
# the labelled row by position (row[0], row[1]), which pandas has deprecated.
# The explicit index keeps the lists attached to the right (filtered) rows.
craces['P10_list'] = pd.Series(
    [[0] * p10 if net < 0 else [] for p10, net in zip(craces['P10'], craces['block_net'])],
    index=craces.index,
    dtype=object,
)

In [14]:
# Concatenate the per-race lists (R1..R7) and then the P10 placeholder list
# into a single per-block list.
pop_lists = craces['R1']
for list_col in ('R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'P10_list'):
    pop_lists = pop_lists + craces[list_col]
craces['pop'] = pop_lists

In [15]:
# Keep only the columns needed downstream; the R1..R7 and P10_list helper
# columns are dropped.
keep_cols = ['ID20', 'P20', 'P10', 'block_net', 'pop', 'points']
craces = craces[keep_cols]

In [16]:
# Renumber rows 0..n-1 (the earlier filter left gaps in the index).
craces = craces.reset_index(drop=True)

In [17]:
# Spot-check a single block (row position 185909).
craces.iloc[[185909]]

Unnamed: 0,ID20,P20,P10,block_net,pop,points
185909,40131042032003,84,89,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",173.0


In [18]:
# Persist the per-block race lists. The with-block closes (and flushes) the
# file; the original left the handle open.
with open('craces_list3.pkl', 'wb') as fh:
    pickle.dump(craces, fh)

#### Compute net

In [2]:
# Reload the per-block race lists saved earlier. The with-block closes the
# file handle, which the original left open.
# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook produced.
with open('craces_list3.pkl', 'rb') as fh:
    craces = pickle.load(fh)
craces

Unnamed: 0,ID20,P20,P10,block_net,pop,points
0,10010201001000,21,31,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",52.0
1,10010201001001,34,30,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 6, 7, 7]",34.0
2,10010201001002,29,52,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 6, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",81.0
3,10010201001003,17,13,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7]",17.0
4,10010201001005,0,8,-1,"[0, 0, 0, 0, 0, 0, 0, 0]",8.0
...,...,...,...,...,...,...
6265158,721537506022009,62,24,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]",62.0
6265159,721537506022010,21,23,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",44.0
6265160,721537506022011,27,6,1,"[1, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]",27.0
6265161,721537506022012,43,63,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",106.0


In [3]:
# # craces = races.to_pandas()
# # craces.rename(columns={'U7B001':'P20','U7B003':'R1','U7B004':'R2','U7B005':'R3','U7B006':'R4','U7B007':'R5','U7B008':'R6','U7B009':'R7'},inplace=True)
# craces = craces[['ID20','P20','P10','block_net','points']]
# # del(races)
# craces.head()

In [6]:
# craces = craces[(craces.P20!=0) | (craces.P10!=0) ]

In [9]:
# Sanity check on the reloaded frame: P20 total plus the P10 total of blocks
# with block_net == -1.
shrinking = craces['block_net'] == -1
craces['P20'].sum() + craces.loc[shrinking, 'P10'].sum()

504475979

In [10]:
# craces[['P20','P10','block_net','points']] = craces[['P20','P10','block_net','points']].astype('int32')

In [11]:
# Build one net-migration label per entry in each block's list:
#   block_net >= 0: (P20 - P10) entries of  1  (moved in)
#                   P10         entries of  0  (stationary)
#   block_net <  0: P20         entries of  0  (stationary)
#                   (P10 - P20) entries of -1  (moved out)
#                   P20         entries of -99 (extra filler)
# The negative branch therefore yields P20 + P10 labels, matching the length
# of 'pop' for those blocks.
# The columns are zipped instead of using a row-wise apply that indexed the
# labelled row by position (row[0], row[1], row[2]), which pandas has
# deprecated.
net_labels = []
for p20, p10, block_net in zip(craces['P20'], craces['P10'], craces['block_net']):
    if block_net >= 0:
        labels = [1] * (p20 - p10) + [0] * p10
    else:
        labels = [0] * p20 + [-1] * (p10 - p20) + [-99] * p20
    net_labels.append(labels)
craces['net_list'] = pd.Series(net_labels, index=craces.index, dtype=object)

In [12]:
# Display the frame, now including the net_list column.
craces

Unnamed: 0,ID20,P20,P10,block_net,pop,points,net_list
0,10010201001000,21,31,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",52.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99]"
1,10010201001001,34,30,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 6, 7, 7]",34.0,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,10010201001002,29,52,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 6, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",81.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99]"
3,10010201001003,17,13,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7]",17.0,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,10010201001005,0,8,-1,"[0, 0, 0, 0, 0, 0, 0, 0]",8.0,"[-1, -1, -1, -1, -1, -1, -1, -1]"
...,...,...,...,...,...,...,...
6265158,721537506022009,62,24,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]",62.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6265159,721537506022010,21,23,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",44.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99]"
6265160,721537506022011,27,6,1,"[1, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]",27.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
6265161,721537506022012,43,63,-1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",106.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, ...]"


In [14]:
# pickle.dump(craces,open('race_and_net_list.pkl','wb'))

In [None]:
# Reload the frame holding both the race lists and the net lists. The
# with-block closes the file handle, which the original left open.
# NOTE(review): the matching pickle.dump in the previous cell is commented
# out, so this file must already exist on disk.
with open('race_and_net_list.pkl', 'rb') as fh:
    craces = pickle.load(fh)

In [3]:
# graces = cudf.from_pandas(craces['pop'])
# gnet = cudf.from_pandas(craces['net_list'])

In [5]:
# Empty frame that the exploded (one row per list element) columns are added to.
edf = pd.DataFrame()

In [6]:
# edf['races'] = graces.explode('pop').reset_index(drop=True)['pop']
# Flatten the per-block lists to one row per list element, each with a fresh
# 0..n-1 index so the two columns line up by position.
for target_col, list_col in (('races', 'pop'), ('net_list', 'net_list')):
    edf[target_col] = craces[list_col].explode().reset_index(drop=True)

In [14]:
# The exploded column holds one label per row, so call it 'net'.
edf.rename(columns={'net_list':'net'},inplace=True)

In [15]:
# pickle.dump(edf,open('race_and_net_exploded.pkl','wb'))

In [2]:
# Reload the exploded race/net frame. The with-block closes the file handle,
# which the original left open.
# NOTE(review): the matching pickle.dump in the previous cell is commented
# out, so this file must already exist on disk.
with open('race_and_net_exploded.pkl', 'rb') as fh:
    edf = pickle.load(fh)

In [3]:
# Load the stored county values and convert them to pandas. The with-block
# closes the file handle, which the original left open.
# NOTE(review): the .to_pandas() call implies the pickled object is a cudf
# Series — confirm.
with open('county_list.pkl', 'rb') as fh:
    county_ser = pickle.load(fh)
county_ser = county_ser.to_pandas()

In [6]:
# Attach the county values; the assignment aligns on index.
# NOTE(review): presumes county_ser is in the same row order as edf — confirm.
edf['county'] = county_ser

In [11]:
# Copy edf's columns onto the point table.
# NOTE(review): edf's first column is named 'races', not 'race'; pandas
# appears to pair the listed keys with edf's columns by position here —
# confirm edf's column order is races, net, county.
df[['race','net','county']] = edf

In [13]:
# df.to_parquet('data/total_population_dataset.parquet')

In [3]:
# Reload the saved total population dataset.
df = pd.read_parquet('data/total_population_dataset.parquet')

In [5]:
# Rows labelled -99 are the "extra" filler entries created when the net
# lists were built; remove them.
is_filler = df['net'] == -99
df = df[~is_filler]

In [7]:
# NOTE(review): this writes to the same path the frame was read from, so the
# file on disk no longer contains the -99 rows after this cell runs.
df.to_parquet('data/total_population_dataset.parquet')

In [8]:
# Draw a 1,000,000-row sample for quick experiments. A fixed random_state
# makes the sample reproducible; the original drew a different sample on
# every run.
small = df.sample(n=1_000_000, random_state=0)

In [10]:
# Save the sampled subset next to the full dataset.
small.to_parquet('data/total_population_dataset_sm.parquet')

In [18]:
# NOTE(review): net_list is not defined in any visible cell of this notebook —
# presumably the exploded craces['net_list']; confirm before re-running.
len(net_list)

504475979

In [19]:
# Persist the exploded net labels. The with-block closes (and flushes) the
# file; the original left the handle open.
with open('data/net_exploded3.pkl', 'wb') as fh:
    pickle.dump(net_list, fh)

#### Get stored computed races

In [23]:
# Reload the stored per-block race lists. The with-block closes the file
# handle, which the original left open.
with open('craces_list3.pkl', 'rb') as fh:
    craces = pickle.load(fh)

In [4]:
# Convert the block id and per-block race lists to a cudf frame.
graces = cudf.from_pandas(craces[['ID20','pop']])

In [25]:
# One row per element of each 'pop' list.
temp =graces.explode('pop')

In [27]:
# Keep only the exploded 'pop' values, with a fresh 0..n-1 index.
temp = temp.reset_index(drop=True)['pop']

In [28]:
# Convert the exploded values back to a pandas Series.
ctemp = temp.to_pandas()

In [31]:
# Sanity check: list any entries equal to -1 (the recorded output is empty).
ctemp.loc[ctemp.eq(-1)]

Series([], Name: pop, dtype: int64)

In [30]:
# ctemp.reset_index(drop=True).to_parquet('data/total_races.parquet')

In [33]:
# Persist the exploded race codes. The with-block closes (and flushes) the
# file; the original left the handle open.
with open('data/races_exploded3.pkl', 'wb') as fh:
    pickle.dump(ctemp, fh)

#### Concat races and stored counties

In [5]:
# Reload the exploded race codes. The with-block closes the file handle,
# which the original left open.
with open('data/races_exploded3.pkl', 'rb') as fh:
    races = pickle.load(fh)

In [6]:
# Attach one race code per point; the assignment aligns on index.
# NOTE(review): presumes races is in the same row order as df — confirm.
df['races'] = races

In [7]:
# The block id is not needed in the final dataset.
df = df.drop(columns='ID20')  # change if you need id

In [8]:
# Load the stored county values. The with-block closes the file handle,
# which the original left open.
with open('county_list.pkl', 'rb') as fh:
    county_ser = pickle.load(fh)

In [9]:
# Renumber rows 0..n-1 so the index-aligned county assignment below lines up.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,x,y,races
0,-86.478941,32.47019,1
1,-86.4793,32.469311,1
2,-86.481722,32.469914,1
3,-86.482149,32.471196,1
4,-86.482242,32.471287,1


In [10]:
# Attach the county per point; converted to pandas first, aligned on index.
# NOTE(review): presumes county_ser is in the same row order as df — confirm.
df['county'] = county_ser.to_pandas()

In [11]:
# Save the intermediate dataset, which now includes the races and county columns.
df.to_parquet('data/population_county_races3.parquet')

#### Concat net

In [47]:
# Load the intermediate dataset and the stored net labels. The with-block
# closes the pickle file handle, which the original left open.
attr = pd.read_parquet('data/population_county_races3.parquet')
with open('data/net_exploded3.pkl', 'rb') as fh:
    net = pickle.load(fh)

In [50]:
# Give both objects a fresh 0..n-1 index so they line up row by row.
attr = attr.reset_index(drop=True)
net = net.reset_index(drop=True)

In [52]:
# Attach the net label per point.
# NOTE(review): the .to_pandas() call implies net is a cudf Series — confirm.
attr['net'] = net.to_pandas()

In [2]:
# attr = pd.read_parquet('data/total_population_dataset3.parquet')

In [12]:
# attr.to_parquet('data/total_population_dataset3.parquet')