# Combining age with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Expose only GPU 0 to this process.
# NOTE(review): this runs after cudf/cupy were imported in the previous cell;
# if those imports already initialised CUDA, the setting may have no effect.
# Consider setting it before the imports -- TODO confirm.
os.environ.update({'CUDA_VISIBLE_DEVICES': "0"})

In [4]:
# Load the per-person table (the preview below shows GISJOIN, x, y, sex, p_id).
lat_long_sex_path = '../../data/all_states_us_lat_long_sex.parquet'
df = cudf.read_parquet(lat_long_sex_path)

In [5]:
# Order the rows by GISJOIN, then display the sorted frame.
df = df.sort_values(by='GISJOIN')
df

Unnamed: 0,GISJOIN,x,y,sex,p_id
0,1000100201001000,-86.480965,32.468025,0,0
1,1000100201001000,-86.484355,32.469958,0,1
2,1000100201001000,-86.476974,32.467935,0,2
3,1000100201001000,-86.483532,32.468555,0,3
4,1000100201001000,-86.479665,32.471680,0,4
...,...,...,...,...,...
308745304,56004509513003128,-104.197455,43.845559,1,308745304
308745305,56004509513003128,-104.198425,43.845850,1,308745305
308745306,56004509513003128,-104.199305,43.846296,1,308745306
308745307,56004509513003128,-104.199264,43.846305,1,308745307


In [7]:
# Load the age table produced by an earlier step (GISJOIN, age per row).
age_path = './age_step_2.parquet'
df_age = cudf.read_parquet(age_path)

In [8]:
# Sort by GISJOIN and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly, so no temporary
# 'index' column has to be removed afterwards (and the removed
# DataFrame.drop_column API is no longer needed).
df_age = df_age.sort_values('GISJOIN').reset_index(drop=True)

In [9]:
# Preview the GISJOIN-sorted age table.
df_age

Unnamed: 0,GISJOIN,age
0,1000100201001,1
1,1000100201001,3
2,1000100201001,4
3,1000100201001,3
4,1000100201001,4
...,...,...
308745304,56004509513003,85
308745305,56004509513003,85
308745306,56004509513003,85
308745307,56004509513003,85


In [10]:
# Load the p_id <-> GISJOIN mapping (all part files under the directory).
df_age_with_gis_mappings = cudf.read_parquet('./GISJOIN_to_GISJOIN_og_mapping.parquet/*')
# 'sex' is not needed from this table. Drop it in place, matching the original
# in-place semantics, using DataFrame.drop instead of the removed drop_column.
df_age_with_gis_mappings.drop(columns=['sex'], inplace=True)

In [11]:
# Sort the mapping by GISJOIN and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of
# materialising it as an 'index' column that then has to be dropped.
df_age_with_gis_mappings = (
    df_age_with_gis_mappings
    .sort_values('GISJOIN')
    .reset_index(drop=True)
)

In [12]:
# Remove the GISJOIN_og column in place; DataFrame.drop replaces the removed
# drop_column API.
df_age_with_gis_mappings.drop(columns=['GISJOIN_og'], inplace=True)

In [13]:
# Preview the sorted mapping; the output below shows the p_id and GISJOIN columns.
df_age_with_gis_mappings

Unnamed: 0,p_id,GISJOIN
0,672,1000100201001
1,673,1000100201001
2,674,1000100201001
3,675,1000100201001
4,676,1000100201001
...,...,...
308745304,308743899,56004509513003
308745305,308743900,56004509513003
308745306,308743901,56004509513003
308745307,308743902,56004509513003


In [14]:
# Copy df_age's GISJOIN over by position (via .values, so no index alignment).
# This is only meaningful because both frames were sorted by GISJOIN and
# re-indexed beforehand.
df_age_with_gis_mappings['GISJOIN_age'] = df_age['GISJOIN'].values

In [15]:
# Sanity check: every row's GISJOIN must equal the GISJOIN copied over from
# df_age. Fail loudly on a mismatch instead of relying on someone noticing a
# non-empty frame, since ages are attached by position in the following step.
gisjoin_mismatches = df_age_with_gis_mappings.query('GISJOIN != GISJOIN_age')
assert len(gisjoin_mismatches) == 0, (
    f"{len(gisjoin_mismatches)} rows have GISJOIN != GISJOIN_age; "
    "df_age and df_age_with_gis_mappings are not row-aligned"
)
gisjoin_mismatches

Unnamed: 0,p_id,GISJOIN,GISJOIN_age


In [16]:
# Attach ages by position (via .values, so no index alignment); relies on the
# two frames being row-aligned on GISJOIN.
df_age_with_gis_mappings['age'] = df_age['age'].values

In [17]:
# Only p_id and age are needed from here on. Drop both GISJOIN helper columns
# in a single in-place call; DataFrame.drop replaces the removed drop_column API.
df_age_with_gis_mappings.drop(columns=['GISJOIN_age', 'GISJOIN'], inplace=True)

In [18]:
# df_age is no longer needed now that its values have been copied over;
# remove the name so the frame can be released.
del df_age

In [19]:
# Preview the resulting p_id -> age lookup table.
df_age_with_gis_mappings

Unnamed: 0,p_id,age
0,672,1
1,673,3
2,674,4
3,675,3
4,676,4
...,...,...
308745304,308743899,85
308745305,308743900,85
308745306,308743901,85
308745307,308743902,85


### Merge age into the lat-long and sex table

In [20]:
# Peek at the first rows of the lat-long/sex table before merging.
df.head()

Unnamed: 0,GISJOIN,x,y,sex,p_id
0,1000100201001000,-86.480965,32.468025,0,0
1,1000100201001000,-86.484355,32.469958,0,1
2,1000100201001000,-86.476974,32.467935,0,2
3,1000100201001000,-86.483532,32.468555,0,3
4,1000100201001000,-86.479665,32.47168,0,4


In [21]:
# Attach each person's age via p_id.
# An inner join silently drops people without an age and duplicates people
# whose p_id appears more than once on the right, so verify that the row
# count is unchanged.
# Note: the merged rows are not in the input order (see the output below).
rows_before_merge = len(df)
df = df.merge(df_age_with_gis_mappings, on='p_id', how='inner')
assert len(df) == rows_before_merge, (
    f"merge on p_id changed the row count: {rows_before_merge} -> {len(df)}"
)

In [22]:
# Inspect the merged table; it now carries an age column.
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,age
0,1000100206002022,-86.476936,32.434510,0,25824,41
1,1000100206002022,-86.475503,32.434379,0,25825,44
2,1000100206002022,-86.470999,32.436681,0,25826,43
3,1000100206002022,-86.476399,32.434521,0,25827,40
4,1000100206002022,-86.476031,32.435329,0,25828,41
...,...,...,...,...,...,...
308745304,56004300002001560,-107.955505,44.085190,0,308730331,11
308745305,56004300002001560,-107.952841,44.086804,0,308730332,14
308745306,56004300002001560,-107.956119,44.083611,0,308730333,13
308745307,56004300002001560,-107.954096,44.087581,0,308730334,14


In [23]:
# Check the column dtypes before writing the result.
df.dtypes

GISJOIN      int64
x          float64
y          float64
sex           int8
p_id         int32
age          int64
dtype: object

In [24]:
# Downcast age from int64 to int8 to shrink the column.
# NOTE(review): assumes every age fits in int8 (max 127); the previews above
# show values up to 85 -- confirm for the full data.
df['age'] = df['age'].astype('int8')

In [26]:
# Persist the combined location / sex / age table.
output_path = '../../data/census_loc_age_sex.parquet'
df.to_parquet(output_path)