# Combining age with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Expose only CUDA device 0 to this process.
# NOTE(review): this runs after cudf/cupy were imported in the previous cell; if those
# imports already initialise CUDA, the setting may come too late to have any effect.
# Consider moving it above the GPU-library imports -- TODO confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [3]:
# Load all files matching the glob into a single cuDF DataFrame
# (columns shown below: GISJOIN, x, y, sex, p_id).
df = cudf.read_parquet('../data/all_states_us_lat_long_sex.parquet/*')

In [4]:
# Order the rows by GISJOIN, then display the sorted frame.
df = df.sort_values(by='GISJOIN')
df

Unnamed: 0,GISJOIN,x,y,sex,p_id
0,1000100201001000,-9.627034e+06,3.825003e+06,0,0
1,1000100201001000,-9.626834e+06,3.825034e+06,0,1
2,1000100201001000,-9.626683e+06,3.825079e+06,0,2
3,1000100201001000,-9.627138e+06,3.825075e+06,0,3
4,1000100201001000,-9.627112e+06,3.824731e+06,0,4
...,...,...,...,...,...
308745304,56004509513003128,-1.159921e+07,5.441573e+06,1,308745304
308745305,56004509513003128,-1.159932e+07,5.441618e+06,1,308745305
308745306,56004509513003128,-1.159941e+07,5.441687e+06,1,308745306
308745307,56004509513003128,-1.159941e+07,5.441688e+06,1,308745307


In [5]:
import cudf

In [6]:
# Load the per-person age table (columns shown below: GISJOIN, age).
df_age = cudf.read_parquet('./age_step_2.parquet/*')

In [7]:
# Sort by GISJOIN and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly, replacing the
# reset_index() + drop_column('index') pair: drop_column is a legacy cuDF method
# that was deprecated and later removed, so the old form fails on current releases.
df_age = df_age.sort_values('GISJOIN').reset_index(drop=True)

In [8]:
# Display the sorted age table.
df_age

Unnamed: 0,GISJOIN,age
0,1000100201001,3
1,1000100201001,1
2,1000100201001,3
3,1000100201001,3
4,1000100201001,3
...,...,...
308745304,56004509513003,99
308745305,56004509513003,91
308745306,56004509513003,91
308745307,56004509513003,85


In [9]:
# Load the p_id -> GISJOIN mapping table.
df_age_with_gis_mappings = cudf.read_parquet('./GISJOIN_to_GISJOIN_og_mapping.parquet/*')
# 'sex' is not needed from this table. drop(columns=...) replaces the legacy
# drop_column method (deprecated and later removed from cuDF); inplace=True keeps
# the original in-place behaviour and avoids copying a very large frame on the GPU.
df_age_with_gis_mappings.drop(columns=['sex'], inplace=True)

In [10]:
# Sort by GISJOIN and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly, replacing the
# reset_index() + drop_column('index') pair: drop_column is a legacy cuDF method
# that was deprecated and later removed, so the old form fails on current releases.
df_age_with_gis_mappings = df_age_with_gis_mappings.sort_values('GISJOIN').reset_index(drop=True)

In [11]:
# Remove the GISJOIN_og column, which is not used below.
# drop(columns=...) replaces the legacy drop_column method (deprecated and later
# removed from cuDF); inplace=True keeps the original in-place behaviour and avoids
# copying a very large frame on the GPU.
df_age_with_gis_mappings.drop(columns=['GISJOIN_og'], inplace=True)

In [12]:
# Display the mapping table (remaining columns: p_id, GISJOIN).
df_age_with_gis_mappings

Unnamed: 0,p_id,GISJOIN
0,576,1000100201001
1,577,1000100201001
2,578,1000100201001
3,579,1000100201001
4,580,1000100201001
...,...,...
308745304,308744475,56004509513003
308745305,308744476,56004509513003
308745306,308744477,56004509513003
308745307,308744478,56004509513003


In [13]:
# Copy df_age's GISJOIN column in by position (.values bypasses index alignment).
# This relies on both frames having been sorted by GISJOIN and re-indexed in the
# cells above, so that row i carries the same GISJOIN in each; the next cell checks it.
# NOTE(review): which row within a GISJOIN group lines up with which p_id depends on
# the sort's tie order -- presumably any pairing within a group is acceptable; confirm.
df_age_with_gis_mappings['GISJOIN_age'] = df_age.GISJOIN.values

In [14]:
# Sanity check: after the positional copy, every row's GISJOIN must equal the
# GISJOIN that came from df_age. The assertion makes a misalignment stop a
# Run All instead of relying on someone noticing a non-empty table.
gisjoin_mismatches = df_age_with_gis_mappings.query('GISJOIN != GISJOIN_age')
assert len(gisjoin_mismatches) == 0, 'GISJOIN order differs between the mapping and age tables'
# Still display the (expected-empty) frame, as the original cell did.
gisjoin_mismatches

Unnamed: 0,p_id,GISJOIN,GISJOIN_age


In [15]:
# Attach ages by row position; the two frames share the same GISJOIN ordering,
# which the preceding mismatch query verifies.
df_age_with_gis_mappings['age'] = df_age['age'].values

In [16]:
# Only p_id and age are needed for the merge below, so remove both GISJOIN columns.
# A single drop(columns=[...]) call replaces the two legacy drop_column calls
# (drop_column was deprecated and later removed from cuDF); inplace=True keeps the
# original in-place behaviour and avoids copying a very large frame on the GPU.
df_age_with_gis_mappings.drop(columns=['GISJOIN_age', 'GISJOIN'], inplace=True)

In [17]:
# Drop the notebook's reference to the age table now that its values have been copied.
del df_age

In [18]:
# Display the p_id -> age table that will be merged into df.
df_age_with_gis_mappings

Unnamed: 0,p_id,age
0,576,3
1,577,1
2,578,3
3,579,3
4,580,3
...,...,...
308745304,308744475,99
308745305,308744476,91
308745306,308744477,91
308745307,308744478,85


### Merge age into the lat-long and sex table

In [19]:
# Preview the lat-long/sex table before the merge.
df.head()

Unnamed: 0,GISJOIN,x,y,sex,p_id
0,1000100201001000,-9627034.0,3825003.0,0,0
1,1000100201001000,-9626834.0,3825034.0,0,1
2,1000100201001000,-9626683.0,3825079.0,0,2
3,1000100201001000,-9627138.0,3825075.0,0,3
4,1000100201001000,-9627112.0,3824731.0,0,4


In [20]:
# Attach age to each person row by matching on p_id. An inner join keeps only
# p_ids present in both frames.
# NOTE(review): the result displayed below is no longer in GISJOIN order, so later
# code should not assume the earlier sort survives the merge.
# NOTE(review): this cell overwrites df, so re-running it would merge 'age' a second
# time -- restart and run all rather than re-executing this cell.
df = df.merge(df_age_with_gis_mappings, on='p_id', how='inner')

In [21]:
# Display the merged table (now including the age column).
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,age
0,1000100208022016,-9.624799e+06,3.830467e+06,1,35808,43
1,1000100208022016,-9.625003e+06,3.831105e+06,1,35809,42
2,1000100208022016,-9.624606e+06,3.831383e+06,1,35810,41
3,1000100208022016,-9.625018e+06,3.830996e+06,1,35811,44
4,1000100208022016,-9.625278e+06,3.829954e+06,1,35812,44
...,...,...,...,...,...,...
308745304,56004509511002128,-1.160984e+07,5.468301e+06,1,308739899,33
308745305,56004509511002128,-1.159039e+07,5.458426e+06,1,308739900,33
308745306,56004509511002128,-1.159039e+07,5.458569e+06,1,308739901,34
308745307,56004509511002136,-1.159880e+07,5.457649e+06,0,308739902,33


In [24]:
# Inspect the column dtypes of the merged table.
df.dtypes

GISJOIN      int64
x          float64
y          float64
sex           int8
p_id         int32
age          int64
dtype: object

In [25]:
# Shrink age from int64 to int8 to reduce the size of the column.
# NOTE(review): int8 tops out at 127; the ages displayed above go up to 99 --
# confirm no value in the full column exceeds that range.
df['age'] = df['age'].astype('int8')

In [26]:
# Write the combined lat-long / sex / age table to parquet.
df.to_parquet('../data/all_states_lat_long_age_sex.parquet')