# Combining age with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Restrict this process to GPU 0.
# NOTE(review): this assignment runs after `import cudf` in the cell above;
# CUDA normally reads CUDA_VISIBLE_DEVICES once, when it is initialised, so
# confirm the setting actually takes effect at this point.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [3]:
# Load the lat/long + sex table onto the GPU; the trailing `/*` is meant to
# match every part file inside the parquet directory.
df = cudf.read_parquet(
    '../data/all_states_us_lat_long_sex.parquet/*'
)

In [4]:
# Order the rows by GISJOIN and renumber them 0..n-1.
# `drop=True` discards the old index directly instead of first materialising
# it as an 'index' column and then deleting that column again (the previous
# code relied on the legacy `drop_column` method for that second step).
df = df.sort_values('GISJOIN').reset_index(drop=True)
df

Unnamed: 0,GISJOIN,x,y,sex
0,1000100201001000,-9.627034e+06,3.825003e+06,0
1,1000100201001000,-9.626834e+06,3.825034e+06,0
2,1000100201001000,-9.626683e+06,3.825079e+06,0
3,1000100201001000,-9.627138e+06,3.825075e+06,0
4,1000100201001000,-9.627112e+06,3.824731e+06,0
...,...,...,...,...
308745304,56004509513003128,-1.159921e+07,5.441573e+06,1
308745305,56004509513003128,-1.159932e+07,5.441618e+06,1
308745306,56004509513003128,-1.159941e+07,5.441687e+06,1
308745307,56004509513003128,-1.159941e+07,5.441688e+06,1


In [5]:
# Point CUDA_VISIBLE_DEVICES at GPU 1 for the cells that follow.
# NOTE(review): `df` has already been loaded through cudf in this kernel, so
# changing the variable here probably does not move later allocations to a
# different GPU — confirm whether this cell has any effect.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [6]:
import cudf

In [7]:
# Load the age table (GISJOIN, age); the trailing `/*` is meant to match
# every part file inside the parquet directory.
df_age = cudf.read_parquet(
    './age_step_2.parquet/*'
)

In [8]:
# Order the age rows by GISJOIN and renumber them 0..n-1.
# `drop=True` discards the old index instead of storing it in a temporary
# 'index' column that would have to be removed afterwards.
df_age = df_age.sort_values('GISJOIN').reset_index(drop=True)

In [9]:
# Inspect the age table after sorting by GISJOIN.
df_age

Unnamed: 0,GISJOIN,age
0,1000100201001,1
1,1000100201001,1
2,1000100201001,4
3,1000100201001,2
4,1000100201001,2
...,...,...
308745304,56004509513003,87
308745305,56004509513003,97
308745306,56004509513003,99
308745307,56004509513003,98


In [10]:
# Load the GISJOIN -> GISJOIN_og mapping table.
df_age_with_gis_mappings = cudf.read_parquet('./GISJOIN_to_GISJOIN_og_mapping.parquet/*')
# The 'sex' column is not needed here. `del frame[col]` removes it in place,
# exactly like the legacy `drop_column`, without depending on that method.
del df_age_with_gis_mappings['sex']

In [11]:
# Order the mapping rows by GISJOIN and renumber them 0..n-1 so they can be
# lined up row-for-row with the sorted age table. `drop=True` discards the
# old index instead of storing it in a temporary 'index' column.
df_age_with_gis_mappings = (
    df_age_with_gis_mappings
    .sort_values('GISJOIN')
    .reset_index(drop=True)
)

In [12]:
# Inspect the mapping table after sorting by GISJOIN.
df_age_with_gis_mappings

Unnamed: 0,GISJOIN,GISJOIN_og
0,1000100201001,1000100201001025
1,1000100201001,1000100201001025
2,1000100201001,1000100201001025
3,1000100201001,1000100201001025
4,1000100201001,1000100201001025
...,...,...
308745304,56004509513003,56004509513003000
308745305,56004509513003,56004509513003000
308745306,56004509513003,56004509513003000
308745307,56004509513003,56004509513003000


In [13]:
# Copy the age table's GISJOIN next to the mapping's GISJOIN, matched purely
# by row position, so the two sort orders can be compared in the next cell.
df_age_with_gis_mappings['GISJOIN_age'] = df_age['GISJOIN'].values

In [14]:
# List rows where the mapping's GISJOIN differs from the age table's GISJOIN
# at the same position; an empty result means the two sorted tables line up.
df_age_with_gis_mappings.query('GISJOIN != GISJOIN_age')

Unnamed: 0,GISJOIN,GISJOIN_og,GISJOIN_age


In [15]:
# Bring the ages across by row position (both tables are sorted by GISJOIN).
df_age_with_gis_mappings['age'] = df_age['age'].values

In [16]:
# Remove the comparison helper and the GISJOIN key, leaving GISJOIN_og and
# age. `del frame[col]` deletes in place, like the legacy `drop_column`,
# without depending on that method.
del df_age_with_gis_mappings['GISJOIN_age']
del df_age_with_gis_mappings['GISJOIN']

In [17]:
# Rename GISJOIN_og to GISJOIN so the key column has the same name as in `df`.
# Pass the mapping through `columns=` explicitly: a bare positional mapper is
# interpreted as an index-label mapper by pandas-style `rename`, which would
# leave the column names untouched.
df_age_with_gis_mappings = df_age_with_gis_mappings.rename(
    columns={'GISJOIN_og': 'GISJOIN'}
)

In [18]:
# Sort by the GISJOIN column as it stands now (the same column name `df` is
# sorted by) and renumber the rows 0..n-1. `drop=True` discards the old index
# instead of storing it in a temporary 'index' column that would have to be
# removed.
df_age_with_gis_mappings = (
    df_age_with_gis_mappings
    .sort_values('GISJOIN')
    .reset_index(drop=True)
)

In [19]:
# Inspect the table after the latest sort by GISJOIN.
df_age_with_gis_mappings

Unnamed: 0,GISJOIN,age
0,1000100201001000,67
1,1000100201001000,69
2,1000100201001000,67
3,1000100201001000,67
4,1000100201001000,74
...,...,...
308745304,56004509513003128,52
308745305,56004509513003128,50
308745306,56004509513003128,50
308745307,56004509513003128,51


In [20]:
# Drop the reference to the age table; its GISJOIN and age values were
# already copied into df_age_with_gis_mappings.
del df_age

### Merge sex and age with the lat-long table

In [21]:
# Peek at the first rows of the lat/long + sex table.
df.head()

Unnamed: 0,GISJOIN,x,y,sex
0,1000100201001000,-9627034.0,3825003.0,0
1,1000100201001000,-9626834.0,3825034.0,0
2,1000100201001000,-9626683.0,3825079.0,0
3,1000100201001000,-9627138.0,3825075.0,0
4,1000100201001000,-9627112.0,3824731.0,0


In [22]:
# Attach the age table's key and age to `df` by row position; both frames
# were sorted by GISJOIN beforehand. GISJOIN_age is kept only so the
# alignment can be checked before it is dropped.
df['GISJOIN_age'] = df_age_with_gis_mappings['GISJOIN'].values
df['age'] = df_age_with_gis_mappings['age'].values

In [23]:
# Drop the reference to the mapping table; its columns now live on `df`.
del df_age_with_gis_mappings

In [24]:
# Inspect the combined table (GISJOIN, x, y, sex, GISJOIN_age, age).
df

Unnamed: 0,GISJOIN,x,y,sex,GISJOIN_age,age
0,1000100201001000,-9.627034e+06,3.825003e+06,0,1000100201001000,67
1,1000100201001000,-9.626834e+06,3.825034e+06,0,1000100201001000,69
2,1000100201001000,-9.626683e+06,3.825079e+06,0,1000100201001000,67
3,1000100201001000,-9.627138e+06,3.825075e+06,0,1000100201001000,67
4,1000100201001000,-9.627112e+06,3.824731e+06,0,1000100201001000,74
...,...,...,...,...,...,...
308745304,56004509513003128,-1.159921e+07,5.441573e+06,1,56004509513003128,52
308745305,56004509513003128,-1.159932e+07,5.441618e+06,1,56004509513003128,50
308745306,56004509513003128,-1.159941e+07,5.441687e+06,1,56004509513003128,50
308745307,56004509513003128,-1.159941e+07,5.441688e+06,1,56004509513003128,51


In [37]:
# Verify the positional attach: every row's GISJOIN must equal the GISJOIN
# that travelled with the age value. The only value reported should be True.
# (Comparing df.GISJOIN with itself, as before, is always True and checks
# nothing.)
(df.GISJOIN == df.GISJOIN_age).unique()

0    True
Name: GISJOIN, dtype: bool

In [38]:
# The helper key has served its purpose; remove it in place before saving.
# `del frame[col]` behaves like the legacy `drop_column` without depending
# on that method.
del df['GISJOIN_age']

In [39]:
# Persist the combined lat/long + sex + age table.
df.to_parquet(
    '../data/all_states_lat_long_age_sex.parquet'
)

  "Using CPU via PyArrow to write Parquet dataset, this will "
