# Combining age with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Restrict this process to GPU 0.
# NOTE(review): this runs after cudf/cupy were imported in the previous cell;
# CUDA_VISIBLE_DEVICES is normally only honoured when set before the CUDA
# runtime initialises — confirm it takes effect here, or set it before the imports.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [2]:
# Load the person-level table from every file matched by the glob.
# NOTE(review): the final cell of this notebook writes its result back to this
# same dataset path, so the input read here is overwritten by a full run.
df = cudf.read_parquet('../data/all_states_lat_long_cow_income_education_age_sex.parquet/*')

In [3]:
# Remove the existing 'age' column; a fresh one is merged in further down.
# `del df[...]` deletes the column in place (same effect as the legacy
# `drop_column`, which is not available in later cudf releases) and avoids
# building a second copy of this very large frame.
del df['age']

In [4]:
# Store both coordinate columns as float32.
# NOTE(review): the displayed x values are whole numbers after this cast, so
# float32 resolution at this magnitude looks to be about one unit — confirm
# that precision is acceptable.
for coord in ('x', 'y'):
    df[coord] = df[coord].astype('float32')

In [5]:
# Order the rows by GISJOIN, then display the sorted frame.
df = df.sort_values(by='GISJOIN')
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,education,income,cow
49440,1000100201001000,-9626773.0,3824702.25,1,35,0,19,0
49441,1000100201001000,-9627132.0,3824709.25,1,36,0,19,0
49442,1000100201001000,-9626740.0,3825409.00,1,37,0,19,0
49443,1000100201001000,-9627007.0,3825576.00,1,38,0,19,0
49444,1000100201001000,-9626684.0,3825259.75,1,39,0,19,0
...,...,...,...,...,...,...,...,...
308742648,56004509513003128,-11599208.0,5441573.00,1,308745304,12,4,0
308742649,56004509513003128,-11599316.0,5441618.00,1,308745305,12,4,0
308742650,56004509513003128,-11599414.0,5441687.00,1,308745306,12,4,0
308742651,56004509513003128,-11599409.0,5441688.50,1,308745307,12,4,0


In [6]:
# Load the age table from every file matched by the glob.
df_age = cudf.read_parquet('./age_step_2.parquet/*')

In [7]:
# Sort by GISJOIN and renumber rows 0..n-1. `drop=True` discards the old index
# directly instead of materialising it as an 'index' column and deleting it
# afterwards (the legacy `drop_column` call is not available in later cudf).
df_age = df_age.sort_values('GISJOIN').reset_index(drop=True)

In [8]:
# Inspect the sorted age table (columns: GISJOIN, age).
df_age

Unnamed: 0,GISJOIN,age
0,1000100201001,1
1,1000100201001,1
2,1000100201001,4
3,1000100201001,2
4,1000100201001,1
...,...,...
308745304,56004509513003,85
308745305,56004509513003,85
308745306,56004509513003,85
308745307,56004509513003,85


In [9]:
# Load the p_id -> GISJOIN mapping table and discard its 'sex' column.
# `del frame[col]` removes the column in place; the legacy `drop_column`
# method is not available in later cudf releases.
df_age_with_gis_mappings = cudf.read_parquet('./GISJOIN_to_GISJOIN_og_mapping.parquet/*')
del df_age_with_gis_mappings['sex']

In [10]:
# Sort by GISJOIN and renumber rows 0..n-1. `drop=True` discards the old index
# directly instead of materialising it as an 'index' column and deleting it
# afterwards (the legacy `drop_column` call is not available in later cudf).
df_age_with_gis_mappings = df_age_with_gis_mappings.sort_values('GISJOIN').reset_index(drop=True)

In [11]:
# 'GISJOIN_og' is not used below; remove it in place.
# (`del frame[col]` replaces the legacy `drop_column`, which is not available
# in later cudf releases.)
del df_age_with_gis_mappings['GISJOIN_og']

In [12]:
# Inspect the sorted mapping table (columns: p_id, GISJOIN).
df_age_with_gis_mappings

Unnamed: 0,p_id,GISJOIN
0,576,1000100201001
1,577,1000100201001
2,578,1000100201001
3,579,1000100201001
4,580,1000100201001
...,...,...
308745304,308744475,56004509513003
308745305,308744476,56004509513003
308745306,308744477,56004509513003
308745307,308744478,56004509513003


In [13]:
# Copy df_age's GISJOIN values across by row position (`.values` passes the raw
# array, so no index alignment is involved) so the two GISJOIN columns can be
# compared row by row.
# NOTE(review): this relies on both frames having the same number of rows and on
# the two independent sorts by GISJOIN lining the rows up — confirm.
df_age_with_gis_mappings['GISJOIN_age'] = df_age.GISJOIN.values

In [14]:
# Alignment check: every row must carry the same GISJOIN in both columns.
# Fail loudly instead of relying on someone noticing a non-empty output,
# because the positional age assignment below is only valid if this holds.
gisjoin_mismatches = df_age_with_gis_mappings.query('GISJOIN != GISJOIN_age')
assert len(gisjoin_mismatches) == 0, (
    f"{len(gisjoin_mismatches)} rows have GISJOIN != GISJOIN_age; "
    "the age and mapping tables are not row-aligned"
)
gisjoin_mismatches

Unnamed: 0,p_id,GISJOIN,GISJOIN_age


In [15]:
# Attach ages by row position (`.values` passes the raw array, so no index
# alignment is involved).
# NOTE(review): rows are only known to agree on GISJOIN; which age lands on
# which p_id within one GISJOIN depends on the row order produced by the two
# sorts — confirm that an arbitrary pairing within a GISJOIN is intended.
df_age_with_gis_mappings['age'] = df_age.age.values

In [16]:
# Only p_id and age are needed for the merge; remove both GISJOIN columns in
# place. (`del frame[col]` replaces the legacy `drop_column`, which is not
# available in later cudf releases.)
for helper_col in ('GISJOIN_age', 'GISJOIN'):
    del df_age_with_gis_mappings[helper_col]

In [17]:
# The age values now live in df_age_with_gis_mappings; release this name.
del df_age

In [18]:
# Inspect the p_id -> age lookup table that will be merged into df.
df_age_with_gis_mappings

Unnamed: 0,p_id,age
0,576,1
1,577,1
2,578,4
3,579,2
4,580,1
...,...,...
308745304,308744475,85
308745305,308744476,85
308745306,308744477,85
308745307,308744478,85


### Merge age into the lat-long table (`sex` is already a column of `df`)

In [19]:
# Preview the lat-long table before the merge.
df.head()

Unnamed: 0,GISJOIN,x,y,sex,p_id,education,income,cow
49440,1000100201001000,-9626773.0,3824702.25,1,35,0,19,0
49441,1000100201001000,-9627132.0,3824709.25,1,36,0,19,0
49442,1000100201001000,-9626740.0,3825409.0,1,37,0,19,0
49443,1000100201001000,-9627007.0,3825576.0,1,38,0,19,0
49444,1000100201001000,-9626684.0,3825259.75,1,39,0,19,0


In [20]:
# Join the age lookup onto the person-level table by person id.
# An inner join keeps only p_id values present in both frames.
df = df.merge(
    df_age_with_gis_mappings,
    how='inner',
    on='p_id',
)

In [21]:
# Inspect the merged frame; it now carries an 'age' column.
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,education,income,cow,age
0,1000100207002017,-9623904.0,3823389.75,0,29042,10,12,0,45
1,1000100207002017,-9623795.0,3823349.75,0,29043,10,12,0,49
2,1000100207002017,-9623942.0,3823424.00,0,29044,10,12,0,45
3,1000100207002017,-9623815.0,3823353.50,0,29045,10,12,0,48
4,1000100207002017,-9623835.0,3823438.75,1,29046,10,0,0,48
...,...,...,...,...,...,...,...,...,...
308745304,56004509511002136,-11593976.0,5456722.50,1,308739960,5,5,0,45
308745305,56004509511002136,-11598623.0,5457926.50,1,308739961,5,5,2,46
308745306,56004509511002136,-11598757.0,5458250.50,1,308739962,5,5,2,46
308745307,56004509511002136,-11598786.0,5456930.50,1,308739963,5,5,2,46


In [22]:
# Check column dtypes; 'age' comes out of the merge as int64.
df.dtypes

GISJOIN        int64
x            float32
y            float32
sex             int8
p_id           int32
education       int8
income          int8
cow             int8
age            int64
dtype: object

In [23]:
# Narrow 'age' from int64 to int8 to match the other small categorical columns.
# Verify the value range first: an out-of-range value would otherwise wrap
# silently during the cast.
int8_bounds = np.iinfo(np.int8)
assert df['age'].min() >= int8_bounds.min and df['age'].max() <= int8_bounds.max, (
    "age values do not fit in int8"
)
df['age'] = df['age'].astype('int8')

In [25]:
# Inspect the final frame before it is written out.
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,education,income,cow,age
0,1000100207002017,-9623904.0,3823389.75,0,29042,10,12,0,45
1,1000100207002017,-9623795.0,3823349.75,0,29043,10,12,0,49
2,1000100207002017,-9623942.0,3823424.00,0,29044,10,12,0,45
3,1000100207002017,-9623815.0,3823353.50,0,29045,10,12,0,48
4,1000100207002017,-9623835.0,3823438.75,1,29046,10,0,0,48
...,...,...,...,...,...,...,...,...,...
308745304,56004509511002136,-11593976.0,5456722.50,1,308739960,5,5,0,45
308745305,56004509511002136,-11598623.0,5457926.50,1,308739961,5,5,2,46
308745306,56004509511002136,-11598757.0,5458250.50,1,308739962,5,5,2,46
308745307,56004509511002136,-11598786.0,5456930.50,1,308739963,5,5,2,46


In [24]:
# Persist the combined table.
# NOTE(review): this is the same dataset path the notebook read `df` from in its
# first load cell, so the original input is replaced — confirm that is intended.
df.to_parquet('../data/all_states_lat_long_cow_income_education_age_sex.parquet')

  "Using CPU via PyArrow to write Parquet dataset, this will "
