# Combining education with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Expose only GPU 0 to this process.
# NOTE(review): cudf and cupy are imported in the cell above; CUDA_VISIBLE_DEVICES is
# normally only honoured if it is set before the CUDA runtime initialises -- confirm
# that this assignment actually takes effect here.
os.environ.update({'CUDA_VISIBLE_DEVICES': "0"})

In [3]:
# Load every file under the lat-long/age/sex parquet directory into a single cuDF DataFrame.
df = cudf.read_parquet('../data/all_states_lat_long_age_sex.parquet/*')

In [4]:
# Sort by GISJOIN so rows can later be paired by position with the education table.
# reset_index(drop=True) discards the old index directly instead of materialising it as
# an 'index' column and deleting it afterwards with the no-longer-available drop_column().
df = df.sort_values('GISJOIN').reset_index(drop=True)
df

Unnamed: 0,GISJOIN,x,y,sex,age
0,1000100201001000,-9.627034e+06,3.825003e+06,0,67
1,1000100201001000,-9.626834e+06,3.825034e+06,0,69
2,1000100201001000,-9.626683e+06,3.825079e+06,0,67
3,1000100201001000,-9.627138e+06,3.825075e+06,0,67
4,1000100201001000,-9.627112e+06,3.824731e+06,0,74
...,...,...,...,...,...
308745304,56004509513003128,-1.159921e+07,5.441573e+06,1,52
308745305,56004509513003128,-1.159932e+07,5.441618e+06,1,50
308745306,56004509513003128,-1.159941e+07,5.441687e+06,1,50
308745307,56004509513003128,-1.159941e+07,5.441688e+06,1,51


In [5]:
# Load the per-row education assignments (columns GISJOIN and education, per the output below).
df_education = cudf.read_parquet('./education_step_2.parquet/*')

In [6]:
# Sort by GISJOIN and renumber rows 0..n-1; drop=True avoids creating a throwaway
# 'index' column that would otherwise have to be removed again.
df_education = df_education.sort_values('GISJOIN').reset_index(drop=True)

In [7]:
# Inspect the education table after sorting by GISJOIN.
df_education

Unnamed: 0,GISJOIN,education
0,1000100201001,0
1,1000100201001,0
2,1000100201001,0
3,1000100201001,0
4,1000100201001,0
...,...,...
308745304,56004509513003,14
308745305,56004509513003,14
308745306,56004509513003,14
308745307,56004509513003,14


In [8]:
# Load the mapping from the GISJOIN key used by the education table to the original
# full-length GISJOIN (GISJOIN_og).
df_education_with_gis_mappings = cudf.read_parquet('./GISJOIN_to_GISJOIN_og_mapping.parquet/*')
# The mapping's 'sex' column is not needed here (df already carries sex); remove it in
# place so the large frame is not copied.
df_education_with_gis_mappings.drop(columns=['sex'], inplace=True)

In [9]:
# Sort the mapping by the same key as df_education so the two tables line up row by row;
# drop=True renumbers the rows without creating an intermediate 'index' column.
df_education_with_gis_mappings = df_education_with_gis_mappings.sort_values('GISJOIN').reset_index(drop=True)

In [10]:
# Inspect the mapping table after sorting by GISJOIN.
df_education_with_gis_mappings

Unnamed: 0,GISJOIN,GISJOIN_og
0,1000100201001,1000100201001025
1,1000100201001,1000100201001025
2,1000100201001,1000100201001025
3,1000100201001,1000100201001025
4,1000100201001,1000100201001025
...,...,...
308745304,56004509513003,56004509513003128
308745305,56004509513003,56004509513003128
308745306,56004509513003,56004509513003128
308745307,56004509513003,56004509513003128


In [11]:
# Place the education table's GISJOIN next to the mapping's GISJOIN. Using .values pairs
# the rows purely by position (both tables were sorted by GISJOIN above), so the two keys
# can be compared in the next cell.
df_education_with_gis_mappings['GISJOIN_education'] = df_education['GISJOIN'].values

In [12]:
# Show rows whose two keys disagree; an empty result means the positional pairing of the
# mapping table and the education table is consistent.
df_education_with_gis_mappings[
    df_education_with_gis_mappings['GISJOIN'] != df_education_with_gis_mappings['GISJOIN_education']
]

Unnamed: 0,GISJOIN,GISJOIN_og,GISJOIN_education


In [13]:
# Carry the education values over to the mapping table, again paired by row position.
df_education_with_gis_mappings['education'] = df_education['education'].values

In [14]:
# Remove the comparison helper and the education-side GISJOIN key in one in-place call;
# only GISJOIN_og and education are needed from here on.
df_education_with_gis_mappings.drop(columns=['GISJOIN_education', 'GISJOIN'], inplace=True)

In [15]:
# Promote the original full-length key to be the table's GISJOIN column.
# columns= is passed explicitly: a bare positional mapper targets index labels under the
# pandas-compatible rename signature, which would leave the column name unchanged.
df_education_with_gis_mappings = df_education_with_gis_mappings.rename(columns={'GISJOIN_og': 'GISJOIN'})

In [16]:
# Re-sort by the (now full-length) GISJOIN so this table lines up positionally with df,
# which was sorted by the same key; drop=True renumbers rows without an extra 'index' column.
df_education_with_gis_mappings = df_education_with_gis_mappings.sort_values('GISJOIN').reset_index(drop=True)

In [17]:
# Inspect the education table now keyed and sorted by the full-length GISJOIN.
df_education_with_gis_mappings

Unnamed: 0,GISJOIN,education
0,1000100201001000,8
1,1000100201001000,8
2,1000100201001000,8
3,1000100201001000,8
4,1000100201001000,8
...,...,...
308745304,56004509513003128,14
308745305,56004509513003128,14
308745306,56004509513003128,14
308745307,56004509513003128,14


In [18]:
# Drop the notebook's reference to the education table; it is not used again below.
del df_education

### Merge education into the lat-long table (which already carries age and sex)

In [19]:
# Preview the lat-long/age/sex table before the education column is attached.
df.head()

Unnamed: 0,GISJOIN,x,y,sex,age
0,1000100201001000,-9627034.0,3825003.0,0,67
1,1000100201001000,-9626834.0,3825034.0,0,69
2,1000100201001000,-9626683.0,3825079.0,0,67
3,1000100201001000,-9627138.0,3825075.0,0,67
4,1000100201001000,-9627112.0,3824731.0,0,74


In [20]:
# Attach the education table's key and value to df by row position (.values ignores the
# index). Both frames were sorted by GISJOIN; GISJOIN_education is kept temporarily so the
# pairing can be checked before it is dropped.
df['GISJOIN_education'] = df_education_with_gis_mappings['GISJOIN'].values
df['education'] = df_education_with_gis_mappings['education'].values

In [21]:
# Drop the notebook's reference to the mapping table now that its columns live in df.
del df_education_with_gis_mappings

In [22]:
# Inspect the combined table, including the temporary GISJOIN_education column.
df

Unnamed: 0,GISJOIN,x,y,sex,age,GISJOIN_education,education
0,1000100201001000,-9.627034e+06,3.825003e+06,0,67,1000100201001000,8
1,1000100201001000,-9.626834e+06,3.825034e+06,0,69,1000100201001000,8
2,1000100201001000,-9.626683e+06,3.825079e+06,0,67,1000100201001000,8
3,1000100201001000,-9.627138e+06,3.825075e+06,0,67,1000100201001000,8
4,1000100201001000,-9.627112e+06,3.824731e+06,0,74,1000100201001000,8
...,...,...,...,...,...,...,...
308745304,56004509513003128,-1.159921e+07,5.441573e+06,1,52,56004509513003128,14
308745305,56004509513003128,-1.159932e+07,5.441618e+06,1,50,56004509513003128,14
308745306,56004509513003128,-1.159941e+07,5.441687e+06,1,50,56004509513003128,14
308745307,56004509513003128,-1.159941e+07,5.441688e+06,1,51,56004509513003128,14


In [23]:
# Validate the positional pairing: each row's GISJOIN must equal the GISJOIN that came
# along with its education value. The comparison has to be against GISJOIN_education --
# comparing GISJOIN with itself is trivially True and verifies nothing.
# Fail fast here, because the helper column is dropped and the result written to disk next.
assert (df.GISJOIN == df.GISJOIN_education).all(), \
    "GISJOIN mismatch between lat-long rows and education rows"
(df.GISJOIN == df.GISJOIN_education).unique()

0    True
Name: GISJOIN, dtype: bool

In [24]:
# The helper key has served its purpose; remove it in place (no copy of the large frame)
# so it is not written to the output file.
df.drop(columns=['GISJOIN_education'], inplace=True)

In [25]:
# Persist the combined table. The warning recorded below indicates that this write is
# performed on the CPU via PyArrow.
df.to_parquet('../data/all_states_lat_long_education_age_sex.parquet')

  "Using CPU via PyArrow to write Parquet dataset, this will "
