# Combining education with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Restrict this process to GPU 0.
# NOTE(review): this runs after `cudf`/`cupy` were imported in the previous cell;
# CUDA_VISIBLE_DEVICES is presumably only honoured if set before the CUDA runtime
# initialises — confirm, and consider moving this above the GPU imports.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [3]:
# Load the per-person lat-long/age/sex table from all files matching the glob
# inside the parquet dataset directory.
df = cudf.read_parquet('../data/all_states_lat_long_age_sex.parquet/*')

In [4]:
# Order rows by GISJOIN, then display the sorted frame.
df = df.sort_values('GISJOIN')
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,age
7040,1000100201001000,-9.627138e+06,3.825267e+06,1,32,46
7041,1000100201001000,-9.627491e+06,3.825096e+06,1,33,47
7042,1000100201001000,-9.626667e+06,3.824849e+06,1,34,49
7043,1000100201001000,-9.626773e+06,3.824702e+06,1,35,49
7044,1000100201001000,-9.627132e+06,3.824709e+06,1,36,46
...,...,...,...,...,...,...
308741752,56004509513003128,-1.159921e+07,5.441573e+06,1,308745304,35
308741753,56004509513003128,-1.159932e+07,5.441618e+06,1,308745305,35
308741754,56004509513003128,-1.159941e+07,5.441687e+06,1,308745306,38
308741755,56004509513003128,-1.159941e+07,5.441688e+06,1,308745307,35


In [5]:
# Show the dtype of each column in the lat-long table.
df.dtypes

GISJOIN      int64
x          float64
y          float64
sex           int8
p_id         int32
age           int8
dtype: object

In [6]:
# Load the education table (output of an earlier step) from all files matching
# the glob inside the parquet dataset directory.
df_education = cudf.read_parquet('./education_step_2.parquet/*')



In [7]:
# Narrow the education codes to int8.
df_education['education'] = df_education['education'].astype('int8')
# Sort by GISJOIN and renumber rows 0..n-1. `drop=True` discards the old index
# directly instead of materialising it as an 'index' column that then has to be
# removed with the legacy `drop_column` helper.
df_education = df_education.sort_values('GISJOIN').reset_index(drop=True)

In [8]:
# Show the dtype of each column in the education table after the int8 cast.
df_education.dtypes

GISJOIN      int64
education     int8
dtype: object

In [9]:
# Display the GISJOIN-sorted education table.
df_education

Unnamed: 0,GISJOIN,education
0,1000100201001,0
1,1000100201001,0
2,1000100201001,0
3,1000100201001,0
4,1000100201001,0
...,...,...
204127475,56004509513003,14
204127476,56004509513003,14
204127477,56004509513003,14
204127478,56004509513003,14


In [10]:
# Load the per-person GISJOIN mapping table from all files matching the glob,
# then narrow its age column to int8. The 'sex' column is not dropped here.
df_education_with_gis_mappings = cudf.read_parquet(
    './GISJOIN_to_GISJOIN_og_mapping.parquet/*'
)
df_education_with_gis_mappings['age'] = (
    df_education_with_gis_mappings['age'].astype('int8')
)

In [11]:
# Sort by GISJOIN and renumber rows 0..n-1. `drop=True` discards the old index
# directly instead of materialising it as an 'index' column that then has to be
# removed with the legacy `drop_column` helper.
df_education_with_gis_mappings = (
    df_education_with_gis_mappings
    .sort_values('GISJOIN')
    .reset_index(drop=True)
)

In [12]:
# Remove the GISJOIN_og column. Uses the standard `drop(columns=...)` API in
# place of the legacy `drop_column` helper; `inplace=True` keeps the original
# mutate-in-place behaviour so the large frame is not duplicated.
df_education_with_gis_mappings.drop(columns=['GISJOIN_og'], inplace=True)

In [13]:
# Display the sorted, re-indexed mapping table.
df_education_with_gis_mappings

Unnamed: 0,p_id,GISJOIN,sex,age
0,32,1000100201001,1,46
1,33,1000100201001,1,47
2,34,1000100201001,1,49
3,35,1000100201001,1,49
4,36,1000100201001,1,46
...,...,...,...,...
204127475,308744092,56004509513003,1,57
204127476,308744093,56004509513003,1,56
204127477,308744094,56004509513003,1,55
204127478,308744095,56004509513003,1,55


In [14]:
# Attach df_education's GISJOIN values as a new column. `.values` is used, so the
# assignment is by row position; both frames were sorted by GISJOIN earlier.
# NOTE(review): this presumes both frames have the same number of rows — confirm.
df_education_with_gis_mappings['GISJOIN_education'] = df_education.GISJOIN.values

In [15]:
# Sanity check on the positional attach: every row's own GISJOIN must equal the
# GISJOIN copied over from df_education. Assert it so a misalignment stops the
# run instead of relying on someone noticing a non-empty frame.
gisjoin_mismatches = df_education_with_gis_mappings.query('GISJOIN != GISJOIN_education')
assert len(gisjoin_mismatches) == 0, (
    f"{len(gisjoin_mismatches)} rows have GISJOIN != GISJOIN_education; "
    "the education and mapping tables are not positionally aligned"
)
gisjoin_mismatches

Unnamed: 0,p_id,GISJOIN,sex,age,GISJOIN_education


In [16]:
# Copy the education codes across by row position (via `.values`), relying on
# the GISJOIN alignment checked in the preceding cell.
df_education_with_gis_mappings['education'] = df_education.education.values

In [17]:
# Remove both GISJOIN columns now that education has been attached. Uses the
# standard `drop(columns=...)` API in place of the legacy `drop_column` helper;
# `inplace=True` keeps the original mutate-in-place behaviour.
df_education_with_gis_mappings.drop(
    columns=['GISJOIN_education', 'GISJOIN'],
    inplace=True,
)

In [18]:
# Count how many rows carry each education code.
df_education_with_gis_mappings.education.value_counts()

8     58971601
12    36159913
10    29250841
11    15354934
13    14780855
9     12869643
6      5130563
5      4827753
3      4823941
14     4033904
4      3904574
7      3837555
2      3717882
15     2409375
0      2291035
1      1763111
Name: education, dtype: int32

In [19]:
# The education table is no longer needed; drop the name so its memory can be
# reclaimed.
del df_education

### Merge education into the lat-long table (joined on `p_id`)

In [20]:
# Display the per-person table (p_id, sex, age, education) that will be merged.
df_education_with_gis_mappings

Unnamed: 0,p_id,sex,age,education
0,32,1,46,0
1,33,1,47,0
2,34,1,49,0
3,35,1,49,0
4,36,1,46,0
...,...,...,...,...
204127475,308744092,1,57,14
204127476,308744093,1,56,14
204127477,308744094,1,55,14
204127478,308744095,1,55,14


In [21]:
# Left-join education onto the lat-long table by person id, keeping every
# lat-long row even when it has no education record. Columns present on both
# sides (sex, age) come back suffixed `_x` (left) and `_y` (right).
# Row order is not preserved by the merge.
rows_before_merge = len(df)
df = df.merge(df_education_with_gis_mappings, on='p_id', how='left')
# A left join only keeps the row count when p_id is unique on the right-hand
# side; fail loudly if duplicate p_ids multiplied any rows.
assert len(df) == rows_before_merge, (
    f"merge changed the row count from {rows_before_merge} to {len(df)}; "
    "p_id is not unique in df_education_with_gis_mappings"
)

In [22]:
# Inspect the merged frame, including the suffixed sex/age columns and the
# null education values on rows without a match.
df

Unnamed: 0,GISJOIN,x,y,sex_x,p_id,age_x,sex_y,age_y,education
0,1000300103003128,-9.783459e+06,3.612968e+06,0,68800,16,,,
1,1000300103003128,-9.783386e+06,3.612591e+06,0,68801,15,,,
2,1000300103003128,-9.781875e+06,3.612724e+06,0,68802,17,,,
3,1000300103003128,-9.784575e+06,3.611945e+06,0,68803,15,,,
4,1000300103003128,-9.782779e+06,3.611633e+06,0,68804,16,,,
...,...,...,...,...,...,...,...,...,...
308745304,56004509513003072,-1.160424e+07,5.441253e+06,0,308744694,35,0,35,8
308745305,56004509513003072,-1.160460e+07,5.441445e+06,0,308744695,35,0,35,8
308745306,56004509513003072,-1.160437e+07,5.441351e+06,0,308744696,38,0,38,8
308745307,56004509513003072,-1.160458e+07,5.441955e+06,0,308744697,35,0,35,8


In [23]:
# Discard the right-hand copies of age and sex produced by the merge; the
# left-hand (`_x`) columns are kept. Uses the standard `drop(columns=...)` API
# in place of the legacy `drop_column` helper; `inplace=True` keeps the original
# mutate-in-place behaviour so the large frame is not duplicated.
df.drop(columns=['age_y', 'sex_y'], inplace=True)

In [24]:
# df.education.value_counts().sum()

In [25]:
# Rows with no education record come out of the left merge as nulls. Give them
# a dedicated code, which lies outside the 0-15 codes present in the education data.
NO_EDUCATION_RECORD = 16
df['education'] = df['education'].fillna(NO_EDUCATION_RECORD)

# Restore the original column names after the merge suffixing. `columns=` is
# passed explicitly: under the pandas-compatible signature a bare positional
# mapping is applied to the row index (axis=0), which would leave the columns
# unrenamed.
df.rename(
    columns={
        'sex_x': 'sex',
        'age_x': 'age',
    },
    inplace=True,
)

In [26]:
# The per-person education table has been merged into `df`; drop the name so
# its memory can be reclaimed.
del df_education_with_gis_mappings

In [27]:
# Display the final combined frame (GISJOIN, x, y, sex, p_id, age, education).
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,age,education
0,1000300103003128,-9.783459e+06,3.612968e+06,0,68800,16,16
1,1000300103003128,-9.783386e+06,3.612591e+06,0,68801,15,16
2,1000300103003128,-9.781875e+06,3.612724e+06,0,68802,17,16
3,1000300103003128,-9.784575e+06,3.611945e+06,0,68803,15,16
4,1000300103003128,-9.782779e+06,3.611633e+06,0,68804,16,16
...,...,...,...,...,...,...,...
308745304,56004509513003072,-1.160424e+07,5.441253e+06,0,308744694,35,8
308745305,56004509513003072,-1.160460e+07,5.441445e+06,0,308744695,35,8
308745306,56004509513003072,-1.160437e+07,5.441351e+06,0,308744696,38,8
308745307,56004509513003072,-1.160458e+07,5.441955e+06,0,308744697,35,8


In [28]:
# Persist the combined table as parquet under ../data.
# NOTE(review): the warning emitted by this cell indicates the write goes through
# PyArrow on the CPU as a dataset, so this step may be slow for a frame this size.
df.to_parquet('../data/all_states_lat_long_education_age_sex.parquet')

  "Using CPU via PyArrow to write Parquet dataset, this will "
