# Combining cow with other attributes

In [1]:
import cudf
import pyarrow as pa
import pandas as pd
import numpy as np
import cupy as cp
import os

In [2]:
# Restrict this process to GPU 0.
# NOTE(review): cudf and cupy are imported in the previous cell. CUDA reads
# CUDA_VISIBLE_DEVICES when the driver is initialized, so if those imports
# already initialized CUDA this assignment has no effect -- consider setting
# it before importing any GPU library. TODO confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [3]:
# Load every parquet part of the base per-person table onto the GPU.
base_table_glob = '../data/all_states_lat_long_income_education_age_sex.parquet/*'
df = cudf.read_parquet(base_table_glob)

In [4]:
# Order the base table by its GISJOIN identifier, then display it.
df = df.sort_values(by='GISJOIN')
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,age,education,income
52128,1000100201001000,-9.626773e+06,3.824702e+06,1,35,49,0,19
52129,1000100201001000,-9.627132e+06,3.824709e+06,1,36,46,0,19
52130,1000100201001000,-9.626740e+06,3.825409e+06,1,37,48,0,19
52131,1000100201001000,-9.627007e+06,3.825576e+06,1,38,48,0,19
52132,1000100201001000,-9.626684e+06,3.825260e+06,1,39,47,0,19
...,...,...,...,...,...,...,...,...
308729240,56004509513003128,-1.159921e+07,5.441573e+06,1,308745304,35,12,4
308729241,56004509513003128,-1.159932e+07,5.441618e+06,1,308745305,35,12,4
308729242,56004509513003128,-1.159941e+07,5.441687e+06,1,308745306,38,12,4
308729243,56004509513003128,-1.159941e+07,5.441688e+06,1,308745307,35,12,4


In [5]:
# Inspect the column dtypes of the base table.
df.dtypes

GISJOIN        int64
x            float64
y            float64
sex             int8
p_id           int32
age             int8
education       int8
income          int8
dtype: object

In [6]:
# Load every parquet part of the cow table produced by the previous step.
cow_table_glob = './cow_step_2.parquet/*'
df_cow = cudf.read_parquet(cow_table_glob)

In [7]:
# Store the cow code as int8 to keep the column small.
df_cow['cow'] = df_cow['cow'].astype('int8')
# Sort by GISJOIN and rebuild a fresh 0..n-1 index. drop=True discards the old
# index directly instead of materializing it as an 'index' column that then
# has to be removed with the deprecated DataFrame.drop_column.
df_cow = df_cow.sort_values('GISJOIN').reset_index(drop=True)

In [8]:
# Inspect the column dtypes of the cow table after the int8 cast.
df_cow.dtypes

GISJOIN    int64
cow         int8
dtype: object

In [9]:
# Display the GISJOIN-sorted cow table.
df_cow

Unnamed: 0,GISJOIN,cow
0,1000100201001,0
1,1000100201001,0
2,1000100201001,0
3,1000100201001,0
4,1000100201001,0
...,...,...
243227903,56004509513003,6
243227904,56004509513003,6
243227905,56004509513003,6
243227906,56004509513003,6


In [10]:
# Load the GISJOIN -> GISJOIN_og mapping table and store age as int8.
mapping_glob = './GISJOIN_to_GISJOIN_og_mapping.parquet/*'
df_cow_with_gis_mappings = cudf.read_parquet(mapping_glob)
df_cow_with_gis_mappings['age'] = df_cow_with_gis_mappings['age'].astype('int8')

In [11]:
# Sort the mapping table by GISJOIN and rebuild a fresh 0..n-1 index.
# drop=True discards the old index directly, so no temporary 'index' column is
# created and the deprecated DataFrame.drop_column call is no longer needed.
df_cow_with_gis_mappings = (
    df_cow_with_gis_mappings
    .sort_values('GISJOIN')
    .reset_index(drop=True)
)

In [12]:
# Remove the GISJOIN_og column. Uses DataFrame.drop(columns=...) in place of
# the deprecated in-place DataFrame.drop_column.
df_cow_with_gis_mappings = df_cow_with_gis_mappings.drop(columns=['GISJOIN_og'])

In [13]:
# Display the GISJOIN-sorted mapping table.
df_cow_with_gis_mappings

Unnamed: 0,GISJOIN,p_id,sex,age
0,1000100201001,477,1,72
1,1000100201001,478,1,81
2,1000100201001,479,1,80
3,1000100201001,421,1,17
4,1000100201001,423,1,17
...,...,...,...,...
243227903,56004509513003,308744465,0,93
243227904,56004509513003,308744466,0,96
243227905,56004509513003,308744467,0,92
243227906,56004509513003,308744468,0,85


In [14]:
# Copy df_cow's GISJOIN column across by position (via .values, so no index
# alignment takes place). Both tables were sorted by GISJOIN beforehand.
cow_gisjoin_values = df_cow['GISJOIN'].values
df_cow_with_gis_mappings['GISJOIN_cow'] = cow_gisjoin_values

In [15]:
# Sanity check: list rows whose own GISJOIN differs from the GISJOIN copied
# positionally from df_cow. An empty result means the two sorted tables agree
# on GISJOIN row for row.
df_cow_with_gis_mappings.query('GISJOIN != GISJOIN_cow')

Unnamed: 0,GISJOIN,p_id,sex,age,GISJOIN_cow


In [16]:
# Copy the cow codes across by position (via .values, so no index alignment
# takes place); this relies on both tables being in the same GISJOIN order.
cow_values = df_cow['cow'].values
df_cow_with_gis_mappings['cow'] = cow_values

In [17]:
# Remove the two GISJOIN columns from the mapping table. One
# DataFrame.drop(columns=...) call replaces two calls to the deprecated
# in-place DataFrame.drop_column.
df_cow_with_gis_mappings = df_cow_with_gis_mappings.drop(
    columns=['GISJOIN_cow', 'GISJOIN']
)

In [18]:
# Count how many rows carry each cow code.
df_cow_with_gis_mappings.cow.value_counts()

0    161573949
3     19432921
2     19202718
6     15896895
4     11522611
1      8509315
5      6529395
7       560104
Name: cow, dtype: int32

In [19]:
# The cow table has been copied into the mapping table; unbind it.
del df_cow

### Merge cow into the lat-long table

In [20]:
# Display the per-person table (p_id, sex, age, cow) that will be merged.
df_cow_with_gis_mappings

Unnamed: 0,p_id,sex,age,cow
0,477,1,72,0
1,478,1,81,0
2,479,1,80,0
3,421,1,17,0
4,423,1,17,0
...,...,...,...,...
243227903,308744465,0,93,6
243227904,308744466,0,96,6
243227905,308744467,0,92,6
243227906,308744468,0,85,6


In [21]:
# Left join on p_id so every row of the base table is kept; rows with no match
# in the mapping table get nulls in the columns that come from it. The sex and
# age columns exist on both sides and are therefore suffixed _x / _y.
df = df.merge(
    df_cow_with_gis_mappings,
    how='left',
    on='p_id',
)

In [22]:
# Display the merged table, including the suffixed sex/age columns.
df

Unnamed: 0,GISJOIN,x,y,sex_x,p_id,age_x,education,income,sex_y,age_y,cow
0,1000100203001009,-9.624182e+06,3.826258e+06,0,5088,13,16,20,,,
1,1000100203001009,-9.624108e+06,3.826253e+06,0,5089,12,16,20,,,
2,1000100203001009,-9.624094e+06,3.826276e+06,0,5090,13,16,20,,,
3,1000100203001009,-9.624169e+06,3.826045e+06,0,5091,10,16,20,,,
4,1000100203001009,-9.624141e+06,3.826127e+06,0,5092,12,16,20,,,
...,...,...,...,...,...,...,...,...,...,...,...
308745304,56004109752003864,-1.227103e+07,5.061271e+06,0,308712265,10,16,20,,,
308745305,56004109752003864,-1.227072e+07,5.062986e+06,0,308712266,11,16,20,,,
308745306,56004109752003864,-1.227213e+07,5.063594e+06,0,308712267,11,16,20,,,
308745307,56004109752003864,-1.227194e+07,5.061883e+06,0,308712268,11,16,20,,,


In [23]:
# Discard the sex/age copies that came from the right-hand table of the merge
# (suffix _y). One DataFrame.drop(columns=...) call replaces two calls to the
# deprecated in-place DataFrame.drop_column.
df = df.drop(columns=['age_y', 'sex_y'])

In [24]:
# Rows without a match in the left join have a null cow; assign them the
# sentinel code 8 (the codes present in the cow table are 0-7).
df['cow'] = df['cow'].fillna(8)
# Restore the original names of the columns that the merge suffixed with _x.
# The mapping is passed as columns= so the rename explicitly targets column
# labels; a bare positional mapper is applied to the index in the
# pandas-compatible rename signature and would leave the columns unchanged.
df = df.rename(columns={
    'sex_x': 'sex',
    'age_x': 'age',
})

In [25]:
# Inspect the dtypes of the combined table after the fill and rename.
df.dtypes

GISJOIN        int64
x            float64
y            float64
sex             int8
p_id           int32
age             int8
education       int8
income          int8
cow             int8
dtype: object

In [26]:
# The mapping table has been merged into df; unbind it.
del df_cow_with_gis_mappings

In [27]:
# Display the final combined table before writing it out.
df

Unnamed: 0,GISJOIN,x,y,sex,p_id,age,education,income,cow
0,1000100203001009,-9.624182e+06,3.826258e+06,0,5088,13,16,20,8
1,1000100203001009,-9.624108e+06,3.826253e+06,0,5089,12,16,20,8
2,1000100203001009,-9.624094e+06,3.826276e+06,0,5090,13,16,20,8
3,1000100203001009,-9.624169e+06,3.826045e+06,0,5091,10,16,20,8
4,1000100203001009,-9.624141e+06,3.826127e+06,0,5092,12,16,20,8
...,...,...,...,...,...,...,...,...,...
308745304,56004109752003864,-1.227103e+07,5.061271e+06,0,308712265,10,16,20,8
308745305,56004109752003864,-1.227072e+07,5.062986e+06,0,308712266,11,16,20,8
308745306,56004109752003864,-1.227213e+07,5.063594e+06,0,308712267,11,16,20,8
308745307,56004109752003864,-1.227194e+07,5.061883e+06,0,308712268,11,16,20,8


In [28]:
# Write the combined table, now including cow, to parquet.
combined_output_path = '../data/all_states_lat_long_cow_income_education_age_sex.parquet'
df.to_parquet(combined_output_path)

  "Using CPU via PyArrow to write Parquet dataset, this will "
