In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import geopandas as gpd

# process all the data for "the_year" and save yearly master data  

In [33]:
# Year whose outcome and environment files are loaded, merged and saved below.
the_year = 2019

In [34]:
# Input folder: the per-modality CSV files (controls, outcomes, environment) are read from here.
separate_data_folder = '../../data/point_data/separate_data/'
# Output folder: the merged yearly master files (.csv and .geojson) are written here.
collated_data_folder = '../../data/point_data/collated_data/'

# get different data modalities: controls, outcomes and environmental

In [35]:
# Load the control variables.
controls = pd.read_csv(separate_data_folder + 'controls.csv')

# Strip double quotes, colons and commas from the column names, one character at a time.
for unwanted_char in ('"', ':', ','):
    controls.columns = controls.columns.str.replace(unwanted_char, '')

# Prefix every control variable with "c_"; the two key columns keep their names.
control_key_columns = ['geography code', 'geography']
controls = controls.rename(
    columns={
        name: "c_" + name
        for name in controls.columns
        if name not in control_key_columns
    }
)

# Remove rows that are exact copies of another row.
controls = controls.drop_duplicates()

# Sanity check: rows, distinct rows, distinct index labels, distinct geography codes.
(
    len(controls),
    len(controls.drop_duplicates()),
    len(set(controls.index)),
    controls['geography code'].nunique(dropna=False),
)

(35672, 35672, 35672, 35672)

In [36]:
# Load the outcome variables for the selected year.
outcomes = pd.read_csv(f'{separate_data_folder}{the_year}_outcomes.csv')

# Prefix every outcome variable with "o_"; the two key columns keep their names.
outcome_key_columns = ['geography code', 'geography']
outcomes = outcomes.rename(
    columns={
        name: "o_" + name
        for name in outcomes.columns
        if name not in outcome_key_columns
    }
)

# Remove rows that are exact copies of another row.
outcomes = outcomes.drop_duplicates()

# Sanity check: rows, distinct rows, distinct index labels, distinct geography codes.
(
    len(outcomes),
    len(outcomes.drop_duplicates()),
    len(set(outcomes.index)),
    outcomes['geography code'].nunique(dropna=False),
)

(32833, 32833, 32833, 32833)

In [37]:
# Load the environmental scores for the selected year, align the key column name
# with the other modalities and discard the LSOA name column.
env = (
    pd.read_csv(f'{separate_data_folder}{the_year}_environment.csv')
    .rename(columns={'LSOA21CD': 'geography code'})
    .drop(columns='LSOA21NM')
)

# Prefix every environmental variable with "e_"; key columns keep their names.
env_key_columns = ['geography code', 'geography']
env = env.rename(
    columns={
        name: "e_" + name
        for name in env.columns
        if name not in env_key_columns
    }
)

# Remove rows that are exact copies of another row.
env = env.drop_duplicates()

# Sanity check: rows, distinct rows, distinct index labels, distinct geography codes.
(
    len(env),
    len(env.drop_duplicates()),
    len(set(env.index)),
    env['geography code'].nunique(dropna=False),
)

(36926, 36926, 36926, 33804)

# Merge the different modalities

In [38]:
# Inner-join the three modalities on the LSOA code: controls -> outcomes -> environment.
# Only geography codes present in all three tables survive.
data = (
    controls
    .merge(outcomes, on=['geography code'])
    .merge(env, on=['geography code'])
)

In [39]:
# Post-merge sanity check: rows, distinct rows, distinct index labels, distinct geography codes.
(
    data.shape[0],
    data.drop_duplicates().shape[0],
    len(set(data.index)),
    data['geography code'].nunique(dropna=False),
)

(34786, 34786, 34786, 31799)

### remove duplicates

We have some duplicates because the environmental code produces scores for some LSOAs twice: we process LSOAs in batches by region, and some LSOAs fall into two different regions. These rows were not removed by the drop_duplicates call we ran above because some environmental scores differ slightly between the two copies. The differences arise because, when calculating some of the scores, we randomly subsample the yearly satellite products.

In [40]:
# group the dataframe by 'geography code' and select the first element from each group
# Keep exactly one complete row per geography code: the first one in merge order.
# groupby(...).first() is deliberately NOT used here: it returns the first
# *non-null* value of every column, so a single output row could be stitched
# together from different duplicate rows. drop_duplicates keeps a whole row.
data = (
    data.dropna(subset=['geography code'])   # groupby also discarded rows with a null key
        .drop_duplicates(subset='geography code', keep='first')
        .sort_values('geography code')       # groupby returned the groups sorted by key
        .reset_index(drop=True)
)
# Keep the key as the first column, as the groupby/reset_index version did.
data = data[['geography code'] + [col for col in data.columns if col != 'geography code']]

# Sanity check: all four numbers must now be equal (one row per geography code).
len(data), len(data.drop_duplicates()), len(set(data.index)), len(data['geography code'].unique())

(31799, 31799, 31799, 31799)

# ADD SPATIAL ELEMENTS INTO THE MASTER

# read Region and LSOA shapefiles

In [41]:
# Region boundaries (December 2022 shapefile).
regions = gpd.read_file(
    '../../data/auxiliary_data/regions_2022/'
    'Regions_(December_2022)_EN_BFC/'
    'Regions_(December_2022)_EN_BFC.shp'
)

# LSOA boundaries (December 2021 shapefile).
lsoas = gpd.read_file(
    '../../data/auxiliary_data/lsoas_2021/'
    'LSOA_(Dec_2021)_Boundaries_Generalised_Clipped_EW_(BGC)/'
    'LSOA_(Dec_2021)_Boundaries_Generalised_Clipped_EW_(BGC).shp'
)

In [42]:
# Number of LSOA features read from the boundary shapefile.
len(lsoas)

35672

# spatial join LSOA REGION

In [43]:
# Attach region attributes to every LSOA polygon that lies within a region polygon.
# Inner join: LSOAs that are not within any region are dropped from the result.
lsoas_regions = gpd.sjoin(
    left_df=lsoas,
    right_df=regions,
    how='inner',
    predicate='within',
)

In [44]:
# Joined rows vs. distinct LSOA codes: equal numbers mean each LSOA matched one region only.
joined_lsoa_codes = lsoas_regions['LSOA21CD']
len(joined_lsoa_codes), len(set(joined_lsoa_codes))

(30523, 30523)

In [45]:
# LSOA -> region lookup table (codes and names only), without repeated rows.
mapping_columns = ['LSOA21CD', 'RGN22CD', 'RGN22NM', 'LSOA21NM']
lsoas_regions_mapping = lsoas_regions[mapping_columns].drop_duplicates()

## save the mapping

This can be used later, if needed in some analyses so we save it.

In [46]:
# lsoas_regions_mapping.to_csv('../../data/auxiliary_data/lsoas_regions_mapping.csv', index=None)

## add geo columns to the point data. we will create two master files: 

### 1) one as a csv
### 2) second as a geojson

In [47]:
# region_geo_columns  = lsoas_regions[['LSOA21CD', 'RGN22CD', 'RGN22NM', 'LSOA21NM', 'geometry']]
# Spatial columns to attach to the point data: LSOA code, LSOA name and polygon.
lsoa_geo_fields = ['LSOA21CD', 'LSOA21NM', 'geometry']
geo_columns = lsoas[lsoa_geo_fields]

In [48]:
# Rename the LSOA code column so it matches the key used in the point data
# (the region variant would additionally rename 'RGN22NM' to 'region').
lsoa_geometries = geo_columns.rename(columns={'LSOA21CD': 'geography code'})

# Inner-join the merged point data with the LSOA names and polygons.
spatial_data = data.merge(lsoa_geometries, on='geography code')

In [49]:
# region_spatial_data = data.merge(region_geo_columns.\
#                 rename(columns={'LSOA21CD':'geography code', 'RGN22NM':'region'}),
#                 on='geography code')

In [50]:
# Index the master table by LSOA code. The guard keeps this cell re-runnable:
# on a second execution the column has already been moved into the index, and
# an unconditional set_index would raise a KeyError.
if spatial_data.index.name != 'geography code':
    spatial_data = spatial_data.set_index('geography code')

# Preview the first rows.
spatial_data.head()

Unnamed: 0_level_0,c_percent asian,c_percent black,c_percent mixed,c_percent white,c_percent christian,c_percent jewish,c_percent no religion,c_percent muslim,c_percent no central heating,c_percent wood heating,...,e_trees,e_grass,e_flooded_vegetation,e_crops,e_shrub_and_scrub,e_built,e_bare,e_snow_and_ice,LSOA21NM,geometry
geography code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E01000001,0.107191,0.007463,0.037992,0.801221,0.345763,0.019661,0.48339,0.021695,0.02619,0.0,...,0.053416,0.034901,0.038713,0.031422,0.06179,0.401096,0.094191,0.138707,City of London 001A,"POLYGON ((532105.312 182010.574, 532162.491 18..."
E01000002,0.130592,0.007937,0.04329,0.782107,0.33815,0.025289,0.491329,0.018786,0.019417,0.001214,...,0.047842,0.031679,0.038039,0.030665,0.060742,0.380617,0.099214,0.176378,City of London 001B,"POLYGON ((532634.497 181926.016, 532619.141 18..."
E01000003,0.094913,0.034739,0.062655,0.741315,0.34036,0.026658,0.479851,0.030998,0.035329,0.0,...,0.056071,0.0339,0.035725,0.030262,0.056361,0.45326,0.079852,0.156257,City of London 001C,"POLYGON ((532135.138 182198.131, 532158.250 18..."
E01000005,0.321526,0.108084,0.071753,0.385104,0.367514,0.012704,0.221416,0.309437,0.012526,0.0,...,0.046214,0.030626,0.035777,0.029781,0.064675,0.435324,0.13017,0.119792,City of London 001E,"POLYGON ((533808.018 180767.774, 533649.037 18..."
E01000006,0.479675,0.108401,0.03794,0.327913,0.371614,0.000542,0.070423,0.391116,0.027076,0.0,...,0.08752,0.043782,0.034725,0.039805,0.066398,0.508778,0.085872,0.052288,Barking and Dagenham 016A,"POLYGON ((545122.049 184314.931, 545271.849 18..."


In [51]:
# region_spatial_data = region_spatial_data.set_index('geography code')
# region_spatial_data.head()

In [52]:
# Row count of the master table built without region columns; including the
# region would lose more than 2K rows (LSOAs).
spatial_data.shape[0]  # , region_spatial_data.shape[0]

31799

# add centroid point for each LSOA

In [53]:
# Promote the merged table to a GeoDataFrame with the LSOA polygon as geometry.
spatial_data = gpd.GeoDataFrame(spatial_data, geometry='geometry')

# Make sure the geometries are expressed in the CRS of the LSOA shapefile.
spatial_data = spatial_data.to_crs(lsoas.crs)

# Compute each polygon's centroid once and reuse it for all three columns.
lsoa_centroids = spatial_data['geometry'].centroid
spatial_data['center_coordinates'] = lsoa_centroids

# Vectorised coordinate access instead of a row-by-row apply(lambda p: p.x).
spatial_data["centroid_x"] = lsoa_centroids.x
spatial_data["centroid_y"] = lsoa_centroids.y

In [54]:
# region_spatial_data = gpd.GeoDataFrame(region_spatial_data, geometry='geometry')
# region_spatial_data = region_spatial_data.to_crs(lsoas.crs)
# region_spatial_data['center_coordinates'] = region_spatial_data['geometry'].centroid

# region_spatial_data["centroid_x"] = region_spatial_data["center_coordinates"].apply(lambda p: p.x)
# region_spatial_data["centroid_y"] = region_spatial_data["center_coordinates"].apply(lambda p: p.y)

In [55]:
# Final (rows, columns) of the master table before it is written to disk.
print(spatial_data.shape) #, print(region_spatial_data.shape)

(31799, 108)


## save .csv master file

In [56]:
# Write the yearly master table as CSV; the 'geography code' index is written as well.
csv_master_path = f'{collated_data_folder}{the_year}_spatial_raw_master.csv'
spatial_data.to_csv(csv_master_path)

In [57]:
# region_spatial_data.to_csv('../../data/point_data/{}_region_spatial_raw_master.csv'.format(the_year))

## save .geojson master file

In [58]:
# Here the LSOA polygon stays the main geometry, and the LSOA centroid points are
# saved alongside it. This is useful for some other types of models, such as
# spatial regression.
geojson_master_path = f'{collated_data_folder}{the_year}_spatial_raw_master.geojson'

# Work on a copy so that `spatial_data` keeps its geometry-typed centroid column.
spatial_data_lsoa = spatial_data.copy()

# Store the centroids as text instead of deleting the column
# (presumably because the output can carry only one geometry column -- TODO confirm).
spatial_data_lsoa['center_coordinates'] = spatial_data_lsoa['center_coordinates'].astype(str)

spatial_data_lsoa.to_file(geojson_master_path, driver='GeoJSON')


# # here, we keep LSOA centroid points as the main geometry, but we also save the polygon geometries of LSOAs as lsoa_geometry. the reason is that for spatial fold validation we need point coordinates for the geometry
# spatial_data.rename(columns={'geometry':'lsoa_geometry', 'center_coordinates':'geometry'}, inplace=True)
# # spatial_data['center_coordinates'] = spatial_data['center_coordinates'].astype(str)
# spatial_data['lsoa_geometry'] = spatial_data['lsoa_geometry'].astype(str)
# spatial_data.to_file('../../data/point_data/{}_spatial_raw_master.geojson'.format(the_year), driver='GeoJSON')

In [59]:
# for col in spatial_data.columns:
#     if spatial_data[col].dtype != 'float64':
#         print(col, spatial_data[col].dtype)

In [61]:
# region_spatial_data.rename(columns={'geometry':'lsoa_geometry', 'center_coordinates':'geometry'}, inplace=True)

# # spatial_data['center_coordinates'] = spatial_data['center_coordinates'].astype(str)
# region_spatial_data['lsoa_geometry'] = region_spatial_data['lsoa_geometry'].astype(str)

# region_spatial_data.to_file('../../data/point_data/{}_region_spatial_raw_master.geojson'.format(the_year), driver='GeoJSON')