In [5]:
import os
import re
import sys
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from functools import partial, partialmethod
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from netCDF4 import Dataset
from tqdm import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.temperature import NcConverter, Aggregator
from projections import raster, utils

In [2]:
def save_location_mapping(row_and_path):
    """Write the raster mapping of a single shape to a CSV file.

    Parameters
    ----------
    row_and_path : tuple
        ``(row, path)`` pair. ``row`` is indexed by ``'geometry'`` and
        ``'id'``; ``path`` is the file the result is written to.

    Notes
    -----
    The shape is intersected with the module-level ``IMAGE`` through
    ``raster.find_subset_with_intersection_area``. A non-empty result gets
    an ``id`` column holding ``row['id']`` and is saved without its index.
    An empty result is recorded as a zero-byte file at ``path``.
    Returns ``None`` in both cases.
    """
    row, path = row_and_path
    cells = raster.find_subset_with_intersection_area(IMAGE, row['geometry'])

    if not cells.empty:
        cells['id'] = row['id']
        cells.to_csv(path, index=False)
        return

    # Nothing intersects this shape: leave an empty placeholder file behind.
    with open(path, 'w') as handle:
        handle.write('')


In [3]:
# Input location. Judging by its name the file is presumably the CRU TS v4.05
# monthly "tmp" NetCDF covering 1901-2020 -- TODO confirm against the data source.
read_path = Path("../Data/Temperature")
filename = "cru_ts4.05.1901.2020.tmp.dat.nc"

# Output locations. utils.make_path is a project helper; it presumably returns
# a pathlib.Path (the result is combined with `/` and has `.mkdir` called on it)
# -- verify whether it also creates the directory.
output_path = utils.make_path('../Output/Temperature/')
partial_path = utils.make_path(output_path / "partial")          # per-shape CSV mappings
by_country_path = utils.make_path(output_path / "by_country")    # per-region feather files

# Ensure the top-level output directory exists (no error if it already does).
output_path.mkdir(exist_ok=True)

# Map raster to polygons

In [4]:
# Polygons onto which the raster is mapped. Rows are consumed below through
# utils.yield_missing_shapes; save_location_mapping reads each row's
# 'geometry' and 'id' fields.
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [5]:
# Number of workers for the mapping step; 1 runs everything serially in the
# notebook process, which is easier to debug.
n_processes = 30

# Period 0 of the "tmp" variable is the reference raster that
# save_location_mapping intersects with every shape (via the global IMAGE).
converter = NcConverter()
converter.read(read_path / filename)
IMAGE = converter.get_xarray("tmp", period=0)

# Yields (row, path) pairs for save_location_mapping; output files are placed
# under partial_path with the 'p0' prefix.
iterator = partial(utils.yield_missing_shapes, save_path=partial_path, prefix='p0')

# Used as the progress-bar total in both branches. NOTE(review): if
# yield_missing_shapes skips shapes that already have output, this is an
# upper bound rather than the exact count.
n_shapes = geo_df.shape[0]

if n_processes == 1:
    for row_and_path in tqdm(iterator(geo_df), total=n_shapes):
        save_location_mapping(row_and_path)
else:
    # ThreadPoolExecutor must be imported from concurrent.futures; threads
    # share the module-level IMAGE that save_location_mapping reads.
    with ThreadPoolExecutor(n_processes) as tpe:
        # Draining the map iterator drives the progress bar and re-raises
        # any exception raised inside a worker.
        for _ in tqdm(
            tpe.map(save_location_mapping, iterator(geo_df)),
            total=n_shapes
        ):
            pass

100%|█████████████████████████████████| 122772/122772 [9:06:54<00:00,  3.74it/s]


# Combine partial mappings by region

In [6]:
# Group the per-shape partial CSVs by region id, then write one feather file
# per region.
df_by_region = {}
for file in tqdm(partial_path.glob('*.csv'), desc='Reading'):
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Zero-byte placeholder (shape without any raster overlap).
        continue

    if df.empty:
        # Header-only file: there is no first row to take the region id from.
        continue

    if 'id' not in df.columns:
        # Files without an 'id' column (presumably an earlier output format)
        # carry adm0/adm1/adm2 instead: use the most specific level available.
        # Assigning the chained result replaces the former
        # `df['id'].fillna(..., inplace=True)`, which acts on a temporary
        # Series and does not update the frame under pandas Copy-on-Write.
        df['id'] = df['adm2'].fillna(df['adm1']).fillna(df['adm0'])

    # All rows of one partial file belong to the same region, so the first
    # row identifies it.
    region = df['id'].iloc[0]
    df_by_region.setdefault(region, []).append(df)

for region, dfs in tqdm(df_by_region.items(), desc='Saving'):
    df = utils.combine_dataframes(dfs)
    df.to_feather(by_country_path / f'{region}.feather')

# Release the (potentially large) dictionary of partial frames.
del df_by_region

Reading: 122772it [03:02, 671.42it/s]
Saving: 100%|████████████████████████████| 48140/48140 [01:12<00:00, 668.36it/s]


# Map all time periods and variables

In [12]:
# Open the NetCDF file again so this cell does not depend on the converter
# created in the mapping section.
converter = NcConverter()
converter.read(read_path / filename)

# Build the mappings once in the parent process; `aggregate` hands them to
# every Aggregator it creates through the `mapping_dfs` argument.
base_agg = Aggregator(
    by_country_path=by_country_path,
    partial_path=partial_path,
    in_memory=True,
)
mapping_dfs = base_agg.get_mapping_dfs()

def aggregate(image):
    """Aggregate one raster period using the shared region mappings.

    A new ``Aggregator`` is created for every call, configured with the
    module-level ``by_country_path``, ``partial_path`` and ``mapping_dfs``,
    and its ``aggregate`` method is run on ``image``.

    Returns ``None``: the value of ``Aggregator.aggregate`` is discarded.
    """
    Aggregator(
        by_country_path=by_country_path,
        partial_path=partial_path,
        in_memory=True,
        mapping_dfs=mapping_dfs,
    ).aggregate(image)

# Number of worker processes; 1 runs serially in the notebook process.
n_processes = 31

print(f"Running with {n_processes} processes")

if n_processes == 1:
    for variable in ("tmp", "stn"):
        for image in converter.iter_periods(attribute_name=variable):
            # aggregate() returns None, so there is nothing worth printing.
            aggregate(image)
else:
    # Silence tqdm bars while the workers run. The patch is applied once,
    # before any pool is created, and undone afterwards so that progress bars
    # in later cells keep working.
    original_tqdm_init = tqdm.__init__
    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
    try:
        for variable in ("tmp", "stn"):
            with ProcessPoolExecutor(n_processes) as pool:
                # Draining the iterator waits for completion and re-raises
                # any exception raised in a worker.
                for _ in pool.map(
                    aggregate, converter.iter_periods(attribute_name=variable)
                ):
                    pass
    finally:
        tqdm.__init__ = original_tqdm_init

Reading: 48140it [01:09, 697.31it/s]

Running with 31 processes





None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [13]:
def rename_value_column(df, file_name):
    """Rename the ``value`` column of ``df`` in place to ``<var>_<yearmonth>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``value`` column is renamed (modified in place).
    file_name : str
        Name of the file the frame was read from, of the form
        ``<var>_p<N>.csv`` where ``N`` is the period number passed to
        ``time_n_to_yearmonth``.

    Raises
    ------
    ValueError
        If ``file_name`` does not have the expected form.
    """
    # Anchored match with an escaped dot: the whole name must be
    # "<var>_p<digits>.csv", otherwise fail with a readable message.
    match = re.fullmatch(r"(?P<var>.+)_p(?P<n>\d+)\.csv", file_name)
    if match is None:
        raise ValueError(
            f"Unexpected file name {file_name!r}; expected '<var>_p<N>.csv'"
        )
    var = match.group("var")
    yearmonth = time_n_to_yearmonth(match.group("n"))
    df.rename(columns={"value": f"{var}_{yearmonth}"}, inplace=True)
    
    
def time_n_to_yearmonth(n):
    """Convert a zero-based month counter into a ``YYYYMM`` string.

    Period 0 corresponds to January 1901. ``n`` may be an int or any value
    accepted by ``int()`` (e.g. a string of digits).
    """
    years_elapsed, month_index = divmod(int(n), 12)
    return f"{1901 + years_elapsed}{month_index + 1:02}"


# Combine every per-period output ("<var>_p<N>.csv") into one wide table with
# one row per id and one column per variable/year-month.
intersection_area = None
period_frames = []

for file in tqdm(output_path.glob('*.csv')):
    # Only per-period files are inputs. In particular this skips the combined
    # temperature.csv written at the end of this cell, which lives in the same
    # directory and would otherwise break a re-run.
    if re.fullmatch(r".+_p\d+\.csv", file.name) is None:
        continue

    field = utils.robust_read(file)
    if field.empty:
        continue

    field = field.set_index("id")
    rename_value_column(field, file.name)

    # intersection_area is taken from the first non-empty file only.
    if intersection_area is None:
        intersection_area = field["intersection_area"]
    period_frames.append(field.drop(columns='intersection_area'))

if intersection_area is None:
    raise RuntimeError(f"No non-empty per-period CSV files found in {output_path}")

# One outer join over all frames instead of an outer merge per file (which
# re-copies the growing table on every iteration). sort_index mirrors the key
# ordering of an outer merge. Ids must be unique within each file; concat
# raises if they are not.
df = (
    pd.concat([intersection_area, *period_frames], axis=1, join='outer')
    .sort_index()
    .rename_axis("id")
)

columns = sorted((x for x in df.columns if x != "intersection_area"))
df = df[["intersection_area"] + columns]
df.reset_index().to_csv(output_path / "temperature.csv", index=False)

In [33]:
# Insert a '0' before the last character of every nine-character column name
# (the first column is skipped). With time_n_to_yearmonth zero-padding the
# month, "tmp_YYYYMM" / "stn_YYYYMM" names are ten characters long, so this
# only affects names produced without the padding.
padded_names = {
    col: col[:-1] + '0' + col[-1]
    for col in list(df.columns)[1:]
    if len(col) == 9
}
if padded_names:
    df.rename(columns=padded_names, inplace=True)

In [28]:
# Put intersection_area first, followed by all other columns in
# lexicographic order.
columns = sorted(name for name in df.columns if name != "intersection_area")
df = df[["intersection_area", *columns]]

In [22]:
# Spot check: display the "stn" column for the first period (1901-01).
df['stn_190101']

id
3917                NaN
ABW            3.000000
AFG.1.10_1     8.000000
AFG.1.11_1     8.000000
AFG.1.13_1     8.000000
                 ...   
ZWE__THONGA    5.000000
ZWE__TLOKWA    5.000000
ZWE__TONGA     3.027377
ZWE__VENDA     5.000000
ZWE__ZEZURU    3.160093
Name: stn_190101, Length: 47770, dtype: float64

In [25]:
# Put intersection_area first, then order the remaining columns by the
# integer after their last underscore (the YYYYMM part). The sort is stable,
# so columns sharing a year-month keep their current relative order.
columns = [name for name in df.columns if name != "intersection_area"]
columns.sort(key=lambda name: int(name.rsplit('_', 1)[-1]))
df = df[["intersection_area", *columns]]

In [35]:
# Write the table (with the index restored as a regular column) to
# temperature.csv, overwriting any earlier export.
df.reset_index().to_csv(output_path / "temperature.csv", index=False)

In [34]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0_level_0,intersection_area,stn_190101,stn_190102,stn_190103,stn_190104,stn_190105,stn_190106,stn_190107,stn_190108,stn_190109,...,tmp_202003,tmp_202004,tmp_202005,tmp_202006,tmp_202007,tmp_202008,tmp_202009,tmp_202012,tmp_202012,tmp_202012
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3917,,,,,,,,,,,...,,,,,,,,,,
ABW,2209819.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,3.0,...,28.300001,28.800001,29.200001,29.5,29.4,29.9,29.800001,28.800001,28.0,27.5
AFG.1.10_1,903250300.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,11.374198,15.59284,21.451697,24.978059,27.751024,26.355811,20.25195,14.110807,6.510807,0.675125
AFG.1.11_1,4485928000.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,-0.847299,3.69997,8.547196,12.144378,15.895391,16.253476,10.386953,3.904004,-3.779418,-9.824648
AFG.1.13_1,1989237000.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,-0.44839,3.704092,8.484955,12.110209,15.661711,15.856131,10.627981,4.884626,-1.943497,-7.085602
