# segment data

In [9]:
import rasterio as rio
import numpy as np
from PIL import Image

In [60]:
def load_seg_data(filename, years):
    paths = []
    year = years[0]
    # get paths from filename + list of years covered
    for yr in years:
        path = filename.replace(str(year), str(yr))
        paths.append(path)
    print(paths)   
    datasets = []
    # read datasets from paths
    for path in paths:
        with rio.open(path) as src:
            data = src.read()
            data = np.squeeze(data).astype("float32")
            datasets.append(data)
    return datasets, src

def calc_bounds(data, src, target_coords, length = 10): 
    lat = target_coords[0]
    long = target_coords[1]
    
    lat_index = np.arange(0, data.shape[0])
    long_index = np.arange(0, data.shape[1])

    A = src.transform
    
    ymin, xmin = rio.transform.rowcol(A, long, lat - length)
    ymax, xmax = rio.transform.rowcol(A, long + length, lat)
    
    return xmin, xmax, ymin, ymax

def get_coord_string(coords):
    if coords[0] < 0:
        lat = str(coords[0]*-1) + "S"
    else:
        lat = str(coords[0]) + "N"
    if coords[1] < 0:
        long = str(coords[1]*-1) + "W"
    else:
        long = str(coords[1]) + "E"
    coord_string = lat + long
    
    return coord_string

def segment(filename, years, target_coords, name=""):
    data, src = load_seg_data(filename, years)
    xmin, xmax, ymin, ymax = calc_bounds(data[0], src, target_coords)
    
    for i in range(len(data)):
        seg_data = data[i][ymax:ymin, xmin:xmax]
        seg_img = Image.fromarray(seg_data)
        seg_img.save("../data/raw/segmented/{coords}/{name}_{yr}_{coords}.tif".format(coords = get_coord_string(target_coords), name = name, yr = years[i]))
        print("saved", years[i], "data") 

In [61]:
years = (2000, 2005, 2010, 2015)
filename = "../data/raw/population/gpw_v4_population_count_rev11_2000_30_sec.tif"

segment(filename, years, (-10, -60), name = "pop")

['../data/raw/population/gpw_v4_population_count_rev11_2000_30_sec.tif', '../data/raw/population/gpw_v4_population_count_rev11_2005_30_sec.tif', '../data/raw/population/gpw_v4_population_count_rev11_2010_30_sec.tif', '../data/raw/population/gpw_v4_population_count_rev11_2015_30_sec.tif']
saved 2000 data
saved 2005 data
saved 2010 data
saved 2015 data


# aggregate data

In [62]:
# core packages
import pandas as pd
from scipy.sparse import csr_matrix

# image packages
import rasterio as rio
import cv2

# visualization packages
import matplotlib.pyplot as plt

In [63]:
def load_agg_data(filename, years):
    paths = []
    colnames = []
    year = years[0]
    for yr in years:
        path = filename.replace(str(year), str(yr))
        paths.append(path)
        print(path)
        colnames.append("chirps_{yr}".format(yr = yr))
    print(colnames)
    datasets = []
    for path in paths:
        with rio.open(path) as src:
            data = src.read()
            data = np.squeeze(data).astype("float32")
            datasets.append(data)
    return datasets, colnames

def grid_data(filename, years, grid_size, smaller = False):
    dataset, colnames = load_agg_data(filename, years)
    grids = []
    
    # grid data
    for data in dataset:
        if smaller:
            grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_LINEAR)
            grids.append(grid)
        else:
            grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)
            grids.append(grid)
    
    # define dataframe and set row and cols
    row = np.arange(grid_size ** 2) // grid_size
    col = np.arange(grid_size ** 2) % grid_size
    
    df = pd.DataFrame(index = row * grid_size + col)
    df["row"] = row
    df["col"] = col
    
    print(len(grids))
    for i in range(len(grids)):
        df[colnames[i]] = grid.flatten()
    return df
    
def aggregate(filename, years, agg_data_path, grid_size, smaller):
    dataset = grid_data(filename, years, grid_size, smaller)
    print("complete")
    return dataset
    #merge_data(agg_data_path, dataset)

In [64]:
GRID_SIZE = 500
LAT, LONG = ("10S", "60W")
AGG_DATA_PATH = "../data/processed/agg_copies/aggregate.csv"

years = (2000, 2005, 2010, 2015)
filename = "../data/raw/segmented/10S60W/pop_2000_10S60W.tif"
aggregate(filename, years, AGG_DATA_PATH, GRID_SIZE, False)

../data/raw/segmented/10S60W/pop_2000_10S60W.tif
../data/raw/segmented/10S60W/pop_2005_10S60W.tif
../data/raw/segmented/10S60W/pop_2010_10S60W.tif
../data/raw/segmented/10S60W/pop_2015_10S60W.tif
['chirps_2000', 'chirps_2005', 'chirps_2010', 'chirps_2015']
4
complete


Unnamed: 0,row,col,chirps_2000,chirps_2005,chirps_2010,chirps_2015
0,0,0,0.073029,0.073029,0.073029,0.073029
1,0,1,0.073124,0.073124,0.073124,0.073124
2,0,2,0.073223,0.073223,0.073223,0.073223
3,0,3,0.073334,0.073334,0.073334,0.073334
4,0,4,0.082774,0.082774,0.082774,0.082774
...,...,...,...,...,...,...
249995,499,495,1.941345,1.941345,1.941345,1.941345
249996,499,496,2.125511,2.125511,2.125511,2.125511
249997,499,497,2.509143,2.509143,2.509143,2.509143
249998,499,498,2.128663,2.128663,2.128663,2.128663


In [54]:
# paths to segmented data (ADD YOUR PATH HERE)

# maize_data_path = "../data/raw/segmented/{lat}{long}/maize_{lat}{long}.tif".format(lat = LAT, long = LONG)
# paths.append(maize_data_path)
# colnames.append("maize")

# soy_data_path = "../data/raw/segmented/{lat}{long}/soy_{lat}{long}.tif".format(lat = LAT, long = LONG)
# paths.append(soy_data_path)
# colnames.append("soy")

# chirp_yrs = np.arange(2001, 2019)
# for yr in chirp_yrs: 
#     chirps_data_path = "../data/raw/segmented/{lat}{long}/chirps_{yr}_{lat}{long}.tif".format(yr = yr, lat = LAT, long = LONG)
#     paths.append(chirps_data_path)
#     colnames.append("chirps_{yr}".format(yr = yr))
    
# pop_yrs = [2000, 2005, 2010, 2015]
# for yr in pop_yrs:
#     pop_data_path = "../data/raw/segmented/{lat}{long}/pop_count_{yr}_{lat}{long}.tif".format(yr = yr, lat = LAT, long = LONG)
#     paths.append(pop_data_path)
#     colnames.append("pop_{yr}".format(yr = yr))

In [8]:
#aggregate(paths, "../data/processed/aggregate.csv", GRID_SIZE, True, colnames)

def merge_data(data, aggregate_path, path = False, name = "aggregate"):

    agg = pd.read_csv("../data/processed/aggregate.csv")
    merged_data = pd.merge(agg, data)

    merged_data.to_csv("../data/processed/{name}.csv".format(name = name), index = False)
    print(read_csv("../data/processed/{name}.csv".format(name = name)))