# Data Aggregation

## I. Import Essential Libraries

In [1]:
# Core packages
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Image packages
import rasterio as rio
import cv2

# Visualization packages
import matplotlib.pyplot as plt

## II. Load Data

In [103]:
def load_data(lat, long):
    """Load the maize and soybean rasters of one tile as float32 arrays.

    The rasters are read from
    ``../data/raw/segmented/{lat}{long}/{crop}_{lat}{long}.tif``.

    Parameters
    ----------
    lat, long : str
        Tile identifiers that are concatenated to form the tile directory
        and the file-name suffix (e.g. ``"10S"`` and ``"60W"``).

    Returns
    -------
    dict
        ``{"maize_data": ndarray, "soy_data": ndarray}`` with every array
        squeezed and cast to ``float32``.

    Notes
    -----
    The deforestation raster (``defor_{lat}{long}.tif``) is deliberately not
    read here: nothing in this function consumes it, and loading it plus
    casting it to float32 only cost I/O time and memory for an array that
    was thrown away.
    """
    tile = "{lat}{long}".format(lat=lat, long=long)

    result = {}
    for crop in ("maize", "soy"):
        path = "../data/raw/segmented/{tile}/{crop}_{tile}.tif".format(
            tile=tile, crop=crop
        )
        with rio.open(path) as src:
            raw = src.read()
        # Drop length-1 axes (presumably the single band axis -- confirm
        # against the input files) so downstream code gets a plain 2-D grid.
        result["{}_data".format(crop)] = np.squeeze(raw).astype('float32')

    return result

## III. Gridding Using OpenCV Inter Area
Reference for `cv2.resize` and the `INTER_AREA` interpolation flag used below:
https://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html#resize

In [104]:
def grid_data(data_dict, grid_size):
    """Resample every array in *data_dict* onto a square grid.

    Each value is passed to ``cv2.resize`` with a target size of
    ``(grid_size, grid_size)`` and ``cv2.INTER_AREA`` interpolation.

    Parameters
    ----------
    data_dict : dict
        Maps a name to an image-like array accepted by ``cv2.resize``.
    grid_size : int
        Number of cells along each side of the output grid.

    Returns
    -------
    dict
        A new dict holding the resized arrays; each key is the input key
        with ``"_grid"`` appended.
    """
    target_size = (grid_size, grid_size)
    return {
        "{}_grid".format(name): cv2.resize(
            raster, dsize=target_size, interpolation=cv2.INTER_AREA
        )
        for name, raster in data_dict.items()
    }

## IV. Write To CSV

In [110]:
def write_csv(grid_dict, grid_size, output_path="../data/processed/aggregate.csv"):
    """Flatten gridded arrays into one table and write it as CSV.

    The table has one record per grid cell, in row-major order. Its index
    (written under the column label ``id``) is ``row * grid_size + col``,
    followed by the ``row`` and ``col`` columns and one column per entry of
    *grid_dict*.

    Parameters
    ----------
    grid_dict : dict
        Maps a column name to an array holding ``grid_size ** 2`` values.
    grid_size : int
        Number of cells along each side of the grid.
    output_path : str, optional
        Destination CSV file. Defaults to the location the notebook has
        always written to, so existing two-argument calls are unaffected.

    Raises
    ------
    ValueError
        If an array in *grid_dict* does not contain exactly
        ``grid_size ** 2`` values.
    """
    n_cells = grid_size ** 2
    # row * grid_size + col enumerates 0..n_cells-1, so the id is the range itself.
    cell_id = np.arange(n_cells)

    df = pd.DataFrame(
        {"row": cell_id // grid_size, "col": cell_id % grid_size},
        index=cell_id,
    )

    for key, val in grid_dict.items():
        flat = np.asarray(val).ravel()
        if flat.size != n_cells:
            raise ValueError(
                "'{}' has {} values, expected {} for a {}x{} grid".format(
                    key, flat.size, n_cells, grid_size, grid_size
                )
            )
        df[key] = flat

    df.to_csv(output_path, index_label="id")

In [106]:
# Side length (in cells) of the square grid; passed to grid_data and write_csv.
GRID_SIZE = 500
# Tile identifiers passed to load_data to build the input file paths.
LAT = "10S"
LONG = "60W"


In [107]:
# Read the rasters of the configured tile into memory.
data_dict = load_data(LAT, LONG)


In [108]:
# Resample every loaded raster to a GRID_SIZE x GRID_SIZE grid.
grid_dict = grid_data(data_dict, GRID_SIZE)


In [111]:
# Flatten the gridded rasters and save them as a single CSV table.
write_csv(grid_dict, GRID_SIZE)
