# Data Aggregation

## I. Import Essential Libraries

In [2]:
# core packages
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# image packages
import rasterio as rio
import cv2

# visualization packages
import matplotlib.pyplot as plt

#https://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html#resize

In [32]:
# dictionary code, plus a function to aggregate the deforestation data
def load_data(path_dict):
    data_dict = dict()
    for key, value in path_dict.items():
        with rio.open(value) as src:
            data = src.read()
            data = np.squeeze(data).astype("float32")
            data_dict[key] = data
            plt.imshow(data)
            plt.show()
    
    # split defor data by years and add to result
    for year in np.unique(data_dict["defor"]):
        if year == 0:
            continue
        print("Splitting year {}".format(int(2000 + year)))
        # split years
        data_dict["defor_{}".format(int(2000 + year))] = csr_matrix((data_dict["defor"] == year).astype(int))
    
    data_dict.pop("defor")
    return data_dict
def grid_data(data_dict, grid_size, smaller=False):
    result = dict()
    for key, val in data_dict.items():
        if (key[0:5] == "defor"):
            #grid deforestation data
            grid = cv2.resize(np.array(val.todense()).astype("float32"), dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)
        else:
            # grid non-deforestation data
            if smaller:
                grid = cv2.resize(val, dsize=(grid_size, grid_size), interpolation)
            else:
                grid = cv2.resize(val, dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)
                
        result["{k}_grid".format(k = key)] = grid
    return result
def write_csv(grid_dict, grid_size):
    # get the rows and cols
    row = np.arange(grid_size ** 2) // grid_size
    col = np.arange(grid_size ** 2) % grid_size
    
    # define dataframe and set row and cols
    df = pd.DataFrame(index = row * grid_size + col)
    df["row"] = row
    df["col"] = col

    # add data to dataframe
    for key, val in grid_dict.items():
        df[key] = val.flatten()
        
    # write dataframe to csv
    output_path = "../data/processed/aggregate.csv"
    df.to_csv(output_path, index_label = "id") 
def grid_defor_data():
    grid_size = 500

    defor = []
    defor_grid = []
    colnames = []

    defor_img_path = "../data/raw/segmented/{lat}{long}/defor_{lat}{long}.tif".format(lat = LAT, long = LONG)

    with rio.open(defor_img_path) as src:
        defor_data = src.read()
        defor_data = np.squeeze(data).astype("float32")

    for year in np.unique(defor_data):
        if year == 0:
            continue
        print("Splitting year {}".format(int(2000 + year)))
        # split years
        defor.append(csr_matrix((data_dict["defor"] == year).astype(int)))
        colnames.append("defor_{yr}".format(int(2000 + year)))

    for val in defor:    
        grid = cv2.resize(np.array(val.todense()).astype("float32"), dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)
        defor_grid.append(grid)

    row = np.arange(grid_size ** 2) // grid_size
    col = np.arange(grid_size ** 2) % grid_size

    defor_df = pd.DataFrame(index = row * grid_size + col)
    defor_df["row"] = row
    defor_df["col"] = col

    for i in range(len(grids)):
        defor_df[colnames[i]] = grid.flatten()
    
    return defor_df 

In [78]:
def make_paths(filename, years):
    paths = []
    year = years[0]
    s = filename.replace(str(year), "{year}")
    for yr in years:
        path = s.format(year = yr)
        paths.append(path)
    return paths

def load_data(paths):
    datasets = []
    for path in paths:
        with rio.open(path) as src:
            data = src.read()
            data = np.squeeze(data).astype("float32")
            datasets.append(data)
    return datasets

def grid_data(paths, grid_size, colnames, smaller = False):
    dataset = load_data(paths)
    
    grids = []
    
    # grid data
    for data in dataset:
        print(dataset)
        if smaller:
            grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_LINEAR)
        else:
            grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)
        grids.append(grid)
    
    # define dataframe and set row and cols
    row = np.arange(grid_size ** 2) // grid_size
    col = np.arange(grid_size ** 2) % grid_size
    
    df = pd.DataFrame(index = row * grid_size + col)
    df["row"] = row
    df["col"] = col
    
    print(len(grids))
    for i in range(len(grids)):
        df[colnames[i]] = grid.flatten()
    return df
    
def aggregate(paths, agg_data_path, grid_size, smaller, names):
    dataset = grid_data(paths, grid_size, smaller)
    print("complete")
    return dataset
    #merge_data(agg_data_path, dataset)

In [79]:
curr = grid_data(paths = ["../data/raw/segmented/10S60W/chirps_2001_10S60W.tif", "../data/raw/segmented/10S60W/chirps_2002_10S60W.tif"], grid_size = 500, colnames = ["chirps_2001", "chirps_2002"])
pab = grid_data(["../data/raw/segmented/10S60W/chirps-v2.0.2001_segmented.tif", "../data/raw/segmented/10S60W/chirps_2001_10S60W.tif", "../data/raw/segmented/10S60W/chirps_2002_10S60W.tif"], 500, ["chirps_pablo", "chirps_mine", "chirps_2002"])

[array([[2224.9453, 2213.5957, 2227.5608, ..., 1784.4402, 1723.7512,
        1712.4493],
       [2227.0645, 2256.6042, 2246.2063, ..., 1778.2191, 1771.9319,
        1734.282 ],
       [2230.7144, 2266.6091, 2308.1462, ..., 1774.7363, 1759.7183,
        1731.4569],
       ...,
       [ 861.0691,  897.737 ,  911.3354, ..., 1134.9917, 1134.5109,
        1121.02  ],
       [ 877.6898,  896.6368,  921.1529, ..., 1127.404 , 1124.0513,
        1110.252 ],
       [ 870.9298,  900.1557,  907.9926, ..., 1121.4924, 1122.8226,
        1118.6578]], dtype=float32), array([[1959.0697 , 1934.5134 , 1919.1494 , ..., 1661.246  , 1688.5756 ,
        1664.6273 ],
       [1919.6699 , 1927.8126 , 1899.8302 , ..., 1618.6786 , 1681.2275 ,
        1665.1589 ],
       [1929.1438 , 1897.0417 , 1877.9425 , ..., 1633.1919 , 1674.0199 ,
        1671.622  ],
       ...,
       [ 616.16327,  625.4192 ,  617.0574 , ..., 1197.5308 , 1210.5409 ,
        1228.7294 ],
       [ 619.23175,  614.3745 ,  618.47504, ..., 1182.

In [68]:
pab

Unnamed: 0,row,col,chirps_pablo,chirps_mine,chirps_2002
0,0,0,1959.069702,1959.069702,1959.069702
1,0,1,1959.069702,1959.069702,1959.069702
2,0,2,1946.791504,1946.791504,1946.791504
3,0,3,1934.513428,1934.513428,1934.513428
4,0,4,1934.513428,1934.513428,1934.513428
...,...,...,...,...,...
249995,499,495,1138.543823,1138.543823,1138.543823
249996,499,496,1138.543823,1138.543823,1138.543823
249997,499,497,1143.049805,1143.049805,1143.049805
249998,499,498,1147.555664,1147.555664,1147.555664


In [65]:
agg = pd.read_csv("../data/processed/aggregate.csv")                                                                                                                                                                                        
agg["chirps_2001_grid"]

0         2256.6042
1         2256.6042
2         2251.4053
3         2246.2063
4         2246.2063
            ...    
249995    1100.9149
249996    1100.9149
249997    1111.6480
249998    1122.3810
249999    1122.3810
Name: chirps_2001_grid, Length: 250000, dtype: float64

## II. set paths for desired dataset 
comment out code for data that you're not including, and add paths for a new dataset 

In [4]:
GRID_SIZE = 500
LAT, LONG = ("10S", "60W")
AGG_DATA_PATH = "../data/processed/agg_copies/aggregate.csv"

paths = []
colnames = []

# paths to segmented data (ADD YOUR PATH HERE)

# maize_data_path = "../data/raw/segmented/{lat}{long}/maize_{lat}{long}.tif".format(lat = LAT, long = LONG)
# paths.append(maize_data_path)
# colnames.append("maize")

# soy_data_path = "../data/raw/segmented/{lat}{long}/soy_{lat}{long}.tif".format(lat = LAT, long = LONG)
# paths.append(soy_data_path)
# colnames.append("soy")

# chirp_yrs = np.arange(2001, 2019)
# for yr in chirp_yrs: 
#     chirps_data_path = "../data/raw/segmented/{lat}{long}/chirps_{yr}_{lat}{long}.tif".format(yr = yr, lat = LAT, long = LONG)
#     paths.append(chirps_data_path)
#     colnames.append("chirps_{yr}".format(yr = yr))
    
pop_yrs = [2000, 2005, 2010, 2015]
for yr in pop_yrs:
    pop_data_path = "../data/raw/segmented/{lat}{long}/pop_count_{yr}_{lat}{long}.tif".format(yr = yr, lat = LAT, long = LONG)
    paths.append(pop_data_path)
    colnames.append("pop_{yr}".format(yr = yr))

In [26]:
aggregate(paths, "../data/processed/aggregate.csv", GRID_SIZE, True, colnames)

complete


In [31]:
data = pd.read_csv("../data/processed/agg.csv")
data2 = pd.read_csv("../data/processed/aggregate2.csv")

def merge_data(data, aggregate_path, path = False, name = "aggregate"):

    agg = pd.read_csv("../data/processed/aggregate.csv")
    merged_data = pd.merge(agg, data)

    merged_data.to_csv("../data/processed/{name}.csv".format(name = name), index = False)
    print(read_csv("../data/processed/{name}.csv".format(name = name)))

    

In [32]:
list(data)

['id',
 'row',
 'col',
 'maize_grid',
 'soy_grid',
 'defor_2001_grid',
 'defor_2002_grid',
 'defor_2003_grid',
 'defor_2004_grid',
 'defor_2005_grid',
 'defor_2006_grid',
 'defor_2007_grid',
 'defor_2008_grid',
 'defor_2009_grid',
 'defor_2010_grid',
 'defor_2011_grid',
 'defor_2012_grid',
 'defor_2013_grid',
 'defor_2014_grid',
 'defor_2015_grid',
 'defor_2016_grid',
 'defor_2017_grid',
 'defor_2018_grid',
 'chirps_2001_grid',
 'chirps_2002_grid',
 'chirps_2003_grid',
 'chirps_2004_grid',
 'chirps_2005_grid',
 'chirps_2006_grid',
 'chirps_2007_grid',
 'chirps_2008_grid',
 'chirps_2009_grid',
 'chirps_2010_grid',
 'chirps_2011_grid',
 'chirps_2012_grid',
 'chirps_2013_grid',
 'chirps_2014_grid',
 'chirps_2015_grid',
 'chirps_2016_grid',
 'chirps_2017_grid',
 'chirps_2018_grid',
 'pop_2000',
 'pop_2005',
 'pop_2010',
 'pop_2015']

In [33]:
list(data)
data.drop(data.iloc[:, 3:23], inplace = True, axis = 1)
list(data)


['id',
 'row',
 'col',
 'chirps_2001_grid',
 'chirps_2002_grid',
 'chirps_2003_grid',
 'chirps_2004_grid',
 'chirps_2005_grid',
 'chirps_2006_grid',
 'chirps_2007_grid',
 'chirps_2008_grid',
 'chirps_2009_grid',
 'chirps_2010_grid',
 'chirps_2011_grid',
 'chirps_2012_grid',
 'chirps_2013_grid',
 'chirps_2014_grid',
 'chirps_2015_grid',
 'chirps_2016_grid',
 'chirps_2017_grid',
 'chirps_2018_grid',
 'pop_2000',
 'pop_2005',
 'pop_2010',
 'pop_2015']

In [36]:
m = pd.merge(data2, data)
m.to_csv("../data/processed/aggregate.csv", index = False)

In [51]:
pop = agg_data[['id','pop_2000', 'pop_2005', 'pop_2010', 'pop_2015']]

In [1]:
m = pd.merge(agg_2, pop)
m.to_csv("../data/agg.csv", index = False)

NameError: name 'pd' is not defined