# Prepare Data for CNN

Prepares data for CNN. 
1. Outputs numpy arrays of DTL values and NTL labels
2. Creates a parameter dictionary (e.g., number of NTL labels)


Some notes on AWS:
1. Use the conda_python3 environment to install geopandas and rasterio (this takes a while).
2. Large minimum bin sizes will take a long time. A single band with a mostly full 16814 minimum bin size takes about 10-12 hours. 
3. A more powerful instance type does not seem to affect runtime.
4. It might be worth monitoring the job while it runs: timeouts or connection issues seem to interrupt the process.

## Setup

In [68]:
### Libraries ###
import os, datetime
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import rasterio
from rasterio.plot import show

from geopandas import GeoDataFrame
from shapely.geometry import Point

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

import logging, os 

### User Defined Libraries ###
import config as cf
import feature_extraction as fe

### Set Seeds ###
# One fixed seed is applied to every random-number source below.
seed_value = 42
# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value.
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so
# setting it here probably only affects child processes, not the running
# kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Seed Python's built-in pseudo-random generator
import random
random.seed(seed_value)
# 3. Seed NumPy's global pseudo-random generator
np.random.seed(seed_value)

### Parameters / Paths ###
# Presumably the name of the binned NTL target column - confirm; it is not
# referenced by the cells visible in this section.
FINAL_TARGET_NAME = 'ntl_bins'
# Local aliases for paths defined in the project config module.
VIIRS_GDF_FILEPATH = cf.VIIRS_GDF_FILEPATH
DTL_DIRECTORY = cf.DTL_DIRECTORY


## Functions

In [58]:
def pd_to_gdp(df, lat_name = 'latitude', lon_name = 'longitude'):
    '''
    Turns the coordinate columns of a pandas dataframe into point
    geometries and returns the result as a geopandas dataframe.

    Input:  df - pandas dataframe
            lat_name - name of latitude variable in df
            lon_name - name of longitude variable in df
    Output: geopandas dataframe (crs EPSG:4326) holding one point per row
            and every column of df except the two coordinate columns
    '''

    # Points are built as (x, y) = (longitude, latitude)
    points = []
    for lon, lat in zip(df[lon_name], df[lat_name]):
        points.append(Point((lon, lat)))

    # drop() returns a new frame, so the caller's dataframe is left untouched
    attributes = df.drop(columns=[lon_name, lat_name])

    return GeoDataFrame(attributes, crs="EPSG:4326", geometry=points)

def normalize(X, max_value = 255.0):
    '''
    Normalizes features by dividing them by the largest possible value,
    so that inputs in [0, max_value] map to [0, 1].

    Input:  X - numpy array of feature (pixel) values
            max_value - divisor; the default of 255.0 suits 8-bit imagery,
                        pass e.g. 65535.0 for 16-bit imagery
    Output: numpy array of X cast to float32 and divided by max_value
    Raises: ValueError if max_value is not strictly positive
    '''
    # A zero or negative divisor would silently produce inf/nan or flip signs
    if max_value <= 0:
        raise ValueError(f"max_value must be positive, got {max_value!r}")

    return X.astype('float32') / max_value

In [59]:
def pre_cnn_data(gdf,
                 sat_suffix,
                 years,
                 out_folder,
                 image_height,
                 image_width):

    '''
    Extracts daytime imagery (DTL) arrays for every year and band group
    and saves them, together with the processed dataframe, for the CNN.

    Input:  gdf - geopandas dataframe handed to fe.map_DTL_NTL
                  (presumably one geometry per observation - confirm)
            sat_suffix - satellite identifier; only 'l7' is implemented,
                         'l8' and 's2' are placeholders
            years - iterable of years to process
            out_folder - folder that receives the output files
                         (it is not created here)
            image_height - image height passed to fe.map_DTL_NTL
            image_width - image width passed to fe.map_DTL_NTL
    Output: the string "Done!"
            Side effects, for each year and band group:
              - prints the shape of the processed dataframe and DTL array
              - writes dep_var.pkl to out_folder
              - writes dtl_{sat_suffix}_b{bands}_rgb_{year}.npy to out_folder
    Raises: NotImplementedError if sat_suffix is 'l8' or 's2'
            ValueError for any other unrecognised sat_suffix
    '''

    # Band groups per satellite; each group is extracted into its own array.
    # None marks satellites whose band definitions still have to be written.
    bands_by_satellite = {
        'l7': [['4', '3', '2'], ['5'], ['6'], ['7']],
        'l8': None,  # TODO
        's2': None,  # TODO
    }

    # Fail before any (slow) extraction starts instead of midway through it
    if sat_suffix not in bands_by_satellite:
        raise ValueError(
            f"Unknown sat_suffix {sat_suffix!r}; "
            f"expected one of {sorted(bands_by_satellite)}"
        )

    bands_list = bands_by_satellite[sat_suffix]
    if bands_list is None:
        raise NotImplementedError(
            f"Band definitions for sat_suffix {sat_suffix!r} are not implemented yet"
        )

    # Loop through bands and years and extract
    for year_i in years:

        # Folder with satellite imagery
        dtl_directory = os.path.join(cf.DROPBOX_DIRECTORY,
                                     'Data',
                                     'Daytime Satellite Imagery',
                                     sat_suffix,
                                     str(year_i))

        # Loop through bands
        for bands_i in bands_list:

            # Forward the requested satellite rather than a hard-coded 'l7'
            DTL, processed_gdf = fe.map_DTL_NTL(gdf,
                                                dtl_directory,
                                                bands = bands_i,
                                                img_height = image_height,
                                                img_width = image_width,
                                                year = year_i,
                                                sat_suffix = sat_suffix)

            print(processed_gdf.shape)
            print(DTL.shape)

            bands_i_name = '_'.join(bands_i)

            # NOTE: the file name is the same on every iteration, so only the
            # dataframe from the last year / band group survives.
            processed_gdf.to_pickle(os.path.join(out_folder, 'dep_var.pkl'))
            np.save(os.path.join(out_folder, f'dtl_{sat_suffix}_b{bands_i_name}_rgb_{year_i}.npy'), DTL)

    return "Done!"

## Params

In [60]:
# Height and width passed to pre_cnn_data for the extracted images.
# NOTE(review): VGG16 needs images to be rescaled to 224x224; no rescaling
# happens in the cells visible here - confirm it is done downstream.
image_height = 48
image_width = 48

## Process - VIIRS

In [63]:
# Load the VIIRS random sample and keep only its first 1000 rows
viirs = pd.read_pickle(
    os.path.join(cf.DROPBOX_DIRECTORY, 'Data', 'VIIRS', 'FinalData',
                 'random_samples', 'viirs_random_sample.pkl')
).head(1000)

In [64]:
# Extract the 2014 'l7' imagery for the VIIRS sample into the VIIRS CNN folder
pre_cnn_data(
    gdf=viirs,
    sat_suffix='l7',
    years=[2014],
    out_folder=os.path.join(cf.GD_CNN_DIRECTORY, 'VIIRS'),
    image_height=image_height,
    image_width=image_width,
)

0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
(1000, 12)
(1000, 48, 48, 3)
0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
(1000, 12)
(1000, 48, 48, 1)
0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
(1000, 12)
(1000, 48, 48, 1)
0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000
(1000, 12)
(1000, 48, 48, 1)


'Done!'

## Process OPM

In [67]:
# GPS_uid_crosswalk.csv
# opm_socioeconomic_geo.csv
opm_df = pd.read_csv(os.path.join(cf.SECURE_DATA_DIRECTORY, 'Data', 'OPM', 'FinalData - PII', 'GPS_uid_crosswalk.csv'))
#opm_df = opm_df[opm_df['latitude'].notnull()]
opm_df = pd_to_gdp(opm_df)
opm_df['geometry'] = opm_df.buffer(distance = 0.75/111.12).envelope

In [66]:
# Extract the 2014 'l7' imagery for the OPM locations into the OPM CNN folder
pre_cnn_data(
    gdf=opm_df,
    sat_suffix='l7',
    years=[2014],
    out_folder=os.path.join(cf.GD_CNN_DIRECTORY, 'OPM'),
    image_height=image_height,
    image_width=image_width,
)

0/5361
100/5361
200/5361
300/5361
400/5361
500/5361
600/5361
700/5361
800/5361
900/5361
1000/5361
1100/5361
1200/5361
1300/5361
1400/5361
1500/5361
1600/5361
1700/5361
1800/5361
1900/5361
2000/5361
2100/5361
2200/5361
2300/5361
2400/5361
2500/5361
2600/5361
2700/5361
2800/5361
2900/5361
3000/5361
3100/5361
3200/5361
3300/5361
3400/5361
3500/5361
3600/5361
3700/5361
3800/5361
3900/5361
4000/5361
4100/5361
4200/5361
4300/5361
4400/5361
4500/5361
4600/5361
4700/5361
4800/5361
4900/5361
5000/5361
5100/5361
5200/5361
5300/5361
(5361, 3)
(5361, 48, 48, 3)
0/5361
100/5361
200/5361
300/5361
400/5361
500/5361
600/5361
700/5361
800/5361
900/5361
1000/5361
1100/5361
1200/5361
1300/5361
1400/5361
1500/5361
1600/5361
1700/5361
1800/5361
1900/5361
2000/5361
2100/5361
2200/5361
2300/5361
2400/5361
2500/5361
2600/5361
2700/5361
2800/5361
2900/5361
3000/5361
3100/5361
3200/5361
3300/5361
3400/5361
3500/5361
3600/5361
3700/5361
3800/5361
3900/5361
4000/5361
4100/5361
4200/5361
4300/5361
4400/5361
4500/5

'Done!'