In [10]:
import pandas as pd 
import rasterio as rio
import numpy as np

import glob

from joblib import Parallel, delayed


import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

from sklearn.manifold import TSNE
import hdbscan
from utils.constants import DEPTS, YEARS
from sklearn.cluster import KMeans, DBSCAN


from scipy.spatial.distance import cdist
from scipy.stats import pearsonr


import shutil
import os

import pickle



In [2]:
def nodata_to_zero(array: np.array, no_data: int) -> np.array:
    array = np.where(array != no_data, array, 0)
    return array


def load_merged_subtile(file_name: str, width: int, height: int) -> np.array:
    # Channels, H, W
    subtile = np.zeros((6, width, height))

    with rio.open(file_name) as src:
        temp_arr = src.read()
        no_data = src.nodata

    if no_data != 0:
        temp_arr = nodata_to_zero(temp_arr, no_data)

    subtile[:, : temp_arr.shape[1], : temp_arr.shape[2]] = temp_arr

    return subtile


def load_pixel(file_name: str) -> np.array:
    with rio.open(file_name) as src:
        temp_arr = src.read()
        no_data = src.nodata

    if no_data != 0:
        temp_arr = nodata_to_zero(temp_arr, no_data)
    
    return temp_arr

def check_zero(number):
    if np.all(number == 0):
        return False  
    return True


def copy_paste_files(paths, dest_folder):
    if os.path.exists(dest_folder):
        shutil.rmtree(dest_folder)
    os.mkdir(dest_folder)

    for path in paths:
        shutil.copy(path, dest_folder) 


def flatten(t):
    return [item for sublist in t for item in sublist]



In [4]:
subDEPTS = ['Aisne']
subYEARS = ['2018']

def get_glob_paths(dept, year):
    return glob.glob(f"../french_dept_data/{dept}/{year}/split*_1/*")

paths = Parallel(n_jobs=8)(delayed(get_glob_paths)(dept, year) for year in subYEARS for dept in subDEPTS)

In [5]:

paths = flatten(paths)
print(len(paths))

20433


In [6]:
non_empty_pixels = Parallel(n_jobs=8)(delayed(load_pixel)(path) for path in paths)

In [7]:
non_zero_paths = []
non_empty_pixels_ = []

for pixel, path in zip(non_empty_pixels, paths):
    if check_zero(pixel):
        non_empty_pixels_.append(pixel)
        non_zero_paths.append(path)

In [8]:
len(non_zero_paths)

11556

In [9]:
pixel_groups = dict()

count = 0
while len(non_zero_paths) > 1:
    count += 1
    pixel_group = []
    pixel_paths_group = []
    pixel_group_indices = []
    pixel_a = non_empty_pixels_[0]
    for index, (pixel, path) in enumerate(zip(non_empty_pixels_, non_zero_paths)):
        if index == 0:
            pixel_group.append(pixel)
            pixel_paths_group.append(path)
            pixel_group_indices.append(index)
        else:
            if pearsonr(pixel_a.reshape(-1), pixel.reshape(-1))[0] > 0.9:
                if cdist(pixel_a.reshape(1,-1), pixel.reshape(1,-1))[0] < 50:
                    pixel_group.append(pixel)
                    pixel_paths_group.append(path)
                    pixel_group_indices.append(index)
            
    non_zero_paths = [j for i, j in enumerate(non_zero_paths) if i not in pixel_group_indices]
    non_empty_pixels_ = [j for i, j in enumerate(non_empty_pixels_) if i not in pixel_group_indices]

    print(len(pixel_paths_group))
    
    pixel_groups[count] = [pixel_group, pixel_paths_group]

    # print(len(non_zero_paths))
    



3273
3804
391
103
34
664
494
705
252
26
215
79
196
86
10
502
51
274
2
21
22
50
10
16
4
8
11
7
9
2
1
7
1
2
1
19
3
2
10
12
14
19
8
31
8
5
3
4
1
2
5
2
4
6
1
11
5
1
5
1
1
2
4
2
1
1
3
1
1
1
1
2
1
1
3
1
1
3
4
1
1
1
1
2


In [9]:
len(pixel_groups.keys())

84

In [11]:
with open('Aisne_2018_groups.pkl', 'wb') as f:
    pickle.dump(pixel_groups, f)

In [12]:
with open('Aisne_2018_groups.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [17]:
loaded_dict[1]

[[array([[[148]],
  
         [[144]],
  
         [[166]],
  
         [[201]],
  
         [[212]],
  
         [[198]]], dtype=uint8),
  array([[[138]],
  
         [[158]],
  
         [[183]],
  
         [[222]],
  
         [[226]],
  
         [[221]]], dtype=uint8),
  array([[[134]],
  
         [[140]],
  
         [[161]],
  
         [[184]],
  
         [[200]],
  
         [[177]]], dtype=uint8),
  array([[[145]],
  
         [[142]],
  
         [[167]],
  
         [[205]],
  
         [[216]],
  
         [[195]]], dtype=uint8),
  array([[[142]],
  
         [[143]],
  
         [[173]],
  
         [[207]],
  
         [[206]],
  
         [[172]]], dtype=uint8),
  array([[[156]],
  
         [[141]],
  
         [[165]],
  
         [[192]],
  
         [[199]],
  
         [[168]]], dtype=uint8),
  array([[[165]],
  
         [[172]],
  
         [[187]],
  
         [[201]],
  
         [[204]],
  
         [[189]]], dtype=uint8),
  array([[[140]],
  
         [[13