# Data cleaning sandbox

This is a quick sandbox to get an unbiased estimate of the effect of different levels of data cleaning on model performance.
It is not intended to be used later on in practice!

Now we can (hopefully) import all the necessary libraries. If an import fails, please install the missing packages.

In [33]:
import earthnet as en
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import join
import statistics as st
import pickle
import glob
from numpy import genfromtxt
from numpy.random import shuffle
import re
import sys
from random import seed


In [34]:
# Make the notebook's starting folder importable, then step one level up so that
# the relative "Data" paths used below resolve from the parent folder.
start_dir = os.getcwd()
sys.path.append(start_dir)
os.chdir(join(start_dir, ".."))
print(os.getcwd())  # Should be top drought_impact_forecasting folder

c:\Users\Oto\Documents\GitHub\drought_impact_forecasting


In [35]:
# Load the baseline score table (CSV) and the pickled sample paths from ./Data.
data_dir = join(os.getcwd(), "Data")
baseline_scores = genfromtxt(join(data_dir, "scores_last_frame.csv"), delimiter=',')
with open(join(data_dir, "last_frame_data_paths.pkl"), 'rb') as f:
    old_train_paths = pickle.load(f)

n_samples = len(old_train_paths)
print(baseline_scores.shape)
print(n_samples)

# Glue together baseline scores and paths: append an extra column (index 5) that
# stores each row's position in the path list, so a row can still be mapped back
# to its path after the score rows are reordered.
path_arr = np.array(old_train_paths)
print(path_arr.shape)
index_column = np.zeros((n_samples, 1))
scores = np.concatenate((baseline_scores, index_column), axis=1)
scores[:, 5] = np.arange(n_samples)
print(scores.shape)

(23904, 5)
23904
(23904,)
(23904, 6)


In [36]:
# Eyeball the glued array: element type, first row, last row and the index column.
print(type(scores[0][4]))
print(scores[:1])
print(scores[-1:])
print(scores[:, 5])

<class 'numpy.float64'>
[[0.22392087 0.36124287 0.37945882 0.25867528 0.29122114 0.        ]]
[[2.13312454e-01 1.04168812e-01 2.60996524e-01 1.70339861e-01
  1.66737208e-01 2.39030000e+04]]
[0.0000e+00 1.0000e+00 2.0000e+00 ... 2.3901e+04 2.3902e+04 2.3903e+04]


In [37]:
# Order the rows by ascending value of column 4 and persist the sorted table.
row_order = np.argsort(scores[:, 4])
sorted_scores = scores[row_order]
np.savetxt("Data/scores_last_frame_sorted.csv", sorted_scores, delimiter=",")

# Show the first row, the last row and the overall shape of the sorted table.
print(sorted_scores[:1])
print(sorted_scores[-1:])
print(sorted_scores.shape)

# List every row whose column-4 score is NaN.
nan_score_mask = np.isnan(sorted_scores[:, 4])
print(sorted_scores[nan_score_mask])

[[1.37477925e-01 5.07578757e-04 2.75217608e-01 1.05062470e-01
  2.00948160e-03 7.69000000e+03]]
[[2.11230905e-01            nan 5.17031017e-01 2.36936472e-01
             nan 2.21650000e+04]]
(23904, 6)
[[2.14623642e-01            nan 4.10139015e-01 2.06978211e-01
             nan 2.08410000e+04]
 [2.29622759e-01            nan 6.05806542e-01 2.50108325e-01
             nan 7.97000000e+02]
 [2.11230905e-01            nan 5.17031017e-01 2.36936472e-01
             nan 2.21650000e+04]]


In [41]:
# Partition the sorted rows by their column-4 score:
#   NaN score          -> nan_samples
#   score <  threshold -> bad_samples
#   score >= threshold -> good_samples
# Comparisons against NaN are False, so NaN rows land only in nan_samples.
threshold = 0.1
score_column = sorted_scores[:, 4]
is_nan = np.isnan(score_column)
is_bad = score_column < threshold
is_good = score_column >= threshold

nan_samples = sorted_scores[is_nan]
print(len(nan_samples))
bad_samples = sorted_scores[is_bad]
print(len(bad_samples))
good_samples = sorted_scores[is_good]
print(len(good_samples))
# Should add up to 23904

3
1046
22855


In [42]:
# Split 'Good' dataset into train/val_1,val_2
# NOTE: `shuffle` is numpy.random.shuffle, which draws from NumPy's global RNG.
# Seeding only the stdlib `random` module (via `seed`) has no effect on it, so
# NumPy's RNG must be seeded as well for the split to be reproducible.
seed(1)            # stdlib RNG; kept so anything relying on it behaves as before
np.random.seed(1)  # NumPy global RNG; this is what makes the shuffle repeatable
shuffle(good_samples)  # shuffles the rows in place
val_2 = good_samples[:1000]
val_1 = good_samples[1000:1500]
train = good_samples[1500:]

# Column 5 of every row is its index into path_arr; use it to look up the paths.
train_data = path_arr[train[:, 5].astype(int)].tolist()
print(len(train_data))
val_1_data = path_arr[val_1[:, 5].astype(int)].tolist()
print(len(val_1_data))
val_2_data = path_arr[val_2[:, 5].astype(int)].tolist()
print(len(val_2_data))

# Paths of the low-score and NaN-score samples, which are not part of the split.
bad_data = path_arr[bad_samples[:, 5].astype(int)].tolist()
nan_data = path_arr[nan_samples[:, 5].astype(int)].tolist()

21355
500
1000


In [45]:
# Write three variants of the dataset under ./Data. All of them share the same
# two validation splits; only the training paths differ:
#   all_data_1 ('Good'): train_data
#   all_data_2 ('OK'):   train_data + bad_data
#   all_data_3 ('Bad'):  train_data + bad_data + nan_data
dataset_variants = (
    ('all_data_1', train_data),
    ('all_data_2', train_data + bad_data),
    ('all_data_3', train_data + bad_data + nan_data),
)
for cur_dir, train_paths in dataset_variants:
    # Create the variant's folder on first use.
    variant_dir = join('Data', cur_dir)
    if not os.path.exists(variant_dir):
        os.mkdir(variant_dir)

    pickles_to_write = (
        ("train_data_paths.pkl", train_paths),
        ("val_1_data_paths.pkl", val_1_data),
        ("val_2_data_paths.pkl", val_2_data),
    )
    for file_name, paths in pickles_to_write:
        with open(join(os.getcwd(), "Data", cur_dir, file_name), "wb") as fp:
            pickle.dump(paths, fp)


In [46]:
# Save the current train/val_1/val_2 path lists ('Good' dataset) under Data/all_data_4.
cur_dir = 'all_data_4'
target_dir = join('Data', cur_dir)
if not os.path.exists(target_dir):
    os.mkdir(target_dir)

pickles_to_write = (
    ("train_data_paths.pkl", train_data),
    ("val_1_data_paths.pkl", val_1_data),
    ("val_2_data_paths.pkl", val_2_data),
)
for file_name, paths in pickles_to_write:
    with open(join(os.getcwd(), "Data", cur_dir, file_name), "wb") as fp:
        pickle.dump(paths, fp)

In [43]:
# Round-trip check: read the train paths pickle back from the folder named by
# cur_dir and confirm what type comes out.
reload_path = os.path.join(os.getcwd(), "Data", cur_dir, "train_data_paths.pkl")
with open(reload_path, 'rb') as f:
    loaded_paths = pickle.load(f)
print(type(loaded_paths))

<class 'list'>
