In [1]:
import pandas as pd
import os
import sys
from pandarallel import pandarallel
import numpy as np
import pickle
import utils
import data_processing

In [None]:
# Initialise pandarallel up front; a later cell calls DataFrame.parallel_apply,
# so this cell has to run before that one.
pandarallel.initialize()

In [2]:
# Folder layout: everything lives under ~/cs325b-airquality, with the raw
# inputs grouped beneath its "data" sub-folder.
REPO_NAME = "cs325b-airquality"
DATA_FOLDER = "data"
HOME_FOLDER = os.path.expanduser("~")
REPO_FOLDER = os.path.join(HOME_FOLDER, REPO_NAME)

# Make modules that live in the repository root importable.
sys.path.append(REPO_FOLDER)

EPA_DATA_FOLDER, SENTINEL_FOLDER = (
    os.path.join(REPO_FOLDER, DATA_FOLDER, source_name)
    for source_name in ("epa", "sentinel")
)
SENTINEL_METADATA_FOLDER = os.path.join(SENTINEL_FOLDER, "metadata_2016")

In [None]:
#%%time
# Can take a while if running for the first time
#data_processing.rename_sentinel_files(SENTINEL_FOLDER)
#data_processing.rename_sentinel_files(SENTINEL_METADATA_FOLDER)

In [3]:
%%time
# Read the three per-split EPA master CSVs and stack them into a single frame.
PROCESSED_DATA_FOLDER = os.path.join(REPO_FOLDER, DATA_FOLDER, "processed_data")
train_csv, val_csv, test_csv = (
    os.path.join(PROCESSED_DATA_FOLDER, csv_name)
    for csv_name in (
        "train_sites_master_csv_2016.csv",
        "val_sites_master_csv_2016.csv",
        "test_sites_master_csv_2016.csv",
    )
)
# The first CSV column is used as the index of each frame.
tr_df, val_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv, val_csv, test_csv)
)
epa_df = pd.concat([tr_df, val_df, test_df])


CPU times: user 1.16 s, sys: 188 ms, total: 1.35 s
Wall time: 2.51 s


In [None]:
%%time
# Sentinel dates are cached in 'dates.pickle' in the working directory.
# Building them with data_processing.load_sentinel_dates can take anywhere
# from 10-15 minutes, so that only happens when the cache file is missing;
# otherwise the cached object is loaded. This keeps the cell runnable on a
# fresh checkout without having to uncomment code by hand.
DATES_CACHE = 'dates.pickle'

if os.path.exists(DATES_CACHE):
    # NOTE: pickle.load can execute arbitrary code from the file. Only load a
    # cache that was produced by this notebook.
    with open(DATES_CACHE, 'rb') as handle:
        dates = pickle.load(handle)
else:
    dates = data_processing.load_sentinel_dates(SENTINEL_METADATA_FOLDER)
    with open(DATES_CACHE, 'wb') as handle:
        pickle.dump(dates, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(len(dates))

In [None]:
%%time

# Add placeholder SENTINEL_FILENAME ("") and SENTINEL_INDEX (-1) columns, then
# run data_processing.add_sentinel_info over every row (axis=1) in parallel.
new_df = (
    epa_df
    .assign(SENTINEL_FILENAME="", SENTINEL_INDEX=-1)
    .parallel_apply(
        data_processing.add_sentinel_info,
        axis=1,
        metadata_folder_path=SENTINEL_METADATA_FOLDER,
        sentinel_folder_path=SENTINEL_FOLDER,
        sentinel_dates=dates,
    )
)



In [None]:
# Save master csv created, which includes sentinel values for entries 
# where metadata and npy image dimensions do not match up
#new_df.to_csv(os.path.join(utils.PROCESSED_DATA_FOLDER, "epa_sentinel_withdays.csv"))
#print(new_df)

In [None]:
# Keep just the entries where the metadata and npy image dimensions match up,
# i.e. rows whose SENTINEL_INDEX is no longer the -1 placeholder.
valid = new_df.loc[new_df['SENTINEL_INDEX'].ne(-1)]
# print(valid)
#valid.to_csv(os.path.join(utils.PROCESSED_DATA_FOLDER, "epa_sentinel_valid_withdays.csv"))

In [6]:
# Reload the paired tables from CSV once the repairing code has been run.
PROCESSED_DATA_FOLDER = os.path.join(REPO_FOLDER, DATA_FOLDER, "processed_data")
full_csv = os.path.join(utils.PROCESSED_DATA_FOLDER, "epa_sentinel_withdays.csv")
valid_csv = os.path.join(utils.PROCESSED_DATA_FOLDER, "epa_sentinel_valid_withdays.csv")
full_df = pd.read_csv(full_csv, index_col=0)
valid_df = pd.read_csv(valid_csv, index_col=0)

# "Close enough" entries: the Sentinel image was taken within 4 days of the
# EPA measurement (day difference strictly below 5).
close_enough = valid_df.loc[valid_df['PM Reading/Image day difference'].lt(5)]

In [7]:
# Report how many distinct stations survive each repairing/limiting stage.
unique_stations_old = epa_df['Site ID'].unique()
unique_stations_full = full_df['Site ID'].unique()
unique_stations_valid = valid_df['Site ID'].unique()
unique_stations_close = close_enough['Site ID'].unique()

# One line per stage, emitted with a single print call.
print("\n".join(
    template.format(len(stations))
    for template, stations in (
        ("Number of unique stations with old pairing: {}", unique_stations_old),
        ("Number of unique stations with new pairing: {}", unique_stations_full),
        ("Number of unique stations after removing entries with mismatched measurements: {}", unique_stations_valid),
        ("Number of unique stations after removing entries with date differences > 4: {}", unique_stations_close),
    )
))

Number of unique stations with old pairing: 1152
Number of unique stations with new pairing: 1152
Number of unique stations after removing entries with mismatched measurements: 738
Number of unique stations after removing entries with date differences > 4: 735


In [8]:
# The split is spatial (by site), so each site needs enough readings.
# For now, keep only sites with more than 80 close-enough measurements
# (the filter below is a strict "> 80").
site_measurement_counts = close_enough["Site ID"].value_counts().to_frame(name='counts')
sites_with_sufficient_measurements = site_measurement_counts.loc[
    site_measurement_counts['counts'].gt(80)
]
all_suff_measure_sites = sites_with_sufficient_measurements.index.tolist()
num_sites_suff = len(sites_with_sufficient_measurements)

# Rows of close_enough that belong to one of the retained sites.
df_suff_close = close_enough.loc[close_enough['Site ID'].isin(all_suff_measure_sites)]
print("Using {} sites out of {}".format(num_sites_suff, len(unique_stations_close)))

Using 258 sites out of 735


In [9]:
# Shuffle and split into train/val/test 0.8, 0.1, 0.1
#shuffled_sites = sites_with_sufficient_measurements.sample(frac=1)
#num_train, num_val, num_test = int(0.8*num_sites_suff), int(0.1*num_sites_suff), int(0.1*num_sites_suff)
#train = shuffled_sites[:num_train]
#val = shuffled_sites[num_train:num_train+num_val]
#test = shuffled_sites[num_train+num_val:]

#train_sites = train.index.tolist()
#val_sites = val.index.tolist()
#test_sites = test.index.tolist()

#train_df = df_suff[df_suff['Site ID'].isin(train_sites)]
#val_df = df_suff[df_suff['Site ID'].isin(val_sites)]
#test_df = df_suff[df_suff['Site ID'].isin(test_sites)]

In [10]:
# Get existing train/val/test df splits.
# Note: val_df and test_df rebind names that the EPA-loading cell also assigns.
train_df = pd.read_csv(os.path.join(utils.PROCESSED_DATA_FOLDER, "train_repaired_sufficient_stats_2016.csv"), index_col=0)
val_df = pd.read_csv(os.path.join(utils.PROCESSED_DATA_FOLDER, "val_repaired_sufficient_stats_2016.csv"), index_col=0)
test_df = pd.read_csv(os.path.join(utils.PROCESSED_DATA_FOLDER, "test_repaired_sufficient_stats_2016.csv"), index_col=0)

# Get the list of indices (Site-id dates) of sufficiently close entries
df_suff_close_entries = df_suff_close['Site ID_Date'].tolist()  ## Only 33991 entries qualify as suff. close
train_entries = train_df['Site ID_Date'].tolist()               ## vs   66703 entries in train set alone for comp.


def _close_rows_with_day_difference(split_df, close_df, close_entries, split_name):
    """Restrict one split to its sufficiently close rows and attach the day difference.

    Args:
        split_df: train/val/test frame; must have 'Site ID_Date' and 'POC' columns.
        close_df: frame of sufficiently close entries; must have 'Site ID_Date',
            'POC' and 'PM Reading/Image day difference' columns.
        close_entries: the 'Site ID_Date' values that qualify as close.
        split_name: label used only in the error message.

    Returns:
        (close_rows, with_days): close_rows is the filtered split with its index
        reset and the day-difference column added; with_days is the filtered
        split left-merged with close_df, also with its index reset.

    Raises:
        ValueError: if the merge does not return exactly one row per filtered
            row, in which case copying the day difference by position would
            pair values with the wrong rows.
    """
    day_diff_col = 'PM Reading/Image day difference'
    close_rows = split_df[split_df['Site ID_Date'].isin(close_entries)]

    # Every split uses a left merge: it never drops left rows and keeps them in
    # their original order, so equal row counts imply a one-to-one, in-order
    # correspondence between close_rows and with_days.
    with_days = pd.merge(close_rows, close_df, on=['Site ID_Date', 'POC'], how='left')
    if len(with_days) != len(close_rows):
        raise ValueError(
            "Merging the {} split on ['Site ID_Date', 'POC'] produced {} rows for {} "
            "input rows; duplicate keys would misalign the day-difference column.".format(
                split_name, len(with_days), len(close_rows)))

    close_rows = close_rows.reset_index()
    with_days = with_days.reset_index()
    # Both frames now share the same 0..n-1 index, so this copy is row-for-row.
    close_rows[day_diff_col] = with_days[day_diff_col]
    return close_rows, with_days


# Get only the rows that qualify as sufficiently close, with the day difference added as a column
train_close, train_close_with_days = _close_rows_with_day_difference(
    train_df, df_suff_close, df_suff_close_entries, "train")
val_close, val_close_with_days = _close_rows_with_day_difference(
    val_df, df_suff_close, df_suff_close_entries, "val")
test_close, test_close_with_days = _close_rows_with_day_difference(
    test_df, df_suff_close, df_suff_close_entries, "test")

print("Train-repaired only considering days <5 away now has size: {}".format(len(train_close_with_days))) 
print("Val-repaired only considering days <5 away now has size: {}".format(len(val_close_with_days)))
print("Test-repaired only considering days <5 away now has size: {}".format(len(test_close_with_days)))

# Save to csvs once
# NOTE(review): these lines write to the current working directory, while the
# cloud-removal cell reads the same file names from utils.PROCESSED_DATA_FOLDER;
# confirm the files end up where that cell expects them.
#train_close.to_csv("train_repaired_sufficient_close_stats_2016.csv")
#val_close.to_csv("val_repaired_sufficient_close_stats_2016.csv")
#test_close.to_csv("test_repaired_sufficient_close_stats_2016.csv")


Train-repaired only considering days <5 away now has size: 26304
Val-repaired only considering days <5 away now has size: 4704
Test-repaired only considering days <5 away now has size: 2983


In [15]:
## Cloud removal on new repaired dataset
import cloud_remove

# Input CSVs: the "sufficient close" version of each split.
train_repaired, val_repaired, test_repaired = (
    os.path.join(utils.PROCESSED_DATA_FOLDER, csv_name)
    for csv_name in (
        "train_repaired_sufficient_close_stats_2016.csv",
        "val_repaired_sufficient_close_stats_2016.csv",
        "test_repaired_sufficient_close_stats_2016.csv",
    )
)

In [16]:
# Output CSV paths for the cloud-thresholded version of each split.
train_repaired_cloud_remove, val_repaired_cloud_remove, test_repaired_cloud_remove = (
    os.path.join(utils.PROCESSED_DATA_FOLDER, csv_name)
    for csv_name in (
        "train_repaired_suff_stats_cloud_remove_2016.csv",
        "val_repaired_suff_stats_cloud_remove_2016.csv",
        "test_repaired_suff_stats_cloud_remove_2016.csv",
    )
)

# Thresholding calls, currently disabled; each takes an input CSV path, an
# output CSV path and threshold=2000.
#cloud_remove.remove_sent_over_threshold(train_repaired, train_repaired_cloud_remove, threshold=2000)  
#cloud_remove.remove_sent_over_threshold(val_repaired, val_repaired_cloud_remove, threshold=2000)  
#cloud_remove.remove_sent_over_threshold(test_repaired, test_repaired_cloud_remove, threshold=2000)  




New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 2 workers


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13152), Label(value='0 / 13152')))…

Thresholded /home/sarahciresi/cs325b-airquality/data/processed_data/train_repaired_sufficient_close_stats_2016.csv file at 2000 + used Decision Tree. 
Initial df was 26304 rows. Thresholded df is 12998 rows.
