In [2]:
import pandas as pd
import numpy as np
import pathlib
import random
from imblearn.under_sampling import RandomUnderSampler

# One seed drives both Python's `random` module and the undersampler so the
# cluster split and the resampling can be repeated.
seed = 42
random.seed(seed)

# A float sampling_strategy is the target minority/majority ratio after
# resampling (see the imbalanced-learn docs); it only applies to binary targets.
rus = RandomUnderSampler(random_state=seed, sampling_strategy=0.05)

# Root of the preprocessed training data; each sub-directory is one cluster.
dataset_path = "./preprocessed/train"

dataset = pathlib.Path(dataset_path)

# Path.iterdir() yields entries in arbitrary (filesystem-dependent) order.
# Sorting gives every run the same starting order, which is required for any
# seeded shuffle of this list to be reproducible.
clusters = sorted(x for x in dataset.iterdir() if x.is_dir())

# Columns that identify a row; they are excluded from the feature matrix.
identifiers = ["site_id", "mlid", "datetime"]

# Target columns; they are excluded from the feature matrix as well.
labels = ["rlf", "1-day-predict", "5-day-predict"]

## Generate for N workers

In [21]:
# Split the cluster directories into `n_divisions` groups, concatenate every
# CSV of a group into one raw X/y pair, then undersample each pair.
n_divisions = 4
output_path = f"./divided/train/{n_divisions}_workers"

output = pathlib.Path(output_path)
output.mkdir(parents=True, exist_ok=True)

# More divisions than clusters would give an empty group size (and a zero
# step in the range() below), so fail early with a readable message.
if not 1 <= n_divisions <= len(clusters):
    raise ValueError(
        f"n_divisions must be between 1 and {len(clusters)} (the number of clusters), got {n_divisions}"
    )

# Shuffle a sorted copy with a dedicated seeded generator: the shared
# `clusters` list is left untouched and re-running this cell always yields
# the same partition.
shuffled_clusters = sorted(clusters)
random.Random(seed).shuffle(shuffled_clusters)

subset_size = len(shuffled_clusters) // n_divisions

# Each group takes `subset_size` consecutive clusters (a slice, so the group
# size follows n_divisions instead of being fixed at two).
cluster_sets = [
    shuffled_clusters[start:start + subset_size]
    for start in range(0, subset_size * n_divisions, subset_size)
]

# Distribute the len(clusters) % n_divisions leftover clusters, one per group.
for i, leftover in enumerate(shuffled_clusters[subset_size * n_divisions:]):
    cluster_sets[i].append(leftover)

# Columns that must never end up in the feature matrix.
excluded_columns = set(labels) | set(identifiers)

print("Combining subset's files")
raw_dir = output / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
for idx, cluster_set in enumerate(cluster_sets, start=1):
    x_raw_path = raw_dir / f"x_train_subset_{idx}.csv"
    y_raw_path = raw_dir / f"y_train_subset_{idx}.csv"

    # `with` guarantees both files are flushed and closed even if a CSV
    # fails to parse half-way through.
    with open(x_raw_path, "wb") as x_subset_file, open(y_raw_path, "wb") as y_subset_file:
        for cluster in cluster_set:
            # Sorted traversal keeps the row order of the output deterministic.
            for folder in sorted(p for p in cluster.iterdir() if p.is_dir()):
                for file in sorted(p for p in folder.iterdir() if p.is_file()):
                    df = pd.read_csv(file)

                    feature_columns = [col for col in df.columns if col not in excluded_columns]
                    np.savetxt(x_subset_file, df[feature_columns].values, delimiter=",", fmt="%5.2f")
                    np.savetxt(y_subset_file, df[["5-day-predict"]].values, delimiter=",", fmt="%d")

print("Undersampling")
under_dir = output / "under"
under_dir.mkdir(parents=True, exist_ok=True)
for idx in range(1, len(cluster_sets) + 1):
    X_train = np.loadtxt(raw_dir / f"x_train_subset_{idx}.csv", delimiter=",")
    y_train = np.loadtxt(raw_dir / f"y_train_subset_{idx}.csv", delimiter=",")

    X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

    # `sample_indices_` holds the original row index of every kept sample;
    # ordering by it restores the row order of the raw file.
    original_order = np.argsort(rus.sample_indices_, kind="stable")
    final_X_array = X_resampled[original_order]
    final_y_array = y_resampled[original_order]

    np.savetxt(under_dir / f"x_train_subset_{idx}.csv", final_X_array, delimiter=",", fmt="%5.2f")
    np.savetxt(under_dir / f"y_train_subset_{idx}.csv", final_y_array, delimiter=",", fmt="%d")

Combining subset's files
Undersampling


## Generate for a single worker

In [4]:
# Concatenate every cluster's CSVs into a single raw X/y pair, then
# undersample that pair.
output_path = "./divided/train/1_workers/"
output = pathlib.Path(output_path)
output.mkdir(parents=True, exist_ok=True)

raw_dir = output / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)

# Feature-column count every input file is checked against; files that
# differ are printed so they can be inspected.
EXPECTED_FEATURE_COUNT = 620

# Columns that must never end up in the feature matrix.
excluded_columns = set(labels) | set(identifiers)

# Merge all files into one X file and one y file. `with` guarantees both
# handles are flushed and closed even if a CSV fails to parse.
with open(raw_dir / "x_train.csv", "wb") as x_file, open(raw_dir / "y_train.csv", "wb") as y_file:
    # Sorted traversal keeps the row order of the output deterministic.
    for cluster in sorted(clusters):
        for folder in sorted(p for p in cluster.iterdir() if p.is_dir()):
            for file in sorted(p for p in folder.iterdir() if p.is_file()):
                df = pd.read_csv(file)

                feature_columns = [col for col in df.columns if col not in excluded_columns]
                feature_values = df[feature_columns].values
                np.savetxt(x_file, feature_values, delimiter=",", fmt="%5.2f")
                np.savetxt(y_file, df[["5-day-predict"]].values, delimiter=",", fmt="%d")
                if feature_values.shape[1] != EXPECTED_FEATURE_COUNT:
                    print(file)

# Undersampling
under_dir = output / "under"
under_dir.mkdir(parents=True, exist_ok=True)
X_train = np.loadtxt(raw_dir / "x_train.csv", delimiter=",")
y_train = np.loadtxt(raw_dir / "y_train.csv", delimiter=",")

X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# `sample_indices_` holds the original row index of every kept sample;
# ordering by it restores the row order of the raw file.
original_order = np.argsort(rus.sample_indices_, kind="stable")
final_X_array = X_resampled[original_order]
final_y_array = y_resampled[original_order]

np.savetxt(under_dir / "x_train.csv", final_X_array, delimiter=",", fmt="%5.2f")
np.savetxt(under_dir / "y_train.csv", final_y_array, delimiter=",", fmt="%d")

preprocessed/train/DENSE_TREE/RL_X;OEL/A1BQ_time_sentitive_features.csv
preprocessed/train/DENSE_TREE/RL_X;OEL/A1WD_time_sentitive_features.csv
preprocessed/train/VERYHIGH-SPARSE_BLOCK_BUILDINGS/RL_SECSF/A8CN_time_sentitive_features.csv
preprocessed/train/OPEN_IN_URBAN/RL_U7MRV/A8DY_time_sentitive_features.csv
preprocessed/train/OPEN_IN_URBAN/RL_[KBEO/A7XT_time_sentitive_features.csv
preprocessed/train/INDUSTRIAL_&_COMMERCIAL/RL_SKDCI/A2AB_time_sentitive_features.csv
preprocessed/train/INDUSTRIAL_&_COMMERCIAL/RL_SKDCI/A2SI_time_sentitive_features.csv
preprocessed/train/AVERAGE-MEDIUM_URBAN/RL_S9LQB/A2BA_time_sentitive_features.csv
preprocessed/train/AVERAGE-MEDIUM_URBAN/RL_S9LQB/A2IS_time_sentitive_features.csv


In [20]:
from sklearn.cluster import AgglomerativeClustering

# Group the sites into 8 clusters from a precomputed pairwise distance matrix.
input_path = "train"
# Named `input_dir` so the builtin `input` is not shadowed.
input_dir = pathlib.Path(input_path)

distances_file = input_dir / "distances.tsv"
rl_sites_file = input_dir / "rl-sites.tsv"

distances_df = pd.read_csv(distances_file, sep="\t", index_col=0)
rl_sites_df = pd.read_csv(rl_sites_file, sep="\t", index_col=0)

# Optional sanity check of the clutter-class distribution:
#print(np.unique(rl_sites_df["clutter_class"], return_counts=True))

rl_stations = rl_sites_df["site_id"].unique()

# Keep only the sites listed in rl-sites.tsv, with rows and columns in the
# same order, so the matrix handed to the clusterer is square.
distances_df = distances_df.loc[rl_stations, rl_stations]

clustering = AgglomerativeClustering(n_clusters=8, metric='precomputed', linkage='complete').fit(distances_df.values)

# Stored as `cluster_labels`, not `labels`: `labels` is the list of target
# column names that the data-generation cells use to filter feature columns,
# and rebinding it here would silently break that filter on a re-run.
cluster_labels = clustering.labels_

print(np.unique(cluster_labels, return_counts=True))


(array(['AVERAGE-DENSE URBAN', 'AVERAGE-MEDIUM URBAN',
       'AVERAGE-SPARSE URBAN', 'BUILTUP-VILLAGE', 'DENSE TREE',
       'GREEN HOUSE', 'HIGH-DENSE URBAN', 'HIGH-ISOLATED-BUILDINGS',
       'HIGH-MEDIUM URBAN', 'HIGH-SPARSE URBAN',
       'INDUSTRIAL & COMMERCIAL', 'INLAND WATER', 'LOW-DENSE URBAN',
       'LOW-MEDIUM URBAN', 'LOW-SPARSE URBAN', 'OPEN IN URBAN',
       'OPEN LAND', 'SPARSE TREE', 'VERYHIGH-DENSE BLOCK BUILDINGS',
       'VERYHIGH-MEDIUM BLOCK BUILDINGS',
       'VERYHIGH-SPARSE BLOCK BUILDINGS'], dtype=object), array([289, 214,  31,   7,  74,   1,  34,   5,  21,  17, 135,   1, 148,
       185,  46, 333,  47,  72,   2,   6,   6]))
