In [1]:
import pandas as pd
import numpy as np
import pathlib
import random
from imblearn.under_sampling import RandomUnderSampler
from sklearn.cluster import AgglomerativeClustering

# --- Reproducibility ---------------------------------------------------------
seed = 42
random.seed(seed)

# Shared under-sampler reused by every cell below. With a float
# `sampling_strategy`, imbalanced-learn targets that minority/majority ratio
# after resampling.
rus = RandomUnderSampler(sampling_strategy=0.05, random_state=seed)

# --- Column groups -----------------------------------------------------------
# Columns excluded from the feature matrix: row identifiers and target labels.
identifiers = ["site_id", "mlid", "datetime"]
labels = ["rlf", "1-day-predict", "5-day-predict"]

# --- Locations ---------------------------------------------------------------
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because it is a notebook-level variable other cells may read.
input_path = "train"
input = pathlib.Path(input_path)
distances_file = input / "distances.tsv"
rl_sites_file = input / "rl-sites.tsv"

# Root of the preprocessed data; the cells below expect one sub-directory per
# station underneath it.
dataset_path = "./preprocessed/train"
dataset = pathlib.Path(dataset_path)

# --- Load station metadata ---------------------------------------------------
distances_df = pd.read_csv(distances_file, sep="\t", index_col=0)
rl_sites_df = pd.read_csv(rl_sites_file, sep="\t", index_col=0)

# Unique station ids, in order of first appearance in rl-sites.tsv.
rl_stations = rl_sites_df["site_id"].unique()

# Restrict the distance table to those stations, with rows and columns in the
# same order as `rl_stations`, so positional indices line up with station ids.
distances_df = distances_df.loc[rl_stations, rl_stations]

## Generate for N workers

In [2]:
# For every worker count: group stations into one cluster per worker, merge
# each cluster's preprocessed files into a raw X/y CSV pair, then write a
# randomly under-sampled copy of each pair.
for n_divisions in [2, 4, 8]:
    print(f"Creating subset with {n_divisions} workers")
    output_path = f"./divided/train/{n_divisions}_workers"

    output = pathlib.Path(output_path)
    # Both output folders are needed once per worker count, not once per subset.
    (output/"raw").mkdir(parents=True, exist_ok=True)
    (output/"under").mkdir(parents=True, exist_ok=True)

    # Define station grouping based on hierarchical clustering (HC) over the
    # precomputed station-to-station distances.
    # One cluster per worker: with a fixed n_clusters=8 only the first
    # `n_divisions` clusters were ever read below, so the stations of the
    # remaining clusters were silently left out for 2 and 4 workers.
    clustering = AgglomerativeClustering(
        n_clusters=n_divisions, metric='precomputed', linkage='complete'
    ).fit(distances_df.values)

    cluster_labels = clustering.labels_

    # cluster label -> station ids; rows of distances_df follow rl_stations,
    # so label i belongs to rl_stations[i].
    clusters = {}
    for station, label in zip(rl_stations, cluster_labels):
        clusters.setdefault(int(label), []).append(station)

    print("Combining subset's files")
    for idx in range(n_divisions):
        x_raw_path = output/f"raw/x_train_subset_{idx+1}.csv"
        y_raw_path = output/f"raw/y_train_subset_{idx+1}.csv"

        # Context managers guarantee the files are closed even if a read fails.
        with open(x_raw_path, "wb") as x_subset_file, open(y_raw_path, "wb") as y_subset_file:
            for rl_station in clusters[idx]:
                station_dir = dataset/rl_station
                if not station_dir.exists():
                    continue

                # iterdir() yields entries in arbitrary order; sorting keeps
                # the row order (and thus the seeded under-sampling) stable
                # across runs and machines.
                for file in sorted(p for p in station_dir.iterdir() if p.is_file()):
                    df = pd.read_csv(file)

                    time_sensitive_features = [
                        feature for feature in df.columns
                        if feature not in labels and feature not in identifiers
                    ]
                    np.savetxt(x_subset_file, df[time_sensitive_features].values, delimiter=",", fmt="%5.2f")
                    np.savetxt(y_subset_file, df[["5-day-predict"]].values, delimiter=",", fmt="%d")

    print("Undersampling")
    for idx in range(n_divisions):
        # ndmin=2 keeps X two-dimensional even for a one-column or one-row file.
        X_train = np.loadtxt(output/f"raw/x_train_subset_{idx+1}.csv", delimiter=",", ndmin=2)
        y_train = np.loadtxt(output/f"raw/y_train_subset_{idx+1}.csv", delimiter=",")

        X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

        # Put the kept rows back into their original order by sorting on the
        # indices the sampler selected.
        order = np.argsort(rus.sample_indices_, kind="stable")
        final_X_array = X_resampled[order]
        final_y_array = y_resampled[order]

        np.savetxt(output/f"under/x_train_subset_{idx+1}.csv", final_X_array, delimiter=",", fmt="%5.2f")
        np.savetxt(output/f"under/y_train_subset_{idx+1}.csv", final_y_array, delimiter=",", fmt="%d")

Creating subset with 2 workers
Combining subset's files
Undersampling
Creating subset with 4 workers
Combining subset's files
Undersampling
Creating subset with 8 workers
Combining subset's files
Undersampling


## Generate for a single worker

In [3]:
# Single-worker variant: merge every station's preprocessed files into one raw
# X/y CSV pair, then write a randomly under-sampled copy.
output_path = "./divided/train/1_workers/"
output = pathlib.Path(output_path)
(output/"raw").mkdir(parents=True, exist_ok=True)
(output/"under").mkdir(parents=True, exist_ok=True)

# Feature count the sanity check below compares each file against.
expected_n_features = 620

# iterdir() order is arbitrary; sorting makes the merged row order (and thus
# the seeded under-sampling) reproducible.
rl_ids = sorted(x for x in dataset.iterdir() if x.is_dir())

# Merge all station files into a single X/y pair. Context managers guarantee
# the output files are closed even if reading one of the inputs fails.
with open(output/"raw/x_train.csv", "wb") as x_file, open(output/"raw/y_train.csv", "wb") as y_file:
    for rl in rl_ids:
        for file in sorted(x for x in rl.iterdir() if x.is_file()):
            df = pd.read_csv(file)

            time_sensitive_features = [
                feature for feature in df.columns
                if feature not in labels and feature not in identifiers
            ]
            features = df[time_sensitive_features]
            np.savetxt(x_file, features.values, delimiter=",", fmt="%5.2f")
            np.savetxt(y_file, df[["5-day-predict"]].values, delimiter=",", fmt="%d")

            # Report files whose feature count differs from the expected one;
            # such rows make the merged CSV ragged for np.loadtxt below.
            if features.shape[1] != expected_n_features:
                print(file)
                print(list(features.columns))

# Undersampling
# ndmin=2 keeps X two-dimensional even for a one-column or one-row file.
X_train = np.loadtxt(output/"raw/x_train.csv", delimiter=",", ndmin=2)
y_train = np.loadtxt(output/"raw/y_train.csv", delimiter=",")

X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Put the kept rows back into their original order by sorting on the indices
# the sampler selected.
order = np.argsort(rus.sample_indices_, kind="stable")
final_X_array = X_resampled[order]
final_y_array = y_resampled[order]

np.savetxt(output/"under/x_train.csv", final_X_array, delimiter=",", fmt="%5.2f")
np.savetxt(output/"under/y_train.csv", final_y_array, delimiter=",", fmt="%d")

In [None]:
# Under-sample the cross-validation split with the shared sampler `rus`.
# NOTE(review): this cell has no execution count, and x_cv.csv / y_cv.csv are
# not written by any cell visible in this notebook -- confirm where they come
# from. It also relies on `output` left over from an earlier cell and
# overwrites X_train / y_train with the CV data.
(output/"under").mkdir(parents=True, exist_ok=True)
X_train = np.loadtxt(output/"under/x_cv.csv", delimiter=",")
y_train = np.loadtxt(output/"under/y_cv.csv", delimiter=",")

final_X_array, final_y_array = rus.fit_resample(X_train, y_train)

# Positions of the resampled rows, ordered by the original row index each one
# was drawn from, so the kept rows return to their original relative order.
order = sorted(
    range(len(rus.sample_indices_)),
    key=lambda pos: rus.sample_indices_[pos],
)
final_X_array = [final_X_array[pos] for pos in order]
final_y_array = [final_y_array[pos] for pos in order]

np.savetxt(output/"under/x_cv_under.csv", final_X_array, delimiter=",", fmt="%5.2f")
np.savetxt(output/"under/y_cv_under.csv", final_y_array, delimiter=",", fmt="%d")