In [1]:
import pandas as pd
import numpy as np
import pathlib
import random
from imblearn.under_sampling import RandomUnderSampler
import gc
import psutil

# --- Configuration ----------------------------------------------------------
seed = 42
random.seed(seed)
# Float sampling_strategy: requested minority/majority ratio after resampling
# (see the imbalanced-learn RandomUnderSampler documentation).
rus = RandomUnderSampler(random_state=seed, sampling_strategy=0.05)

# Number of subsets ("workers") the cluster directories are split into.
n_divisions = 8
output_path = f"./divided_train_dataset_new_sample/{n_divisions}_workers"
dataset_path = "./preprocessed/"


output = pathlib.Path(output_path)
output.mkdir(parents=True, exist_ok=True)

dataset = pathlib.Path(dataset_path)

# --- Partition cluster directories into n_divisions subsets -------------------
# Path.iterdir() yields entries in arbitrary, platform-dependent order, so sort
# first; otherwise the seeded shuffle below is not reproducible across machines.
clusters = sorted(x for x in dataset.iterdir() if x.is_dir())

random.shuffle(clusters)

subset_size = len(clusters) // n_divisions
if subset_size == 0:
    # Previously this surfaced as an opaque "range() arg 3 must not be zero".
    raise ValueError(
        f"Found {len(clusters)} cluster directories in {dataset_path!r}, "
        f"fewer than n_divisions={n_divisions}; cannot build the subsets."
    )

# Each subset takes `subset_size` consecutive clusters from the shuffled list.
# (The previous version always took exactly two, which dropped clusters when
# subset_size > 2 and overlapped / overran the list when subset_size == 1.)
cluster_sets = [
    clusters[start:start + subset_size]
    for start in range(0, subset_size * n_divisions, subset_size)
]

# Hand the leftover clusters (len(clusters) % n_divisions of them) to the first
# subsets, one each.
for i in range(len(clusters) % n_divisions):
    cluster_sets[i].append(clusters[subset_size * n_divisions + i])

# Columns excluded from the feature matrix: row identifiers ...
identifiers = ["site_id", "mlid", "datetime"]

# ... and label columns.
labels = ["rlf", "1-day-predict", "5-day-predict"]

In [2]:
# Running count of clusters processed. Not read anywhere in this cell; kept in
# case another cell relies on it -- TODO confirm and remove if unused.
path_idx = 0

# Flush the in-memory buffers to disk once less than this fraction of total RAM
# is still available.
MIN_FREE_MEMORY_FRACTION = 0.1


def _undersample_and_append(x_chunks, y_chunks, x_file, y_file):
    """Undersample the buffered rows with the global `rus` and append them as CSV.

    Parameters
    ----------
    x_chunks, y_chunks : list of numpy.ndarray
        Per-file feature / target arrays collected so far. Both lists are
        emptied in place so the caller can keep appending to them.
    x_file, y_file : binary file objects
        Open output files the resampled rows are appended to.

    Does nothing when the buffers are empty (for example right after a
    low-memory flush, or when a cluster set contained no files); the previous
    code crashed in that situation by calling ``.astype`` on ``None``.
    """
    if not x_chunks:
        return

    print("Undersampling")
    # Concatenate once per flush instead of once per file to avoid repeatedly
    # copying the ever-growing array.
    features = np.concatenate(x_chunks, axis=0)
    target = np.concatenate(y_chunks, axis=0).astype('int')
    # Release the per-file arrays before resampling allocates its own copies.
    x_chunks.clear()
    y_chunks.clear()

    features, target = rus.fit_resample(features, target)

    print("Saving")
    np.savetxt(x_file, features, delimiter=",", fmt="%5.2f")
    np.savetxt(y_file, target, delimiter=",", fmt="%d")
    del features
    del target
    gc.collect()


for idx, cluster_set in enumerate(cluster_sets):
    x_chunks = []
    y_chunks = []
    # `with` guarantees both files are closed even if reading/resampling fails.
    with open(output/f"x_train_subset_{idx+1}.csv", "wb") as x_subset_file, \
            open(output/f"y_train_subset_{idx+1}.csv", "wb") as y_subset_file:
        print("Combining subset's files")
        for cluster in cluster_set:
            for folder in [x for x in cluster.iterdir() if x.is_dir()]:
                for file in [x for x in folder.iterdir() if x.is_file()]:
                    df = pd.read_csv(file)

                    # Features = every column that is neither a label nor an identifier.
                    time_sensitive_features = [
                        feature for feature in df.columns
                        if feature not in labels and feature not in identifiers
                    ]

                    x_chunks.append(df[time_sensitive_features].values)
                    y_chunks.append(df[["5-day-predict"]].values)

            # Checked once per cluster: if RAM is running low, write out what has
            # been buffered so far. NOTE: rows flushed here are undersampled
            # independently of the rows flushed later for the same subset.
            memory = psutil.virtual_memory()
            if memory.available < memory.total * MIN_FREE_MEMORY_FRACTION:
                _undersample_and_append(x_chunks, y_chunks, x_subset_file, y_subset_file)

            path_idx += 1

        # Final flush for whatever is still buffered for this subset.
        _undersample_and_append(x_chunks, y_chunks, x_subset_file, y_subset_file)

Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
Combining subset's files
Undersampling
Saving
