In [1]:
import pandas as pd
import numpy as np
import pathlib
from imblearn.under_sampling import RandomUnderSampler
import gc
import psutil

# --- Reproducibility -------------------------------------------------------
# Single seed handed to the undersampler below.
seed = 42

# Random undersampler used when writing the "*_undersampled" CSVs.
# Per the imbalanced-learn docs, a float `sampling_strategy` is the desired
# minority/majority ratio after resampling (binary targets only).
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=seed)

# --- Column roles ----------------------------------------------------------
# Both groups are excluded when the feature columns are selected.
identifiers = ["site_id", "mlid", "datetime"]
labels = ["rlf", "1-day-predict", "5-day-predict"]

# --- Locations -------------------------------------------------------------
datasets_path = "./preprocessed_test/"
output_path = "./divided_test_dataset"

# Input root: its sub-directories are iterated as individual datasets.
datasets = pathlib.Path(datasets_path)

# Output root: created up front (parents included); an existing directory is reused.
output = pathlib.Path(output_path)
output.mkdir(exist_ok=True, parents=True)

In [2]:

# Buffers are flushed to disk early once free RAM drops below this share of total RAM.
MIN_FREE_MEMORY_FRACTION = 0.1


def _write_raw_and_pool(x_chunks, y_chunks, raw_sinks, x_pool, y_pool):
    """Append the buffered chunks to the raw CSVs, then move them to the undersampling pool.

    Parameters
    ----------
    x_chunks, y_chunks : list of ndarray
        Feature / label arrays read since the last call. Emptied in place.
    raw_sinks : list of (x_handle, y_handle)
        Open binary file handles; every chunk is appended to each pair.
    x_pool, y_pool : list of ndarray
        Receive the chunks once they have been written, so they can still be
        undersampled later without being written raw a second time.

    Does nothing (and prints nothing) when no chunk is buffered.
    """
    if not x_chunks:
        return

    print("Saving")
    # Writing chunk by chunk produces the same file content as writing one
    # concatenated array, without building that temporary copy.
    for x_part, y_part in zip(x_chunks, y_chunks):
        for x_sink, y_sink in raw_sinks:
            np.savetxt(x_sink, x_part, delimiter=",", fmt="%5.2f")
            np.savetxt(y_sink, y_part, delimiter=",", fmt="%d")

    x_pool.extend(x_chunks)
    y_pool.extend(y_chunks)
    x_chunks.clear()
    y_chunks.clear()


def _write_undersampled(x_pool, y_pool, x_sink, y_sink, message):
    """Undersample everything in the pool with `rus` and append the result to the sinks.

    Parameters
    ----------
    x_pool, y_pool : list of ndarray
        Chunks already written to the raw CSVs. Emptied in place.
    x_sink, y_sink : binary file handles
        Destination of the undersampled features / labels.
    message : str
        Progress line printed before resampling.

    Does nothing when the pool is empty, so a flush that happens right before
    the end of the run (or a run that found no CSV at all) no longer crashes.
    Exceptions raised by `rus.fit_resample` are propagated unchanged.
    """
    if not x_pool:
        return

    print(message)
    # One concatenation per flush instead of one per file read.
    features = np.concatenate(x_pool, axis=0)
    targets = np.concatenate(y_pool, axis=0).astype('int')
    x_pool.clear()
    y_pool.clear()

    features, targets = rus.fit_resample(features, targets)
    np.savetxt(x_sink, features, delimiter=",", fmt="%5.2f")
    np.savetxt(y_sink, targets, delimiter=",", fmt="%d")

    del features
    del targets
    gc.collect()


# Chunks read from disk but not yet written to the raw CSVs.
unsaved_X_chunks = []
unsaved_y_chunks = []
# Chunks already written raw, waiting to be undersampled together.
pooled_X_chunks = []
pooled_y_chunks = []

# `with` guarantees every handle is closed, including when a read or a
# resampling step raises half-way through.
with open(output/"x_test.csv", "wb") as x_test_file, \
        open(output/"y_test.csv", "wb") as y_test_file, \
        open(output/"x_test_undersampled.csv", "wb") as x_test_under_file, \
        open(output/"y_test_undersampled.csv", "wb") as y_test_under_file:

    for idx, dataset in enumerate(sorted(x for x in datasets.iterdir() if x.is_dir())):
        print(idx, dataset)

        # Per-case files live exactly as long as their case is being processed.
        with open(output/f"x_test_case{idx+1}.csv", "wb") as x_test_local_file, \
                open(output/f"y_test_case{idx+1}.csv", "wb") as y_test_local_file:

            raw_sinks = [
                (x_test_file, y_test_file),
                (x_test_local_file, y_test_local_file),
            ]

            # Inner listings are sorted like the outer one: iterdir() order is
            # arbitrary, and the seeded undersampler depends on row order.
            for cluster in sorted(x for x in dataset.iterdir() if x.is_dir()):
                for folder in sorted(x for x in cluster.iterdir() if x.is_dir()):
                    for file in sorted(x for x in folder.iterdir() if x.is_file()):
                        df = pd.read_csv(file)

                        # Everything that is neither a label nor an identifier is a feature.
                        # (Name spelling kept as-is in case other cells reference it.)
                        time_sentitive_features = [feature for feature in df.columns if feature not in labels and feature not in identifiers]

                        unsaved_X_chunks.append(df[time_sentitive_features].values)
                        unsaved_y_chunks.append(df[["5-day-predict"]].values)

                # Checked once per cluster: when RAM runs low, push everything
                # buffered so far to disk and release it.
                memory = psutil.virtual_memory()
                if memory.available < memory.total * MIN_FREE_MEMORY_FRACTION:
                    _write_raw_and_pool(unsaved_X_chunks, unsaved_y_chunks, raw_sinks,
                                        pooled_X_chunks, pooled_y_chunks)
                    _write_undersampled(pooled_X_chunks, pooled_y_chunks,
                                        x_test_under_file, y_test_under_file,
                                        "Undersampling")

            # End of the case: write its remaining rows while its own files are
            # still open, so each x_test_case*/y_test_case* file holds only that case.
            _write_raw_and_pool(unsaved_X_chunks, unsaved_y_chunks, raw_sinks,
                                pooled_X_chunks, pooled_y_chunks)

    # Rows from all cases that were not flushed early are undersampled together.
    _write_undersampled(pooled_X_chunks, pooled_y_chunks,
                        x_test_under_file, y_test_under_file,
                        "Undersampling here")

0 preprocessed_test/case1
Saving
Undersampling here
