In [None]:
%load_ext cuml.accel

# Raise cuML's logger to debug verbosity for the rest of the session.
# NOTE(review): presumably enabled so the cuml.accel extension loaded above
# reports what it is doing for each estimator call -- confirm this is still wanted.
from cuml.common import logger

logger.set_level(logger.level_enum.debug)

In [None]:
from google.colab import drive
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import accuracy_score, classification_report
import h5py
from scipy.stats import randint, uniform
# Mount Google Drive inside the Colab VM so files under MyDrive are reachable.
drive.mount("/content/drive")

# Location of the HDF5 dataset on the mounted drive.
file_path = "/content/drive/MyDrive/dataset_ts_light_version.hdf5"

In [None]:
# Pull every split out of the HDF5 file; the `[:]` slice materialises each
# dataset as an in-memory array, so nothing below needs the file to stay open.
with h5py.File(file_path, "r") as f:
  X_train, y_train = f["x_train"][:], f["y_train"][:]
  X_val, y_val = f["x_validation"][:], f["y_validation"][:]
  X_test, y_test = f["x_test"][:], f["y_test"][:]

# Append the validation split to the training split (samples stacked along
# axis 0); the standalone validation arrays are dropped right after.
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

del X_val, y_val

In [None]:
def preprocess(data):
  """Flatten every sample and standardise it with its own statistics.

  Args:
    data: array whose first axis indexes samples; all remaining axes are
      collapsed into a single feature axis.

  Returns:
    A 2-D array with one row per sample, where each row has had its own mean
    subtracted and has been divided by its own standard deviation plus 1e-8.
  """
  n_samples = data.shape[0]
  flat = data.reshape(n_samples, -1)

  # Statistics are taken per row (per sample), not per feature column.
  centered = flat - flat.mean(axis=1, keepdims=True)
  # The small constant keeps the division finite for constant samples.
  scale = flat.std(axis=1, keepdims=True) + 1e-8

  return centered / scale

# Flatten and standardise both splits with the same per-sample transform.
X_train, X_test = preprocess(X_train), preprocess(X_test)

# Shuffle features and labels together with a fixed seed so the row order is
# the same on every run.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [None]:
# (low, high) pairs handed to scipy's randint, which draws integers from
# [low, high): the second value of each pair is itself never sampled.
n_estimators_range = (100, 180)
max_depth_range = (10, 18)
min_samples_leaf_range = (1, 8)
# Fraction of the training rows that the search below may use at most.
percent_of_data = 0.01

param_dist = dict(
  n_estimators=randint(*n_estimators_range),
  max_depth=randint(*max_depth_range),
  min_samples_leaf=randint(*min_samples_leaf_range),
)

# Seed the forest as well as the search: the search's random_state only fixes
# which candidates and subsamples are drawn, not the randomness inside each
# forest, so an unseeded estimator makes scores and the final model vary
# between otherwise identical runs.
# NOTE(review): with cuml.accel active, confirm the GPU backend is fully
# deterministic for a fixed seed.
rf = RandomForestClassifier(random_state=42)

# Successive-halving random search. The sample budget is capped at
# `percent_of_data` of the training rows to keep the search affordable.
rand_search = HalvingRandomSearchCV(
  rf,
  param_distributions=param_dist,
  max_resources=round(X_train.shape[0] * percent_of_data),
  verbose=10,
  random_state=42,
)

rand_search.fit(X_train, y_train)

# Release the training data; only the fitted search object is needed below.
# Note that this makes the cell non-re-runnable without reloading the data.
del rf, X_train, y_train

# Score the best configuration found by the search on the held-out test split.
y_pred = rand_search.best_estimator_.predict(X_test)

# Test features are no longer needed once predictions exist.
del X_test

accuracy = accuracy_score(y_test, y_pred)

# Positions in the test split where prediction and label disagree.
wrong_images_index = [
  idx for idx in range(y_test.shape[0]) if y_pred[idx] != y_test[idx]
]

del y_test, y_pred

# Use a dedicated name for the log file: rebinding `file_path` here would
# overwrite the dataset path, so re-running the data-loading cell afterwards
# would try to open the text log as an HDF5 file.
run_info_path = "/content/drive/MyDrive/run_info.txt"

# Summary of this run: the search space, test accuracy, the chosen
# hyperparameters and the indices of the misclassified test samples.
run_summary = (
  "\nNew run:\nParameter distribution:\n"
  f"n_estimators_range: {n_estimators_range}, max_depth_range: {max_depth_range}, "
  f"min_samples_leaf_range: {min_samples_leaf_range}, percent_of_data: {percent_of_data}\n"
  f"Accuracy: {accuracy}\n"
  f"Best hyperparameters: {rand_search.best_params_}\n"
  f"Incorrectly classified images: {wrong_images_index}\n"
  f"Number of incorrectly classified images: {len(wrong_images_index)}"
)

# Append so that earlier runs recorded in the same file are kept.
with open(run_info_path, "a") as f:
  f.write(run_summary)