# Large

In [None]:
# import os
# from repo2data.repo2data import Repo2Data

# # Download Data
# data_req_path = os.path.abspath("../binder/data_requirement.json")
# if os.path.exists(data_req_path):
#     repo2data = Repo2Data(data_req_path)
#     data_path = repo2data.install()

In [None]:
from src.Utils import save_results, get_means
from src.Dataset import dataset_csv
from src.ExistingAlgorithms import area, sklearn_available
from src.GaussianMixture import gaussian_mixture
from src.Parametric_UMAP import load_pumap

from sklearn.decomposition import PCA
import torch
import umap
import numpy as np

# --- Notebook configuration -------------------------------------------------
# Stride handed to dataset_csv and used to subsample embeddings via [::SKIP];
# a value of 1 keeps every row.
SKIP = 1

# Input locations.
PATH_DATA = "../data/data_ottawa"
PATH_INIT_MEANS = "Preprocess/Large/Mean_Clusters"

# Output locations.
PATH_SAVE_LD = "Preprocess/Large/Low_Dimension"
PATH_RESULTS = "Results/Large"
PATH_SAVE_D = "Results/Large/Density"

## Load Data

In [None]:
# Read the train/test arrays from PATH_DATA, forwarding the global stride.
data_train, data_test = dataset_csv(PATH_DATA, SKIP=SKIP)

In [None]:
# Display the shape of `data_test` as this cell's output.
data_test.shape

### AREA

In [None]:
# Compute the AREA low-dimensional representation of the test data.
# NOTE(review): the input is offset by +1 before the call; the reason is not
# visible in this notebook -- confirm against `area`.
X_AREA = area(
    X_high=data_test + 1,
    save_path=PATH_SAVE_LD,
    filtering=True,
    plot_filter=True,
    critical_frequency=0.1,
    threshold=0.01,
)
# Cell output: shape of the (strided) embedding.
X_AREA[::SKIP].shape

In [None]:
# Run `gaussian_mixture` on the AREA embedding with 21 clusters, seeding the
# means from get_means() for this method name, then plot the density.
name_method = 'AREA'
gm = gaussian_mixture(
    X_AREA[::SKIP],
    data_test,
    means_init=get_means(name_method, PATH_INIT_MEANS),
    number_cluster=21,
    cluster_iter=3,
    tol=1e-4,
    plot_sweep=True,
    info_sweep=0,
)

# The density figure is labelled with the method name and written under
# PATH_SAVE_D.
gm.plot_density(
    text=name_method,
    save_path=PATH_SAVE_D,
    plot_gaussians=True,
    bw_adjust=0.03,
)

In [None]:
# Plot the 1D confidence view for the current mixture, then save the results
# for this method under PATH_RESULTS.
gm.plot_confidence_1d(expected_prob=None)
save_results(gm=gm, name_method=name_method, path=PATH_RESULTS)

### Parametric UMAP

In [None]:
# Display the shape of `data_test` as this cell's output.
data_test.shape

In [None]:
# Load the 1D Parametric UMAP checkpoint and move it to the CPU.
# NOTE(review): the module's train/eval mode is left exactly as returned by
# load_pumap -- confirm whether model.eval() is needed (dropout / batch-norm).
model = load_pumap('src/Parametric_UMAP/model 1D/modelLarge.ckpt').to('cpu')

# BATCH is the number of interleaved (strided) chunks the test set is split
# into, not the number of rows per chunk: chunk b holds rows b, b+BATCH, ...
BATCH = 10
X_PUMAP = np.zeros((data_test.shape[0], 1))

# Inference only: disable autograd so no computation graph is built (and kept
# alive) for each chunk. The encoder outputs are unchanged.
with torch.no_grad():
    for b in range(BATCH):
        # Rows are viewed as 200-feature vectors and cast to float32 before
        # being passed through the encoder.
        chunk = torch.from_numpy(data_test[b::BATCH]).view(-1, 200).to(dtype=torch.float)
        # Write the results back at the same strided positions so that
        # X_PUMAP stays row-aligned with data_test.
        X_PUMAP[b::BATCH] = model.encoder(chunk).detach().numpy()

In [None]:
# Run `gaussian_mixture` on the Parametric UMAP embedding with 21 clusters,
# seeding the means from get_means() for this method name, then plot the
# density.
name_method = 'PUMAP 1D'
gm = gaussian_mixture(
    X_PUMAP,
    data_test,
    means_init=get_means(name_method, PATH_INIT_MEANS),
    number_cluster=21,
    cluster_iter=5,
    tol=1e-4,
    plot_sweep=False,
    info_sweep=0,
    latex=False,
)

# The density figure is labelled with the method name and written under
# PATH_SAVE_D.
gm.plot_density(
    text=name_method,
    save_path=PATH_SAVE_D,
    plot_gaussians=True,
    bw_adjust=0.01,
)

In [None]:
# Plot the 1D confidence view for the current mixture, then save the results
# for this method under PATH_RESULTS.
gm.plot_confidence_1d(expected_prob=None)
save_results(
    gm=gm,
    name_method=name_method,
    path=PATH_RESULTS,
)

### PCA

In [None]:
# Earlier direct scikit-learn version, kept for reference:
# pca = PCA(n_components=1).fit(data_train)
# X_PCA = pca.transform(data_test)

# Build the 1-component PCA representation through the shared
# `sklearn_available` helper, with PATH_SAVE_LD as its save location.
X_PCA = sklearn_available(
    function=PCA,
    X_train=data_train,
    X_test=data_test,
    path_save=PATH_SAVE_LD,
    n_components=1,
    random_state=42,
)

In [None]:
# Run `gaussian_mixture` on the PCA embedding with 17 clusters, seeding the
# means from get_means() for this method name, then plot the density.
name_method = 'PCA 1D'
gm = gaussian_mixture(
    X_PCA[::SKIP],
    data_test,
    means_init=get_means(name_method, PATH_INIT_MEANS),
    number_cluster=17,
    cluster_iter=5,
    tol=1e-4,
    plot_sweep=False,
    info_sweep=0,
    latex=False,
)

# The density figure is labelled with the method name and written under
# PATH_SAVE_D.
gm.plot_density(
    text=name_method,
    save_path=PATH_SAVE_D,
    plot_gaussians=True,
    bw_adjust=0.03,
)

In [None]:
# Plot the 1D confidence view for the current mixture, then save the results
# for this method under PATH_RESULTS.
gm.plot_confidence_1d(expected_prob=None)
save_results(
    gm=gm,
    name_method=name_method,
    path=PATH_RESULTS,
)

### UMAP

In [None]:
# Build the 1-component UMAP representation (100 neighbours, fixed seed)
# through the shared `sklearn_available` helper. The save directory and the
# '100.npy' name are passed positionally, as the helper's signature is not
# visible from this notebook.
X_l_UMAP = sklearn_available(
    data_train,
    data_test,
    PATH_SAVE_LD + '/umap1d',
    umap.UMAP,
    '100.npy',
    n_neighbors=100,
    n_components=1,
    random_state=42,
)

In [None]:
# Run `gaussian_mixture` on the UMAP embedding with 20 clusters, seeding the
# means from get_means() for this method name, then plot the density.
name_method = 'UMAP 1D'
gm = gaussian_mixture(
    X_low=X_l_UMAP[::SKIP],
    X_high=data_test,
    means_init=get_means(name_method, path=PATH_INIT_MEANS),
    number_cluster=20,
    cluster_iter=5,
    tol=1e-4,
)

# The density figure is labelled with the method name and written under
# PATH_SAVE_D.
gm.plot_density(
    text=name_method,
    save_path=PATH_SAVE_D,
    plot_gaussians=True,
    bw_adjust=0.03,
)


In [None]:
# Plot the 1D confidence view for the current mixture, then save the results
# for this method under PATH_RESULTS.
gm.plot_confidence_1d(expected_prob=None)
save_results(gm=gm, name_method=name_method, path=PATH_RESULTS)