# Data Clustering
Tools for clustering workflow and order data.

In [None]:
# SUPPRESS WARNING

import warnings
from numba.core.errors import NumbaDeprecationWarning

# Silence the two warning categories that would otherwise clutter the
# notebook output. Order is kept: FutureWarning first, then numba's one.
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', NumbaDeprecationWarning)

### Import data

In [None]:
from src.data.data_parser import WorkflowsImport, OrdersImport

# IMPORT DATA
# Both datasets are loaded here; CLUSTERING_DATA is later set to one of them
# depending on CLUSTERING_OBJECTIVE.
# NOTE(review): presumably each call reads a CSV file whose location is
# configured inside src.data.data_parser — confirm there.
WORKFLOWS = WorkflowsImport.IMPORT_FROM_CSV()
ORDERS = OrdersImport.IMPORT_FROM_CSV()

### Provide clustering configuration

In [None]:
from src.clustering.clustering_evaluation import ClusteringMetrics
from src.clustering.clustering_pre_processing import ClusteringPreProcessing
from src.helpers.dimensionality_reducer import DimensionalityReducer
from src.clustering.clustering_methods import ClusteringMethod
from src.clustering.clustering import ClusteringObjective
from src.helpers.feature_encoder import WORKFLOW_FEATURES, ORDER_FEATURES


# DEFINE TYPE OF OBJECTS THAT ARE GOING TO BE CLUSTERED
# This value decides which of the WORKFLOWS_* / ORDER(S)_* settings are used.
CLUSTERING_OBJECTIVE = ClusteringObjective.WORKFLOWS

# DEFINE FEATURES DISPLAYED IN ANALYSIS
# Workflow features shown when the objective is WORKFLOWS.
WORKFLOWS_DISPLAYABLE_DETAILS = [
    WORKFLOW_FEATURES.CPU,
    WORKFLOW_FEATURES.MEMORY,
    WORKFLOW_FEATURES.EPHEMERAL_STORAGE,
    WORKFLOW_FEATURES.STORAGE,
    WORKFLOW_FEATURES.PROCESSED_SIZE,
    WORKFLOW_FEATURES.DURATION,
    WORKFLOW_FEATURES.STEPS_NO,
    WORKFLOW_FEATURES.PRIORITY,
]

# Order features shown for any other objective.
ORDERS_DISPLAYABLE_DETAILS = [
    ORDER_FEATURES.CPU,
    ORDER_FEATURES.MEMORY,
    ORDER_FEATURES.EPHEMERAL_STORAGE,
    ORDER_FEATURES.STORAGE,
    ORDER_FEATURES.PROCESSED_SIZE,
    ORDER_FEATURES.DURATION,
    ORDER_FEATURES.WORKFLOW_NO,
    ORDER_FEATURES.ORDER_STATUS_CODE,
    ORDER_FEATURES.ORDER_NAME_CODE,
]

# DEFINE CLUSTERING PARAMETERS
# NOTE(review): CLUSTERING_PARAMS is handed to clustering.run() as-is;
# presumably [9] is the cluster count for K-Means — confirm in
# src.clustering.clustering_methods.
CLUSTERING_PARAMS = [9]
CLUSTERING_METHOD = ClusteringMethod.K_MEANS

# SPECIFY PRE-PROCESSING OPERATIONS
# The reducer is configured with an empty parameter tuple.
DIMENSIONALITY_REDUCTION = DimensionalityReducer.PCA
REDUCTION_PARAMS = ()
PRE_PROCESSING_OPERATIONS = [
    ClusteringPreProcessing.ONLY_DB_RECORDS,
    ClusteringPreProcessing.MERGE_STATUSES,
    ClusteringPreProcessing.FILTER_TEST_WORKFLOWS,
    ClusteringPreProcessing.FILTER_OUT_DOWNLOAD_WORKFLOWS,
]

# DEFINE FEATURES USED IN CLUSTERING
# Workflow features used when the objective is WORKFLOWS.
WORKFLOWS_CLUSTERING_DETAILS = [
    WORKFLOW_FEATURES.DURATION,
    WORKFLOW_FEATURES.MEMORY,
    WORKFLOW_FEATURES.EPHEMERAL_STORAGE,
    WORKFLOW_FEATURES.STORAGE,
    WORKFLOW_FEATURES.PROCESSED_SIZE,
    WORKFLOW_FEATURES.EXECUTED_STEPS_NO,
    WORKFLOW_FEATURES.ARGO_STATUS_CODE,
    WORKFLOW_FEATURES.ARGO_OUTPUT_MSG_CODE,
    WORKFLOW_FEATURES.PROCESSOR_TYPE_CODE,
]

# Order features used for any other objective.
ORDER_CLUSTERING_DETAILS = [
    ORDER_FEATURES.DURATION,
    ORDER_FEATURES.MEMORY,
    ORDER_FEATURES.WORKFLOW_NO,
    ORDER_FEATURES.EPHEMERAL_STORAGE,
    ORDER_FEATURES.STORAGE,
    ORDER_FEATURES.PROCESSED_SIZE,
    ORDER_FEATURES.ORDER_STATUS_CODE,
]

# DEFINE EVALUATION METRICS
# Metrics handed to the Clustering constructor, one per line.
VALIDATION_METRICS = [
    ClusteringMetrics.SILHOUETTE,
    ClusteringMetrics.CALINSKI,
    ClusteringMetrics.DAVIES,
]

# SPECIFY IF CLUSTERING TEST SHOULD BE RUN
# Forwarded to clustering.run() as `test_params`.
TEST_PARAMETERS = False

# SPECIFY IF RESULTS SHOULD BE SAVED
# Forwarded to clustering.run() as `save_data`.
SAVE_RESULTS = True

# SPECIFY NAME OF THE FILE UNDER WHICH THE RESULTS ARE TO BE STORED
# Plain literal: the name has no placeholders, so no f-string is needed.
CLUSTERING_NAME = 'K-Means without download'

# Select the feature lists and the dataset matching the configured objective:
# workflow settings for WORKFLOWS, order settings for anything else.
if CLUSTERING_OBJECTIVE == ClusteringObjective.WORKFLOWS:
    CLUSTERING_DETAILS = WORKFLOWS_CLUSTERING_DETAILS
    DISPLAYABLE_DETAILS = WORKFLOWS_DISPLAYABLE_DETAILS
    CLUSTERING_DATA = WORKFLOWS
else:
    CLUSTERING_DETAILS = ORDER_CLUSTERING_DETAILS
    DISPLAYABLE_DETAILS = ORDERS_DISPLAYABLE_DETAILS
    CLUSTERING_DATA = ORDERS

### Run data clustering

In [None]:
# CLUSTERING SECTION
from src.clustering.clustering import Clustering

# Build the clustering job from the configuration defined in the cells above.
# Arguments are positional, so their order must stay exactly as listed.
clustering = Clustering(
    CLUSTERING_NAME,
    CLUSTERING_DATA,
    CLUSTERING_DETAILS,
    DISPLAYABLE_DETAILS,
    CLUSTERING_METHOD,
    VALIDATION_METRICS,
    DIMENSIONALITY_REDUCTION,
    CLUSTERING_OBJECTIVE,
    PRE_PROCESSING_OPERATIONS,
)

# Execute it with the configured method/reduction parameters and run flags.
clustering.run(
    CLUSTERING_PARAMS,
    REDUCTION_PARAMS,
    test_params=TEST_PARAMETERS,
    save_data=SAVE_RESULTS,
)