In [1]:
%pip install hdbscan
%pip install catboost
%pip install lightgbm
%pip install xgboost
%pip install imblearn
%pip install dill

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from typing import Tuple
from Agent.Utils.GeoClustering import GeoClustering
from Agent.Utils.Debug import print_caller_info

def geocluster(df: pd.DataFrame) -> Tuple[GeoClustering, pd.DataFrame]:

    df_geo = pd.DataFrame()

    # Replace commas with decimal points in the 'ACCT_WGS84_X' and 'ACCT_WGS84_Y' columns
    df_geo['ACCT_WGS84_X'] = df['ACCT_WGS84_X'].str.replace(',', '.')
    df_geo['ACCT_WGS84_Y'] = df['ACCT_WGS84_Y'].str.replace(',', '.')

    df_geo = df_geo.fillna(0)

    # Convert the columns to float data type
    df_geo = df_geo.astype({"ACCT_WGS84_X": "float32", "ACCT_WGS84_Y": "float32"})
    
    # Instantiate GeoClustering with the clustered_data
    geo_clustering = GeoClustering(df_geo)

    geo_data = geo_clustering.cluster(chunk_size=5000)

    return geo_clustering, geo_data

def geocluster_test(df: pd.DataFrame, geo_clustering) -> np.array:

    df_geo = pd.DataFrame()

    # Replace commas with decimal points in the 'ACCT_WGS84_X' and 'ACCT_WGS84_Y' columns
    df_geo['ACCT_WGS84_X'] = df['ACCT_WGS84_X'].str.replace(',', '.')
    df_geo['ACCT_WGS84_Y'] = df['ACCT_WGS84_Y'].str.replace(',', '.')

    df_geo = df_geo.fillna(0)

    # Convert the columns to float data type
    df_geo = df_geo.astype({"ACCT_WGS84_X": "float32", "ACCT_WGS84_Y": "float32"})
    
    geo_data = geo_clustering.predict(df_geo)

    return geo_data

In [3]:
from typing import Tuple
from Agent.Utils.Imputation import ImbDataProcessor

def imputation(df: pd.DataFrame) -> Tuple[ImbDataProcessor, pd.DataFrame]:

    processor = ImbDataProcessor(df)
    processor.process()
    processed_data = processor.get_processed_data()

    return processor, processed_data

def imputation_test(df: pd.DataFrame, processor: ImbDataProcessor) -> pd.DataFrame:

    new_data = processor.impute_new_data(df)
    return new_data

In [4]:
def one_hot_encoding(df: pd.DataFrame) -> pd.DataFrame:

    df['cluster_labels'] = df['cluster_labels'].astype('object')
    df['XRHSH'] = df['XRHSH'].astype('object')
    df['PARNO'] = df['PARNO'].astype('object')
    df['CONTRACT_CAPACITY'] = df['CONTRACT_CAPACITY'].astype('object')
    df['ACCT_CONTROL'] = df['ACCT_CONTROL'].astype('object')
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    one_hot_encoded_data = pd.get_dummies(df, columns=categorical_columns)
    return one_hot_encoded_data

In [5]:
def time_series_unpack(df: pd.DataFrame) -> pd.DataFrame:
    
    # Find the maximum length of time_series
    max_length = df['time_series'].apply(len).max()

    # Create new columns and fill them with the values from time_series
    for i in range(max_length):
        column_name = f'measurement_{i}'
        df[column_name] = df['time_series'].apply(lambda x: x[i] if i < len(x) else None)

    return df

In [6]:
import pandas as pd
from typing import Tuple
from Agent.Sensors.Catboost import CatBoostAnomalyDetector
from Agent.Sensors.LightGBM import LightGBMAnomalyDetector
from Agent.Sensors.XGBoost import XGBoostAnomalyDetector
from Agent.Sensors.RandomForest import RandomForestAnomalyDetector
import numpy as np
import pickle

def sensors(X_detectors_train: pd.DataFrame, X_agent_train: pd.DataFrame, y_detectors_train: pd.DataFrame, y_agent_train: pd.DataFrame) -> Tuple[dict, pd.DataFrame]:

    #  (Number of outliers / Total number of data points)
    outlier_fraction = (len(y_detectors_train[y_detectors_train == 1]) / X_detectors_train.shape[0])

    # Class importance weights
    weight_minority = 0.7
    weight_majority = 0.3

    # Calculate class weights for each sample
    sample_weights = np.array([weight_minority if y == 1 else weight_majority for y in y_detectors_train])

    print_caller_info(sample_weights.shape)
    print_caller_info(y_agent_train.shape)

    # Instantiate the detectors
    detectors = {
        "RandomForest": RandomForestAnomalyDetector(max_depth=25, random_state=0, outlier_fraction=outlier_fraction, class_weight={0: weight_majority, 1: weight_minority}, sample_weights=sample_weights),
        "XGBoost": XGBoostAnomalyDetector(outlier_fraction=outlier_fraction, class_weight=sample_weights, sample_weights=sample_weights),
        "CatBoost": CatBoostAnomalyDetector(outlier_fraction=outlier_fraction, class_weight=[weight_majority, weight_minority], sample_weights=sample_weights),
        "LightGBM": LightGBMAnomalyDetector(outlier_fraction=outlier_fraction, class_weight={0: weight_majority, 1: weight_minority}, sample_weights=sample_weights),
    }

    # Train the detectors and store their predict_proba results in a new DataFrame called sensors_df
    sensors_df = pd.DataFrame()
    for name, detector in detectors.items():
        print("Current detector: {}".format(name))
        detector.fit(X_detectors_train, y_detectors_train)
        detector.evaluate(X_agent_train, y_agent_train)
        sensors_df[name] = detector.proba
        print("")

    return detectors, sensors_df

def sensors_test(X_test: pd.DataFrame, detectors: dict) -> pd.DataFrame:

    # Class importance weights
    weight_minority = 0.7
    weight_majority = 0.3

    sensors_test = pd.DataFrame()
    for name, detector in detectors.items():
        print("Current detector: {}".format(name))
        detector.predict_proba(X_test)
        sensors_test[name] = detector.proba
        print("")

    return sensors_test



In [7]:
import numpy as np
from Agent.dqn_binary_classification_memory_optimized import BinaryClassificationEnv, DoubleDQN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.model_selection import train_test_split

def dqn(X_train: pd.DataFrame, y_train: pd.DataFrame) -> DoubleDQN:

    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()

    num_features = X_train.shape[1]

    # Initialize the environment and agent
    env = BinaryClassificationEnv(X_train, y_train)
    agent = DoubleDQN(state_space=num_features, action_space=2,
                learning_rate=0.01, discount_factor=0.3, buffer_size=100000)

    # Set the total number of iterations for tqdm
    total_iterations = 100000

    # Initialize tqdm with the total number of iterations
    for i in range(total_iterations):

        action = np.random.randint(2)
        next_state, reward = env.step(action)
        agent.store_transition(env.data[env.current_state],
                            action, reward, next_state, False)
        
    # Train the agent
    agent.train(batch_size=64, num_episodes=750)

    return agent


def dqn_test(X_test: pd.DataFrame, agent: DoubleDQN) -> dict:

    X_test = X_test.to_numpy()

    # Evaluate the agent on new data
    test_cases = len(X_test)

    q_values = agent.predict(X_test)

    actions = []
    for i in range(test_cases):

        actions.append(np.argmax(q_values[i]))

    return q_values

2023-04-21 10:52:06.491898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
import pickle
from sklearn.model_selection import train_test_split

def custom_function(X_train, y_train, X_test, agent_split_size, random_state):
    
    # Train split
    print("[*] Training Process:")

    print("[-] Entering geocluster")
    geo_clustering, geo_data = geocluster(X_train)
    X_train["cluster_labels"] = geo_data["cluster_labels"]
    print("[-] Exiting geocluster")

    print("[-] Entering preprocessing")
    processor, processed_data = imputation(X_train)
    processed_data = time_series_unpack(processed_data)
    processed_data = processed_data.drop(['time_series', 'ACCT_NBR', 'SUCCESSOR', 'MS_METER_NBR','ACCT_WGS84_X', 'ACCT_WGS84_Y'], axis=1)
    one_hot_encoded_data = one_hot_encoding(processed_data)
    print("[-] Exiting preprocessing")

    X_detectors_train, X_agent_train, y_detectors_train, y_agent_train = train_test_split(
        one_hot_encoded_data, y_train, test_size=agent_split_size, random_state=random_state, stratify=y_train
    )

    print("[-] Entering sensors")
    detectors, sensors_df = sensors(X_detectors_train, X_agent_train, y_detectors_train, y_agent_train)
    print("[-] Exiting sensors")

    X_agent_train = X_agent_train.reset_index(drop=True)
    sensors_df = sensors_df.reset_index(drop=True)
    X_agent_train = pd.concat([X_agent_train, sensors_df], axis=1)

    print("[-] Entering agent")
    agent = dqn(X_agent_train, y_agent_train)
    print("[-] Exiting agent")

    # Test split
    print("[*] Testing Process:")

    print("[-] Entering geocluster")
    geo_test = geocluster_test(X_test, geo_clustering)
    X_test["cluster_labels"] = geo_test
    print("[-] Exiting geocluster")

    print("[-] Entering preprocessing")
    processed_test = imputation_test(X_test, processor)
    processed_test = time_series_unpack(processed_test)
    processed_test = processed_test.drop(['time_series', 'ACCT_NBR', 'SUCCESSOR', 'MS_METER_NBR','ACCT_WGS84_X', 'ACCT_WGS84_Y'], axis=1)
    one_hot_encoded_test = pd.get_dummies(processed_test)
    one_hot_encoded_test = one_hot_encoded_test.reindex(columns=one_hot_encoded_data.columns, fill_value=0)
    print("[-] Exiting preprocessing")

    print("[-] Entering sensors")
    sensors_df_test = sensors_test(one_hot_encoded_test, detectors)
    print("[-] Exiting sensors")

    one_hot_encoded_test = one_hot_encoded_test.reset_index(drop=True)
    sensors_df_test = sensors_df_test.reset_index(drop=True)
    X_test = pd.concat([one_hot_encoded_test, sensors_df_test], axis=1)

    print("[-] Entering agent")
    q_values = dqn_test(X_test, agent)
    print("[-] Exiting agent")

    with open("q_values.pkl", "wb") as f:

        pickle.dump(q_values, f)


In [9]:
import pandas as pd
from Agent.Utils.ModelRunner import ModelRunner

# Load the dataset (replace 'your_dataset.csv' with your actual file)
data = pd.read_pickle('clean_data.pkl')
test = pd.read_pickle('clean_data_test.pkl')

data = data.fillna(0)
X = data.drop(["label"], axis=1)
y = data["label"]

model_runner = ModelRunner(custom_function, X, y, random_states=[80])
model_runner.run(test)

[*] Training Process:
[-] Entering geocluster
[-] Exiting geocluster
[-] Entering preprocessing
[-] Exiting preprocessing
[-] Entering sensors
Variable: (765656,)
File: /tmp/ipykernel_170065/1327681358.py
Line: 22
Class: 
Method: sensors
Variable: (765657,)
File: /tmp/ipykernel_170065/1327681358.py
Line: 23
Class: 
Method: sensors
Current detector: RandomForest

Current detector: XGBoost

Current detector: CatBoost

Current detector: LightGBM

[-] Exiting sensors
[-] Entering agent
[-] Exiting agent
[*] Testing Process:
[-] Entering geocluster
[-] Exiting geocluster
[-] Entering preprocessing
[-] Exiting preprocessing
[-] Entering sensors
Current detector: RandomForest

Current detector: XGBoost

Current detector: CatBoost

Current detector: LightGBM

[-] Exiting sensors
[-] Entering agent
[-] Exiting agent


2023-04-21 11:22:48.577200: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [14]:
q_values_1 = pd.DataFrame(pd.read_pickle("q_values.pkl"))

q_values_1.describe()


Unnamed: 0,0,1
count,149520.0,149520.0
mean,3.1e-05,2.6e-05
std,1.8e-05,1.5e-05
min,2e-06,2e-06
25%,1.6e-05,1.4e-05
50%,2.7e-05,2.4e-05
75%,4.3e-05,3.6e-05
max,0.000152,0.000111
