# This file is to generate the proximity matricies; Data Agnostic
- Helpful to just change a few lines of code where to import the data from. 

In [39]:
# Load up data

#& Imports
import sys
sys.path.insert(0, '/yunity/arusty/PF-GAP')
import os

#Import libraries, MDS from SKlearn, and all the custom files
import pandas as pd
from Pipeline.functions import *

#Functions
from QGAP.qgap import QGAP
from Redcomets.Redcomets import REDCOMETS
from RFGAP_Rocket.RFGAP_Rocket import RFGAP_Rocket
from RDST.rdst import RDST_GAP

#& Data Loading
time_series = pd.read_csv("/yunity/arusty/PF-GAP/data/ftse_100_close_prices.csv", index_col=0)
static =  None
labels = np.array(pd.read_csv("/yunity/arusty/PF-GAP/data/ftse_100_sectors.csv")).flatten()
data_dir = "../../data/ftse_100/results/"


In [None]:
time_series.shape, labels.shape, static.shape if static is not None else "No static data"

((100, 1007), (100,), 'No static data')

# Generating the proximities

In [None]:
print("Beggining QGAP...")
qgap = QGAP(matrix_type="dense", interval_depth = 8, quantile_divisor = 8)
quant_prox = data_to_proximities(qgap, time_series, labels, static)
np.save(os.path.join(data_dir, "quant_prox.npy"), quant_prox)
print("---- QGAP Finished")
print("---- OOB Score: ", qgap._estimator.oob_score_)

Beggining QGAP...
---- QGAP Finished
---- OOB Score:  0.19


In [None]:
print("Beggining Redcomets...")
if static is None:
    redcomets = REDCOMETS(variant = 3, perc_length = 0.7, n_trees = 100) 
else:
    redcomets = REDCOMETS(variant = 3, perc_length = 0.7, n_trees = 100, static_data=static)
redcomets_prox = data_to_proximities(redcomets, time_series, labels)
np.save(os.path.join(data_dir, "redcomets_prox.npy"), redcomets_prox)
print("---- Redcomets Finished")
print("---- OOB Score: ", redcomets.get_ensemble_oob_score())

Beggining Redcomets...
---- Redcomets Finished
---- OOB Score:  0.910604166893765


In [None]:
print("Beggining RFGAP-Rockets...")
rf_rocket = RFGAP_Rocket(prediction_type = "classification", rocket = "Multi",
                         n_kernels=256) # Rocket Kwargs
rocket_prox = data_to_proximities(rf_rocket, time_series, labels, static)
np.save(os.path.join(data_dir, "rocket_prox.npy"), rocket_prox)
print("---- RFGAP-Rockets Finished")
print("---- OOB Score: ", rf_rocket.rf_gap.oob_score_)

Beggining RFGAP-Rockets...
---- RFGAP-Rockets Finished
---- OOB Score:  0.16


In [None]:
print("Beggining RDST...")
rdst = RDST_GAP(save_transformed_data = True, max_shapelets = 10000, 
                shapelet_lengths = None, alpha_similarity = 0.3)
rdst_prox = data_to_proximities(rdst, time_series, labels, static)

np.save(os.path.join(data_dir, "rdst_prox.npy"), rdst_prox)
print("---- RDST Finished")
print("---- OOB Score: ", rdst._estimator.oob_score_)


Beggining RDST...
---- RDST Finished
---- OOB Score:  0.19


In [None]:
print("Beggining Fresh Prince...")

# This requires a channel dimension
time_series_reshaped = np.expand_dims(time_series.values, axis=1)

from FreshPrince.FreshPrince import FreshPRINCE_GAP
fresh_prince = FreshPRINCE_GAP(default_fc_parameters="minimal", n_estimators=200)
fresh_prince_prox = data_to_proximities(fresh_prince, time_series_reshaped, labels, static)

np.save(os.path.join(data_dir, "fresh_prince_prox.npy"), fresh_prince_prox)
print("---- Fresh Prince Finished")

try:
    print("---- OOB Score: ", fresh_prince._estimator.oob_score_)
except:
    print("---- OOB Score: Not available for Fresh Prince")

Beggining Fresh Prince...
---- Fresh Prince Finished
---- OOB Score: Not available for Fresh Prince


In [37]:
print("Beggining PyF-GAP...")
sys.path.insert(0, '/yunity/arusty/PF-GAP/PFGAP')

#Check data shape to see if too large
if time_series.shape[1] > 500:
    print("---- PyF-GAP is not designed for large datasets, consider using a smaller dataset.")
    print("---- Truncating time series to 500 features.\n\n")
    time_series = time_series.iloc[:, -500:]
    
from PFGAP.PyPFGAP import PyPFGAP
pf = PyPFGAP()
pf_prox = data_to_proximities(pf, time_series, pd.Categorical(labels).codes.astype(str))

np.save(os.path.join(data_dir, "pf_prox.npy"), pf_prox)
print("\n\n---- PyF-GAP Finished")
print("---- OOB Score: Not available for PyF-GAP")

Beggining PyF-GAP...
---- PyF-GAP is not designed for large datasets, consider using a smaller dataset.
---- Truncating time series to 500 features.


reading file [train]:finished in 0:0:0.089
reading file [test]:finished in 0:0:0.025
Running on configurations...
Dataset: train, Training Data : 100x500 , Testing Data: 100x500, Train #Classes: 43, Test #Classes: 43
Repeats: 1 , Trees: 18 , Candidates per Split(r): 5
Output Dir: output, Export: 1, Verbosity: 1
Select DM per node: true , Shuffle Data: false, JVM WarmUp: false
----------------------------------------------------------------------------------------------------

-----------------Repetition No: 1 (train)   -----------------
Using: 4 MB, Free: 76 MB, Allocated Pool: 80 MB, Max Available: 1024 MB
core.ProximityForestResult@4f3f5b24
0.1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.
Using: 407 MB, Free: 308 MB, Allocated Pool: 715 MB, Max Available: 1024 MB
*
Computing Forest Proximities...
Done Computing Forest Proximities. Computat

# Getting test data and Validation Results
- Now that we have our proximities, lets get some numbers on the board! Woot!

In [47]:
time_series.head()

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,...,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009
AAF,-0.003807,-0.000637,0.010835,-0.019546,-0.001929,-0.025129,-0.008592,-0.002,-0.024716,-0.056849,...,0.0451,0.010788,0.026273,0.0208,0.017241,-0.016179,0.014096,0.002317,-0.003082,0.006182
KGF,-0.009946,0.00274,0.038251,-0.017544,-0.029018,-0.032644,0.01711,0.021028,-0.020595,-0.019159,...,0.06892,0.002875,-0.021294,0.011297,0.02441,-0.010905,0.007758,0.0,-0.002836,-0.011377
DPLM,-0.010774,-0.00594,-0.000996,-0.007976,0.003015,-0.033066,-0.011399,-0.003669,-0.028406,0.028695,...,0.020196,0.007353,-0.01123,0.008518,0.012387,0.002225,-0.006659,0.007821,-0.005543,-0.001672
SN,0.008463,-0.01895,0.011314,0.0,0.007913,0.010287,0.001608,0.003478,0.007731,-0.008466,...,0.024621,-0.034658,0.001915,0.001911,0.016214,0.005162,0.002801,-0.002328,0.004666,0.001858
PSON,-0.010769,-0.012753,0.012602,-0.011512,-0.016368,0.00448,-0.003504,-0.032928,0.022149,-0.088939,...,-0.014017,-0.004668,0.002558,0.008505,0.002319,0.00589,0.007528,-0.004566,0.005838,-0.000622


In [None]:
# Begin the Train test split stuff

#& Imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split

#// Split the data into train and test sets
#// if static is not None:
#//     X_train, X_test, y_train, y_test, static_train, static_test = train_test_split(
#//                                                                 time_series, labels, static, 
#//                                                                 test_size=0.2, random_state=42
#//                                                                 )
#// else:
#//     X_train, X_test, y_train, y_test = train_test_split(
#//                                                         time_series, labels, 
#//                                                         test_size=0.2, random_state=42
#//                                                         )



def get_cross_validation_results(get_predictions_method):
    # Set up cross-validation
    skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

    results = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(time_series, labels)):
        print(f"Fold {fold + 1}/{skf.n_splits}: {len(train_idx)} train samples, {len(test_idx)} test samples")
        # Split data
        X_train, X_test = time_series[train_idx], time_series[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]
    
        if static is not None:
            y_pred, prox_train, prox_test = get_predictions_method(X_train, y_train, X_test, static[train_idx], static[test_idx])
        else:
            y_pred, prox_train, prox_test = get_predictions_method(X_train, y_train, X_test, None, None)
        
        # RF metrics
        rf_f1 = f1_score(y_test, y_pred, average='weighted')
        rf_recall = recall_score(y_test, y_pred, average='weighted')
        rf_precision = precision_score(y_test, y_pred, average='weighted')
        rf_acc = accuracy_score(y_test, y_pred)

        #KNN metrics
        if prox_train is None or prox_test is None:
            acc = None
            f1 = None
            precision = None
            recall = None
        else:
            knn = KNeighborsClassifier(n_neighbors=4, metric='precomputed')
            knn.fit(1 - prox_train, y_train)
            y_pred = knn.predict(1- prox_test)

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')
            
        results.append({
            'fold': fold,
            'train_idx': train_idx,
            'test_idx': test_idx,
            'rf_f1': rf_f1,
            'rf_recall': rf_recall,
            'rf_precision': rf_precision,
            'rf_accuracy': rf_acc,
            'knn_accuracy': acc,
            'knn_f1': f1,
            'knn_precision': precision,
            'knn_recall': recall
        })

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df._append({
        'fold': 'Average',
        'rf_f1': results_df['rf_f1'].mean(),
        'rf_recall': results_df['rf_recall'].mean(),
        'rf_precision': results_df['rf_precision'].mean(),
        'rf_accuracy': results_df['rf_accuracy'].mean(),
        'knn_accuracy': results_df['knn_accuracy'].mean(),
        'knn_f1': results_df['knn_f1'].mean(),
        'knn_precision': results_df['knn_precision'].mean(),
        'knn_recall': results_df['knn_recall'].mean()
    }, ignore_index=True)

import matplotlib.pyplot as plt

def plot_cv_results(results_df):
    """
    Plots cross-validation metrics for each fold, ignoring the average row.
    Args:
        results_df (pd.DataFrame): DataFrame returned by get_cross_validation_results
    """
    # Exclude the average row
    plot_df = results_df[results_df['fold'] != 'Average']

    metrics = [
        ("rf_accuracy", "RF Accuracy"),
        ("rf_f1", "RF F1"),
        ("rf_precision", "RF Precision"),
        ("rf_recall", "RF Recall"),
        ("knn_accuracy", "KNN Accuracy"),
        ("knn_f1", "KNN F1"),
        ("knn_precision", "KNN Precision"),
        ("knn_recall", "KNN Recall"),
    ]
    plt.figure(figsize=(12, 6))
    for metric, label in metrics:
        if metric in plot_df.columns:
            plt.plot(plot_df["fold"], plot_df[metric], marker='o', label=label)
    plt.xlabel("Fold")
    plt.ylabel("Score")
    plt.ylim(0, 1)
    plt.title("Cross-Validation Metrics by Fold")
    plt.legend()
    plt.grid(True)
    plt.show()

#& Models Here

def get_rocket_pred(X_train, y_train, X_test, static_train, static_test):
    rocket = RFGAP_Rocket(prediction_type = "classification", rocket = "Multi",
                         n_kernels=256, prox_method = "rfgap")
    
    rocket.fit(X_train, y_train, static_train, weights = None)
    return rocket.predict(X_test, static_test), rocket.get_proximities().toarray(), rocket.get_test_proximities(X_test, static_test).toarray()


In [46]:
print("Beggining Rocket Cross Validation Tests...")
rocket_fold = get_cross_validation_results(get_rocket_pred)
rocket_fold.to_csv(os.path.join(data_dir, "rocket_scores.csv"), index=False)
print("---- Rocket Cross Validation Finished")
plot_cv_results(rocket_fold) 

Beggining Rocket Cross Validation Tests...
Fold 1/6: 83 train samples, 17 test samples




AttributeError: 'DataFrame' object has no attribute 'toarray'