In [3]:
# !pip install tslearn

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

# Load the data
file_path =  'stage3_data_cleaning/v2/type1_label_merged_final_decoded_clean3.xlsx'
data = pd.read_excel(file_path)

# Assuming the last column is the label and the rest are features
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Function to extract shapelets
def extract_shapelets(X, y, shapelet_size):
    shapelets = []
    labels = []
    for i in range(len(X)):
        series = X[i]
        label = y[i]
        for start in range(len(series) - shapelet_size + 1):
            shapelet = series[start:start + shapelet_size]
            shapelets.append(shapelet)
            labels.append(label)
    return np.array(shapelets), np.array(labels)

# Function to compute the distance between two sequences using DTW
def dtw_distance(s1, s2):
    distance, _ = fastdtw(s1, s2, dist=euclidean)
    return distance

# Function to evaluate shapelets
def evaluate_shapelets(shapelets, shapelet_labels, X, y):
    scores = []
    for shapelet, shapelet_label in zip(shapelets, shapelet_labels):
        distances = []
        for series in X:
            min_dist = float('inf')
            for start in range(len(series) - len(shapelet) + 1):
                subsequence = series[start:start + len(shapelet)]
                dist = dtw_distance(shapelet, subsequence)
                if dist < min_dist:
                    min_dist = dist
            distances.append(min_dist)
        clf = DecisionTreeClassifier()
        clf.fit(np.array(distances).reshape(-1, 1), y)
        score = accuracy_score(y, clf.predict(np.array(distances).reshape(-1, 1)))
        scores.append(score)
    return np.array(scores)

# Extract shapelets
shapelet_size = 10  # You can adjust the shapelet size
shapelets, shapelet_labels = extract_shapelets(X_train, y_train, shapelet_size)

# Evaluate shapelets
shapelet_scores = evaluate_shapelets(shapelets, shapelet_labels, X_train, y_train)

# Select top K shapelets
top_k = 5
top_shapelet_indices = np.argsort(shapelet_scores)[-top_k:]
top_shapelets = shapelets[top_shapelet_indices]

# Function to transform dataset using top shapelets
def transform_dataset(X, shapelets):
    transformed_X = []
    for series in X:
        transformed_series = []
        for shapelet in shapelets:
            min_dist = float('inf')
            for start in range(len(series) - len(shapelet) + 1):
                subsequence = series[start:start + len(shapelet)]
                dist = dtw_distance(shapelet, subsequence)
                if dist < min_dist:
                    min_dist = dist
            transformed_series.append(min_dist)
        transformed_X.append(transformed_series)
    return np.array(transformed_X)

# Transform training and testing datasets

X_train_transformed = transform_dataset(X_train, top_shapelets)
X_test_transformed = transform_dataset(X_test, top_shapelets)

# Train a classifier on the transformed dataset
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed, y_train)

# Make predictions and evaluate
y_pred = clf.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


ValueError: Input vector should be 1-D.

In [None]:
# !pip install fastdtw