In [66]:
# Imports the useful modules of several libraries.
from khiva.features import *
from khiva.distances import *
from khiva.library import *

import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=Warning)


from sklearn.utils import shuffle
from sklearn.preprocessing import scale

from sklearn.model_selection import GridSearchCV
from sklearn import svm

import time

In [67]:
# Prepares the jupyter-notebook environment.
# Both lines below are IPython magics, so this cell only runs inside an IPython/Jupyter kernel.
# Turns on the completer's "greedy" option.
%config IPCompleter.greedy=True
# Selects the inline matplotlib backend.
# NOTE(review): no plotting call is visible in this part of the notebook — confirm it is still needed.
%matplotlib inline

In [68]:
# Loads the metadata first: the site ids below are read from it, so it has to
# exist before file_names is built (otherwise a fresh kernel raises NameError).
all_sites = pd.read_csv("../../energy/data/data-enerNoc/all-data/meta/all_sites.csv")

# Collects the id of every site ("SITE_ID" column), in metadata row order.
file_names = list(all_sites["SITE_ID"].values)

In [71]:
# Prints the backend used. CPU, CUDA and OPENCL are available for khiva.
# The output recorded for this run shows that the CPU backend was active.
print(get_backend())

KHIVABackend.KHIVA_BACKEND_CPU


In [72]:
# Let's load the stored time series and wrap them in a Khiva Array.
# Per the original note: 100 time series, 1,666,600 data points in total.
series_path = "../../energy/electric-consumption-rate-python/time-series-redimension-applied.npy"
arr_tmp = Array(np.load(series_path))

In [None]:
# Feature extraction.
# Every extractor below is applied to the whole Khiva array; the order of this
# list is the order in which the features are stacked.
feature_extractors = [
    abs_energy,
    absolute_sum_of_changes,
    count_above_mean,
    count_below_mean,
    first_location_of_maximum,
    first_location_of_minimum,
    has_duplicates,
    has_duplicate_max,
    kurtosis,
    last_location_of_maximum,
    last_location_of_minimum,
    has_duplicate_min,
    longest_strike_above_mean,
    longest_strike_below_mean,
    maximum,
    mean_absolute_change,
    minimum,
    lambda series: number_crossing_m(series, 0),  # the only extractor with an extra argument (0)
    mean,
    median,
    mean_change,
    ratio_value_number_to_time_series_length,
    skewness,
    standard_deviation,
    sum_of_reoccurring_values,
    sum_values,
    variance,
    variance_larger_than_standard_deviation,
]

start = time.time()
# Each result is converted to NumPy and stacked along a new first axis.
features = np.stack([extract(arr_tmp).to_numpy() for extract in feature_extractors])
# Prints the time taken in order to get 28 features.
print("Time to extract the features : " + str(time.time() - start) )
# Let's transpose the stacked NumPy array so that the feature axis comes last
# (one column per feature when each feature vector is 1-D — TODO confirm).
features = features.transpose()

In [62]:
# Defines the model inputs: X is the extracted features matrix and y holds the
# "SUB_INDUSTRY" column of the site metadata, used as the targets.
# NOTE(review): assumes the metadata rows are in the same order as the loaded
# time series — TODO confirm.
X, y = features, all_sites["SUB_INDUSTRY"].values

In [63]:
# Preprocesses the feature matrix with sklearn.preprocessing.scale.
# X is overwritten, so re-running this cell rescales the already-scaled matrix.
# NOTE(review): the scaling is computed on the full matrix before the train/test
# split made further down, so the held-out samples contribute to the scaling
# statistics — confirm this is intended.
X = scale(X)

In [64]:
# Makes several shuffles to distribute the samples.
# The three sequences are shuffled together so rows, targets and site ids stay
# aligned. Every pass uses the same seed (random_state=0), so the final order is
# deterministic; the 15 passes are kept as-is because changing their number
# would change the resulting sample order.
shuffled = (X, y, file_names)
for shuffle_pass in range(15):
    shuffled = shuffle(*shuffled, random_state=0)
X, y, file_names = shuffled

In [65]:
# Creates a model, fits it and predicts a subset of samples.

# Site ids that are held out of training and only used for the final check.
test_site_ids = {92, 45, 761, 10, 766, 400, 673, 49, 144, 496, 731, 281, 213, 197, 399}

# Positions of the held-out sites in the current (shuffled) sample order, and their ids.
list_test_indices = [idx for idx, site_id in enumerate(file_names) if site_id in test_site_ids]
files_test = [file_names[idx] for idx in list_test_indices]

# Everything that is not held out is used for training / cross-validation.
X_train = np.delete(X, list_test_indices, 0)
X_test = np.take(X, list_test_indices, 0)
y_train = np.delete(y, list_test_indices)
y_test = np.take(y, list_test_indices)

# NOTE(review): SVC() uses its default kernel here; per the scikit-learn docs 'degree'
# only affects the polynomial kernel, so it presumably has no effect on this search — confirm.
k_range_parameter = {'degree':[3,4],'shrinking':[True,False],'probability':[True,False]}

# SVC is selected because of: >50 samples, predicting a category, labeled data and <100K samples. Good results.
clf = svm.SVC()

# Apply a gridSearchCV in order to get the best estimator during the training and cross-validation step.
mygridsearch = GridSearchCV(clf, k_range_parameter, cv = 10, scoring = 'accuracy' )
mygridsearch.fit(X_train, y_train)

# Predicts the held-out samples with the best estimator found by the search.
# This assignment was missing: y_pred was printed below without ever being defined.
y_pred = mygridsearch.predict(X_test)

print("MY TEST VECTOR: " + str(y_test))
print("MY PREDICTION: " + str(y_pred))
print("NUMBER OF ERRORS: " + str(sum(y_pred != y_test)))
# NOTE(review): the two values below are fractions in [0, 1] although the labels append "%";
# the strings are left unchanged so the output keeps the format of the recorded run.
print("ERROR RATE: " + str(1 - sum(y_pred == y_test) / float(len(y_pred))) + "%")
print("ACCURACY: " + str(sum(y_pred == y_test) / float(len(y_pred))) + "%")
print("PARAMETERS USED: "+ str(mygridsearch.best_params_))

MY TEST VECTOR: ['Grocer/Market' 'Primary/Secondary School' 'Food Processing'
 'Food Processing' 'Grocer/Market' 'Primary/Secondary School'
 'Grocer/Market' 'Food Processing' 'Shopping Center/Shopping Mall'
 'Shopping Center/Shopping Mall' 'Manufacturing'
 'Primary/Secondary School' 'Shopping Center/Shopping Mall'
 'Primary/Secondary School' 'Grocer/Market']
MY PREDICTION: ['Grocer/Market' 'Primary/Secondary School' 'Food Processing'
 'Food Processing' 'Grocer/Market' 'Primary/Secondary School'
 'Grocer/Market' 'Food Processing' 'Shopping Center/Shopping Mall'
 'Shopping Center/Shopping Mall' 'Food Processing'
 'Primary/Secondary School' 'Primary/Secondary School' 'Food Processing'
 'Grocer/Market']
NUMBER OF ERRORS: 3
ERROR RATE: 0.19999999999999996%
ACCURACY: 0.8%
PARAMETERS USED: {'degree': 3, 'probability': True, 'shrinking': True}
