In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Hide TensorFlow INFO and WARNING logs; errors are still printed
import time
import numpy as np
import pandas as pd

from sktime.classification.interval_based import TimeSeriesForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer

import warnings
warnings.simplefilter("ignore", UserWarning)

# First Read the SITS Data

sktime requires 3D numpy arrays of the form `(n_instances, n_channels, series_length)`.

In [5]:
# Training subset (CSV); the path is relative to the notebook's working directory.
DATA_TRAIN = "./dataset/SITS-train-phase1-subset-1000.csv"


# Set univariate=False if you use a classifier with multivariate capabilities
def read_data_sktime(DATA, univariate=False, n_channels=3):
    """Load a SITS CSV file and return it as a 3D array plus integer labels.

    Parameters
    ----------
    DATA : str or path-like
        Comma-separated file with an ``id`` column (used as the index). The
        first remaining column is taken as the class label, every later
        column as a series value. ``?`` entries are read as missing values.
    univariate : bool, default=False
        If True, each row is kept as a single channel and the result has
        shape ``(n_samples, 1, n_values)``. If False, each row is cut into
        ``n_channels`` consecutive, equally long chunks and the result has
        shape ``(n_samples, n_channels, n_values // n_channels)``.
    n_channels : int, default=3
        Number of channels used when ``univariate`` is False.

    Returns
    -------
    X : numpy.ndarray
        The reshaped series values (float).
    y : pandas.Series
        The label column cast to int, indexed by ``id``.

    Raises
    ------
    ValueError
        If ``univariate`` is False and the number of value columns cannot be
        divided evenly into ``n_channels`` channels.
    """
    data = pd.read_csv(DATA, delimiter=",", na_values=['?'], dtype='float', index_col="id")

    # Extract Data and Labels
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].astype(int)

    n_samples, n_values = X.shape
    if univariate:
        X = X.reshape(n_samples, 1, n_values)
    else:
        # Fail with a readable message instead of numpy's generic reshape error.
        if n_channels < 1 or n_values % n_channels != 0:
            raise ValueError(
                "Cannot split %d value columns into %d equally long channels"
                % (n_values, n_channels)
            )
        # NOTE(review): a row-major reshape puts the first n_values // n_channels
        # columns into channel 0, the next chunk into channel 1, and so on.
        # This assumes the file stores the channels one after another --
        # TODO confirm against the dataset description.
        X = X.reshape(n_samples, n_channels, n_values // n_channels)

    print(X.shape)
    return X, y

# Load the training data with one channel per sample; the call also prints the array shape.
X_train, y_train = read_data_sktime(DATA_TRAIN, univariate=True)

(1000, 1, 138)


# Train an sktime Time Series Forest Model

We are using GridSearch and Cross-Validation to train the model. 

In [6]:
# Search space: only the forest size varies; the seed and the estimator-level
# parallelism are each pinned to a single value.
param_grid = dict(
    n_estimators=[100, 200],
    random_state=[1],
    n_jobs=[-1],
)

# Estimator to tune and the metric used to rank the candidates
clf = TimeSeriesForestClassifier()
scorer = make_scorer(f1_score, average='weighted')

# 5-fold cross-validated grid search; the best candidate is refit on all of X_train
search_started = time.perf_counter()
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)
fit_time = np.round(time.perf_counter() - search_started, 5)

# Keep the winner and its summary for the cells below
best_clf, best_params, best_score = (
    grid.best_estimator_,
    grid.best_params_,
    grid.best_score_,
)

print("Best F1-Score:", best_score)
print("Time taken:", fit_time)

# Per-candidate mean score with a +/- two-standard-deviation band across the folds
print("Detailed scores on train dataset:")
print()
cv_results = grid.cv_results_
means, stds = cv_results['mean_test_score'], cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print(("%0.3f (+/-%0.03f) for \n\t %r" % (mean, std * 2, params)) + "\n")
    

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best F1-Score: 0.5976525388868892
Time taken: 8.15445
Detailed scores on train dataset:

0.582 (+/-0.060) for 
	 {'n_estimators': 100, 'n_jobs': -1, 'random_state': 1}

0.598 (+/-0.076) for 
	 {'n_estimators': 200, 'n_jobs': -1, 'random_state': 1}



# Submit your solution to Kaggle

<div class="alert alert-success alertsuccess" style="margin-top: 20px">
Create a submission named `submission.csv` using your model and upload it to kaggle:

- Phase 1: TODO

</div>

In [6]:
# Read the data
DATA_TEST = "./dataset/SITS-test-data-phase1-nolabel.csv"

# Only the features are needed for prediction: the second return value
# (the first data column cast to int) is discarded.
X_test, _ = read_data_sktime(DATA_TEST, univariate=True)

(20000, 1, 138)


In [7]:
# Make a prediction
# best_clf is the estimator that the grid search refit with its best parameters.
predictions = best_clf.predict(X_test)

In [8]:
# Build the Kaggle submission table: one PREDICTED value per row, with the
# frame's row index exported under the column name "ID".
filename = 'baseline_tsf_submission_phase1.csv'
submission = pd.DataFrame({'PREDICTED': predictions}).rename_axis("ID")

submission.to_csv(filename, index=True)
print('Saved file: ' + filename)

# Show the first 5 rows as a sanity check
submission.head()

Saved file: baseline_tsf_submission_phase1.csv


Unnamed: 0_level_0,PREDICTED
ID,Unnamed: 1_level_1
0,1
1,6
2,9
3,18
4,3


# Try again with a Deep Learning CNN Classifier

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Hide TensorFlow INFO and WARNING logs; errors are still printed

import time
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer

from sktime.classification.deep_learning import CNNClassifier

# Training subset (CSV); the path is relative to the notebook's working directory.
DATA_TRAIN = "./dataset/SITS-train-phase1-subset-1000.csv"


# Set univariate=False if you use a classifier with multivariate capabilities
def read_data_sktime(DATA, univariate=False, n_channels=3):
    """Load a SITS CSV file and return it as a 3D array plus integer labels.

    Parameters
    ----------
    DATA : str or path-like
        Comma-separated file with an ``id`` column (used as the index). The
        first remaining column is taken as the class label, every later
        column as a series value. ``?`` entries are read as missing values.
    univariate : bool, default=False
        If True, each row is kept as a single channel and the result has
        shape ``(n_samples, 1, n_values)``. If False, each row is cut into
        ``n_channels`` consecutive, equally long chunks and the result has
        shape ``(n_samples, n_channels, n_values // n_channels)``.
    n_channels : int, default=3
        Number of channels used when ``univariate`` is False.

    Returns
    -------
    X : numpy.ndarray
        The reshaped series values (float).
    y : pandas.Series
        The label column cast to int, indexed by ``id``.

    Raises
    ------
    ValueError
        If ``univariate`` is False and the number of value columns cannot be
        divided evenly into ``n_channels`` channels.
    """
    data = pd.read_csv(DATA, delimiter=",", na_values=['?'], dtype='float', index_col="id")

    # Extract Data and Labels
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].astype(int)

    n_samples, n_values = X.shape
    if univariate:
        X = X.reshape(n_samples, 1, n_values)
    else:
        # Fail with a readable message instead of numpy's generic reshape error.
        if n_channels < 1 or n_values % n_channels != 0:
            raise ValueError(
                "Cannot split %d value columns into %d equally long channels"
                % (n_values, n_channels)
            )
        # NOTE(review): a row-major reshape puts the first n_values // n_channels
        # columns into channel 0, the next chunk into channel 1, and so on.
        # This assumes the file stores the channels one after another --
        # TODO confirm against the dataset description.
        X = X.reshape(n_samples, n_channels, n_values // n_channels)

    print(X.shape)
    return X, y


# Load the training data with one channel per sample; the call also prints the array shape.
X_train, y_train = read_data_sktime(DATA_TRAIN, univariate=True)


# A wider search space to try when more compute time is available:
# param_grid = {
#                 "kernel_size": [3,5,7,9,11],
#                 "n_conv_layers": [2,3,5,7],
#               }

# Single-candidate grid, so the search below amounts to one 5-fold cross-validation
param_grid = {
                "kernel_size": [7],
                "n_conv_layers": [2],
              }

# choose a classifier
# The seed is fixed so that repeated runs are comparable, mirroring the
# random_state=1 used for the time series forest search. Seeding is
# best-effort: TensorFlow may still introduce some run-to-run variation.
clf = CNNClassifier(random_state=1)
scorer = make_scorer(f1_score, average='weighted')

# perform a grid-search
# Separate start variable so fit_time only ever holds the elapsed seconds.
fit_start = time.perf_counter()
grid = GridSearchCV(clf, param_grid, cv = 5, scoring=scorer, refit=True, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
fit_time = np.round(time.perf_counter() - fit_start, 5)

# get best model (refit on the full training data because refit=True)
best_clf = grid.best_estimator_
best_params = grid.best_params_
best_score = grid.best_score_

print("Best F1-Score:", best_score)
print("Time taken:", fit_time)

# Per-candidate mean score with a +/- two-standard-deviation band across the folds
print("Detailed scores on train dataset:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for \n\t %r"% (mean, std * 2, params))
    print()

(1000, 1, 138)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


2022-11-06 13:06:43.041399: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-06 13:06:43.044265: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-06 13:06:43.068076: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-06 13:06:43.068575: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-06 13:06:43.080820: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been r

Best F1-Score: 0.5195263476624876
Time taken: 347.59891
Detailed scores on train dataset:

0.520 (+/-0.037) for 
	 {'kernel_size': 7, 'n_conv_layers': 2}



In [8]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [6]:
clf.get_params().keys()

dict_keys(['activation', 'avg_pool_size', 'batch_size', 'callbacks', 'kernel_size', 'loss', 'metrics', 'n_conv_layers', 'n_epochs', 'optimizer', 'random_state', 'use_bias', 'verbose'])