In [1]:
import os
import time
import numpy as np
import pandas as pd

from sktime.classification.interval_based import TimeSeriesForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

In [2]:
import numba

# First Read the SITS Data

sktime requires 3D numpy arrays of the form

$$(\#\,\text{time series},\ \#\,\text{channels},\ \#\,\text{time stamps})$$

In [5]:
# Path to the gzip-compressed training CSV that is loaded below.
# You can use a larger subset if you like; the available files are:
# - SITS-train-phase2-subset-1000000.csv.gz
# - SITS-train-phase2-subset-100000.csv.gz
# - SITS-train-phase2-subset-10000.csv.gz
DATA_TRAIN = "./dataset/SITS-train-phase2-subset-10000.csv.gz"

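# Set univariate=True if your classifier only supports univariate input
# (all values end up in a single channel); keep univariate=False for a
# classifier with multivariate capabilities (values are split into 3 channels).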
def read_data_sktime(DATA, univariate=False):
    """Read a gzip-compressed SITS CSV file and shape it for sktime.

    The file is expected to provide an ``id`` column (used as the index),
    the class label in the first remaining column and the series values in
    every column after that. ``?`` entries are treated as missing values.
    The shape of the resulting array is printed as a side effect.

    Parameters
    ----------
    DATA : str or path-like
        Location of the ``.csv.gz`` file.
    univariate : bool, default False
        If True, all values of a row are kept in a single channel, giving
        an array of shape ``(n_series, 1, n_values)``. If False, the values
        are split into three channels, giving
        ``(n_series, 3, n_values // 3)``.

    Returns
    -------
    X : numpy.ndarray
        3D float array in the ``(series, channels, time stamps)`` layout.
    y : pandas.Series
        Labels cast to ``int``, indexed by ``id``.
    """
    data = pd.read_csv(
        DATA,
        delimiter=",",
        na_values=['?'],
        dtype='float',
        index_col="id",
        compression='gzip',
    )

    # Fill NaN values in the most basic way: carry the next valid value
    # backwards along each row, then carry the last valid value forwards to
    # close any gaps left at the end of a row.
    # DataFrame.bfill/ffill are used instead of fillna(method=...), which is
    # deprecated in recent pandas releases; the result is the same.
    # NOTE: the fill runs over the whole row, label column included, so a
    # missing label is overwritten with the first available series value.
    data = data.bfill(axis=1).ffill(axis=1)

    # Extract data and labels: column 0 is the label, the rest is the series.
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].astype(int)

    if univariate:
        X = X.reshape(X.shape[0], 1, X.shape[1])
    else:
        # NOTE(review): this reshape assumes the columns are stored channel
        # by channel (all time stamps of one channel, then the next) --
        # confirm against the dataset description.
        X = X.reshape(X.shape[0], 3, X.shape[1] // 3)

    print(X.shape)
    return X, y

# Load the training set with all values of a row in a single channel
# (univariate=True), i.e. an array of shape (n_series, 1, n_values).
X_train, y_train = read_data_sktime(DATA_TRAIN, univariate=True)

(10000, 1, 138)


# Train an sktime Time Series Forest Model

We use a grid search with 5-fold cross-validation to select the hyper-parameters and train the model.

In [6]:
# Candidate hyper-parameters; single-element lists pin a value for every run.
param_grid = {
    "n_estimators": [100, 200],
    "random_state": [1],
    "n_jobs": [-1],
}

# Classifier under test and the weighted F1 metric used to rank candidates.
clf = TimeSeriesForestClassifier()
scorer = make_scorer(f1_score, average='weighted')

# Time the full grid search (5-fold CV, best candidate refit at the end).
fit_time = time.perf_counter()
grid = GridSearchCV(
    clf,
    param_grid,
    cv=5,
    scoring=scorer,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)
fit_time = np.round(time.perf_counter() - fit_time, 5)

# Keep the winning model together with its parameters and mean CV score.
best_clf = grid.best_estimator_
best_params = grid.best_params_
best_score = grid.best_score_

print("Best F1-Score:", best_score)
print("Time taken:", fit_time)

# Per-candidate summary: mean test score and twice its standard deviation.
print("Detailed scores on train dataset:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for position, params in enumerate(grid.cv_results_['params']):
    spread = stds[position] * 2
    print("%0.3f (+/-%0.03f) for \n\t %r" % (means[position], spread, params))
    print()
    

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best F1-Score: 0.6770186730719441
Time taken: 384.10061
Detailed scores on train dataset:

0.676 (+/-0.007) for 
	 {'n_estimators': 100, 'n_jobs': -1, 'random_state': 1}

0.677 (+/-0.014) for 
	 {'n_estimators': 200, 'n_jobs': -1, 'random_state': 1}



# Submit your solution to Kaggle

<div class="alert alert-success alertsuccess" style="margin-top: 20px">
Create a submission file using your model (the cells below save it as `baseline_tsf_submission_phase2.csv`) and upload it to Kaggle:

- Phase 1: https://www.kaggle.com/competitions/sits-ws22-phase1

</div>

In [None]:
# Read the unlabeled test data.
# NOTE(review): this path starts with "../dataset/phase2/" whereas DATA_TRAIN
# uses "./dataset/" -- confirm that both resolve from the notebook's working
# directory.
DATA_TEST = "../dataset/phase2/SITS-test-data-phase2-nolabel.csv.gz"

# The second return value (whatever read_data_sktime parses from the first
# data column as labels) is not needed for the test set and is discarded.
X_test, _ = read_data_sktime(DATA_TEST, univariate=True)

In [None]:
# Make a prediction with the best estimator found by the grid search
# (best_clf = grid.best_estimator_, refit because refit=True).
predictions = best_clf.predict(X_test)

In [None]:
# Build the Kaggle submission table: one PREDICTED column with the
# positional index exported under the name "ID".
filename = 'baseline_tsf_submission_phase2.csv'
submission = pd.DataFrame({'PREDICTED': predictions}).rename_axis("ID")

# Write the table to disk, index included, and report where it went.
submission.to_csv(filename, index=True)
print(f'Saved file: {filename}')

# Show the first 5 rows as a quick sanity check
submission.head()