In [376]:
#!pip install catboost

In [377]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [378]:
# Load the three input tables used by every experiment below.
train = pd.read_csv('./data/train_actual.csv')
test = pd.read_csv('./data/test_actual.csv')
data = pd.read_csv('./data/full.csv')

# Reproducible random sample of 80% of the rows of `train` (fixed seed).
train_80 = train.sample(frac = 0.8, random_state = 42)
 
# Holdout part of `train`: the remaining 20% of rows,
# i.e. every row that was not drawn into `train_80`.
train_hold = train.drop(train_80.index)

# Drop the very same index labels from `test`.
# NOTE(review): this only yields the matching holdout rows if `test` shares its
# row index with `train` (same rows, same order) - confirm against the CSV files.
test_hold = test.drop(train_80.index)


# Straightforward solutions
Let's assume that the best method for every time series is the one that achieves the best result most often.
We report the accuracy obtained by choosing that method for every time series, together with the average RMSSE of that model.

In [379]:
def straightforward(raw_data, best_data, criteria, models):
    """Score the baseline that uses one fixed model for every time series.

    For each ``(criterion, model)`` pair the function measures how often
    ``model`` is the recorded best model in ``best_data`` and which score that
    model actually reaches in ``raw_data``.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Per-model results. Must contain the columns ``'naming_orig'``,
        ``'model_name'`` and one column per criterion.
    best_data : pandas.DataFrame
        The time series to evaluate, one row each. Must contain
        ``'naming_orig'`` and a ``'<criterion>_model'`` column per criterion.
    criteria : sequence of str
        Names of the score columns to evaluate.
    models : sequence of str
        The fixed model to apply for each criterion; same shape as
        ``criteria``.

    Returns
    -------
    pandas.DataFrame
        One row per criterion (integer index ``0..len(criteria)-1``) with the
        columns ``metric``, ``model``, ``accuracy``, ``lost_rate``, ``RMSSE``:

        * ``accuracy``  - share of rows of ``best_data`` whose
          ``'<criterion>_model'`` equals the fixed model;
        * ``lost_rate`` - share of rows for which ``raw_data`` holds no usable
          (non-missing) score of the fixed model;
        * ``RMSSE``     - sum of the model's lowest score per series divided by
          the number of rows of ``best_data`` (lost series add nothing to the
          sum). The column keeps this name for every criterion.

        All three values are NaN when ``best_data`` is empty.

    Raises
    ------
    AssertionError
        If ``criteria`` and ``models`` differ in shape.
    KeyError
        If a required column is missing from ``raw_data`` or ``best_data``.
    """
    assert np.array(models).shape == np.array(criteria).shape
    baseline = pd.DataFrame(index=criteria,
                            columns=['metric', 'model', 'accuracy', 'lost_rate', 'RMSSE'])
    baseline['metric'] = criteria
    baseline['model'] = models

    total = len(best_data)
    for criterion, model in zip(criteria, models):
        # Accuracy: count the rows where the fixed model is the recorded winner.
        # Counting the boolean mask avoids depending on missing values in
        # whichever column happens to come first in `best_data`.
        hits = int((best_data[criterion + '_model'] == model).sum())

        # Lowest score of the fixed model per series, computed once instead of
        # re-filtering `raw_data` for every series.
        model_rows = raw_data[raw_data['model_name'] == model]
        best_score = model_rows.groupby('naming_orig')[criterion].min()

        # Series without a usable score map to NaN and are counted as lost.
        scores = best_data['naming_orig'].map(best_score)
        lost = int(scores.isna().sum())
        score_sum = float(scores.sum())

        if total:
            accuracy, lost_rate, mean_score = hits / total, lost / total, score_sum / total
        else:
            # Nothing to evaluate: report NaN instead of dividing by zero.
            accuracy = lost_rate = mean_score = np.nan

        baseline.loc[criterion, 'lost_rate'] = lost_rate
        baseline.loc[criterion, 'accuracy'] = accuracy
        baseline.loc[criterion, 'RMSSE'] = mean_score

    # Switch to a positional index only after every criterion has been filled
    # in; doing it inside the loop breaks label lookup for later criteria.
    baseline.index = np.arange(len(criteria))

    return baseline

## Experiment I
We take the most frequent model from the **train** set and apply it to the **train** holdout as a prediction.

In [380]:
# Most frequent winning model within the 80% training sample.
best_train_model = train_80['RMSSE_model'].mode().iloc[0]

# Score that single model on the train holdout, using the validation-split results.
train_baseline = straightforward(
    raw_data=data.loc[data['split'] == 'validation'],
    best_data=train_hold,
    criteria=['RMSSE'],
    models=[best_train_model],
)
train_baseline

Unnamed: 0,metric,model,accuracy,lost_rate,RMSSE
0,RMSSE,TFTTuningObjective_gl,0.137931,0.103448,0.877375


## Experiment II
We take the most frequent model from the **train** set and apply it to the **test** holdout as a prediction.

In [381]:
# Same fixed model as before, now scored on the test holdout with the test-split results.
cv_baseline = straightforward(
    raw_data=data.loc[data['split'] == 'test'],
    best_data=test_hold,
    criteria=['RMSSE'],
    models=[best_train_model],
)
cv_baseline

Unnamed: 0,metric,model,accuracy,lost_rate,RMSSE
0,RMSSE,TFTTuningObjective_gl,0.149425,0.103448,1.001152


## Experiment III
For each time series, we take the best model recorded in the **train** holdout and apply it to the same series in the **test** holdout as a prediction.

In [382]:
def validation2test(raw_data, val_data, test_data, criteria):
    """Score the rule "reuse each series' best validation model on test".

    For every series listed in ``test_data`` the model recorded for that
    series in ``val_data`` is looked up, and the score this model reaches in
    ``raw_data`` is collected.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Per-model results. Must contain ``'naming_orig'``, ``'model_name'``
        and one column per criterion.
    val_data : pandas.DataFrame
        Source of the chosen models. Must contain ``'naming_orig'`` and a
        ``'<criterion>_model'`` column per criterion. If a series occurs more
        than once, its first row is used.
    test_data : pandas.DataFrame
        Series to evaluate; same shape and same row index as ``val_data``,
        with the same required columns.
    criteria : sequence of str
        Names of the score columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per criterion (integer index ``0..len(criteria)-1``) with the
        columns ``metric``, ``model`` (always ``'N/A'``), ``accuracy``,
        ``lost_rate``, ``RMSSE``:

        * ``accuracy``  - share of rows whose ``'<criterion>_model'`` agrees
          between ``val_data`` and ``test_data`` (compared row by row);
        * ``lost_rate`` - share of series with no usable score, i.e. the
          series is absent from ``val_data`` or ``raw_data`` has no
          non-missing score for the chosen model;
        * ``RMSSE``     - sum of the collected scores (lowest one per series
          and model) divided by the number of rows of ``val_data``.

        All three values are NaN when the inputs are empty.

    Raises
    ------
    AssertionError
        If ``val_data`` and ``test_data`` differ in shape.
    ValueError
        Raised by pandas if ``val_data`` and ``test_data`` are not identically
        indexed.
    KeyError
        If a required column is missing.
    """
    # np.shape reads the frames' own .shape instead of copying them to arrays.
    assert np.shape(val_data) == np.shape(test_data)
    result = pd.DataFrame(index=criteria,
                          columns=['metric', 'model', 'accuracy', 'lost_rate', 'RMSSE'])
    result['metric'] = criteria
    result['model'] = 'N/A'

    total = len(val_data)
    for criterion in criteria:
        model_col = criterion + '_model'

        # Accuracy: rows where validation and test agree on the best model.
        hits = int((val_data[model_col] == test_data[model_col]).sum())

        # series -> model chosen on validation (first row wins).
        chosen_model = (val_data.drop_duplicates('naming_orig')
                                .set_index('naming_orig')[model_col]
                                .to_dict())
        # (series, model) -> lowest score found in raw_data.
        best_score = (raw_data.groupby(['naming_orig', 'model_name'])[criterion]
                              .min()
                              .to_dict())

        lost = 0
        score_sum = 0.0
        for ts in test_data['naming_orig']:
            # A series unknown to val_data yields model None, which is never a
            # key of best_score, so it is counted as lost instead of raising.
            score = best_score.get((ts, chosen_model.get(ts)), np.nan)
            if pd.isna(score):
                lost += 1
            else:
                score_sum += score

        if total:
            accuracy, lost_rate, mean_score = hits / total, lost / total, score_sum / total
        else:
            # Nothing to evaluate: report NaN instead of dividing by zero.
            accuracy = lost_rate = mean_score = np.nan

        result.loc[criterion, 'lost_rate'] = lost_rate
        result.loc[criterion, 'accuracy'] = accuracy
        result.loc[criterion, 'RMSSE'] = mean_score

    # Switch to a positional index only after every criterion has been filled
    # in; doing it inside the loop breaks label lookup for later criteria.
    result.index = np.arange(len(criteria))

    return result

In [383]:
# Reuse each series' train-holdout model on the test holdout, scored with the test-split results.
val2test = validation2test(
    raw_data=data.loc[data['split'] == 'test'],
    val_data=train_hold,
    test_data=test_hold,
    criteria=['RMSSE'],
)
val2test

Unnamed: 0,metric,model,accuracy,lost_rate,RMSSE
0,RMSSE,,0.172414,0.0,0.892177


# Classifier solutions

In [384]:

# Move 'naming_orig' from a column into the index of both splits; every
# remaining column except 'RMSSE_model' is used as a feature below.
# NOTE(review): both names are reassigned in place, so re-running this cell
# raises KeyError because 'naming_orig' is no longer a column.
train_80 = train_80.set_index('naming_orig')
train_hold = train_hold.set_index('naming_orig')

# Features / target for fitting: the 80% training sample.
# NOTE(review): the feature frames keep the 'RMSSE' score column that the
# baseline cells read from these same frames - confirm it is meant to be a feature.
X_train = train_80.drop('RMSSE_model', axis=1)
y_train = train_80['RMSSE_model']

# Features / target for evaluation: the train holdout (despite the `_test` names).
X_test = train_hold.drop('RMSSE_model', axis=1)
y_test = train_hold['RMSSE_model']

# CatBoost classifier with hard-coded hyper-parameters; the commented line is
# an alternative configuration that is not used.
model = CatBoostClassifier(iterations=18,  depth=7, learning_rate = 0.3)
#model = CatBoostClassifier(iterations=26,  depth=8, learning_rate = 0.23)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Weighted F1 of the predictions on the train holdout
# (the printed label calls it the "test set").
f1 = f1_score(y_test, y_pred, average = 'weighted')
print(f"Model f1_score on test set: {f1:.4f}")

0:	learn: 2.7321464	total: 77.7ms	remaining: 1.32s
1:	learn: 2.5996624	total: 191ms	remaining: 1.53s
2:	learn: 2.4873226	total: 376ms	remaining: 1.88s
3:	learn: 2.4121159	total: 560ms	remaining: 1.96s
4:	learn: 2.3008217	total: 760ms	remaining: 1.98s
5:	learn: 2.2261429	total: 948ms	remaining: 1.9s
6:	learn: 2.1665516	total: 1.15s	remaining: 1.8s
7:	learn: 2.1117317	total: 1.36s	remaining: 1.71s
8:	learn: 2.0606526	total: 1.55s	remaining: 1.55s
9:	learn: 2.0162702	total: 1.73s	remaining: 1.39s
10:	learn: 1.9760321	total: 1.93s	remaining: 1.23s
11:	learn: 1.9375247	total: 2.11s	remaining: 1.06s
12:	learn: 1.9035135	total: 2.29s	remaining: 882ms
13:	learn: 1.8730668	total: 2.49s	remaining: 712ms
14:	learn: 1.8305909	total: 2.69s	remaining: 538ms
15:	learn: 1.7960115	total: 2.85s	remaining: 357ms
16:	learn: 1.7658623	total: 2.93s	remaining: 172ms
17:	learn: 1.7395577	total: 3s	remaining: 0us
Model f1_score on test set: 0.1738


In [385]:

def give_results(raw_data, test, predict, criteria):
    """Score per-series model predictions against the recorded best models.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Per-model results. Must contain ``'naming_orig'``, ``'model_name'``
        and one column per criterion.
    test : pandas.DataFrame
        True best models. Its index holds the series identifiers (matched
        against ``raw_data['naming_orig']``) and it has a
        ``'<criterion>_model'`` column per criterion.
    predict : pandas.DataFrame
        Predicted models; same shape, index and columns as ``test``.
    criteria : sequence of str
        Names of the score columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per criterion (integer index ``0..len(criteria)-1``) with the
        columns ``metric``, ``model`` (always ``'N/A'``), ``accuracy``,
        ``lost_rate``, ``RMSSE``:

        * ``accuracy``  - share of series whose predicted model equals the
          true one;
        * ``lost_rate`` - share of series for which ``raw_data`` holds no
          usable (non-missing) score of the predicted model;
        * ``RMSSE``     - sum of the predicted models' scores (first matching
          row of ``raw_data`` per series and model) divided by the number of
          rows of ``test``.

        All three values are NaN when ``test`` is empty.

    Raises
    ------
    AssertionError
        If ``test`` and ``predict`` differ in shape.
    ValueError
        Raised by pandas if ``test`` and ``predict`` are not identically
        indexed.
    KeyError
        If a required column is missing.
    """
    # np.shape reads the frames' own .shape instead of copying them to arrays.
    assert np.shape(test) == np.shape(predict)
    result = pd.DataFrame(index=criteria,
                          columns=['metric', 'model', 'accuracy', 'lost_rate', 'RMSSE'])
    result['metric'] = criteria
    result['model'] = 'N/A'

    total = len(test)
    for criterion in criteria:
        model_col = criterion + '_model'

        # Accuracy: rows where the prediction equals the true best model.
        # The comparison also guarantees that both frames share one index.
        hits = int((test[model_col] == predict[model_col]).sum())

        # (series, model) -> score of the first matching row in raw_data,
        # built once instead of filtering raw_data for every series.
        score_of = (raw_data.drop_duplicates(['naming_orig', 'model_name'])
                            .set_index(['naming_orig', 'model_name'])[criterion]
                            .to_dict())

        lost = 0
        score_sum = 0.0
        for ts, pred_model in zip(test.index, predict[model_col]):
            score = score_of.get((ts, pred_model), np.nan)
            if pd.isna(score):
                # No usable result of the predicted model for this series.
                lost += 1
            else:
                score_sum += score

        if total:
            accuracy, lost_rate, mean_score = hits / total, lost / total, score_sum / total
        else:
            # Nothing to evaluate: report NaN instead of dividing by zero.
            accuracy = lost_rate = mean_score = np.nan

        result.loc[criterion, 'lost_rate'] = lost_rate
        result.loc[criterion, 'accuracy'] = accuracy
        result.loc[criterion, 'RMSSE'] = mean_score

    # Switch to a positional index only after every criterion has been filled
    # in; doing it inside the loop breaks label lookup for later criteria.
    result.index = np.arange(len(criteria))

    return result

In [386]:
# Frame of predicted labels, indexed by 'naming_orig' like the true labels.
pred = (
    y_test.reset_index()
          .assign(RMSSE_model=y_pred[:, 0])
          .set_index('naming_orig')
)

# give_results selects the 'RMSSE_model' column from its inputs, so turn the
# true labels from a Series into a one-column frame.
y_test = y_test.to_frame(name='RMSSE_model')

In [387]:
# Classifier predictions on the train holdout, scored with the validation-split results.
exp_4 = give_results(
    raw_data=data.loc[data['split'] == 'validation'],
    test=y_test,
    predict=pred,
    criteria=['RMSSE'],
)
exp_4

Unnamed: 0,metric,model,accuracy,lost_rate,RMSSE
0,RMSSE,,0.241379,0.011494,0.751998


In [388]:
# Move 'naming_orig' from a column into the index of the test holdout.
# NOTE(review): `test_hold` is reassigned in place, so re-running this cell
# raises KeyError because 'naming_orig' is no longer a column.
test_hold = test_hold.set_index('naming_orig')


# Training features / target, rebuilt exactly as in the previous classifier
# cell; this relies on `train_80` having been re-indexed there.
X_train = train_80.drop('RMSSE_model', axis=1)
y_train = train_80['RMSSE_model']

# Evaluation features / target: the test holdout.
# NOTE(review): the feature frames keep the 'RMSSE' score column that the
# baseline cells read from these same frames - confirm it is meant to be a feature.
X_test = test_hold.drop('RMSSE_model', axis=1)
y_true = test_hold['RMSSE_model']


# Fit a fresh classifier with the same hard-coded hyper-parameters as before.
model = CatBoostClassifier(iterations=18,  depth=7, learning_rate = 0.3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Plain accuracy of the predicted labels on the test holdout.
accuracy = accuracy_score(y_true, y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")
# Unused XGBoost alternative, kept for reference:
#model = XGBClassifier(n_estimators=7, max_depth=2, objective='multi:softmax', learning_rate = 0.1)

0:	learn: 2.7321464	total: 79.9ms	remaining: 1.36s
1:	learn: 2.5996624	total: 160ms	remaining: 1.28s
2:	learn: 2.4873226	total: 293ms	remaining: 1.47s
3:	learn: 2.4121159	total: 481ms	remaining: 1.68s
4:	learn: 2.3008217	total: 679ms	remaining: 1.77s
5:	learn: 2.2261429	total: 878ms	remaining: 1.75s
6:	learn: 2.1665516	total: 1.1s	remaining: 1.73s
7:	learn: 2.1117317	total: 1.23s	remaining: 1.54s
8:	learn: 2.0606526	total: 1.32s	remaining: 1.32s
9:	learn: 2.0162702	total: 1.43s	remaining: 1.14s
10:	learn: 1.9760321	total: 1.59s	remaining: 1.01s
11:	learn: 1.9375247	total: 1.72s	remaining: 859ms
12:	learn: 1.9035135	total: 1.8s	remaining: 693ms
13:	learn: 1.8730668	total: 1.9s	remaining: 542ms
14:	learn: 1.8305909	total: 1.98s	remaining: 396ms
15:	learn: 1.7960115	total: 2.07s	remaining: 258ms
16:	learn: 1.7658623	total: 2.16s	remaining: 127ms
17:	learn: 1.7395577	total: 2.29s	remaining: 0us
Model accuracy on test set: 0.2529


In [389]:
# Frame of predicted labels, indexed by 'naming_orig' like the true labels.
pred = (
    y_true.reset_index()
          .assign(RMSSE_model=y_pred[:, 0])
          .set_index('naming_orig')
)

# give_results selects the 'RMSSE_model' column from its inputs, so turn the
# true labels from a Series into a one-column frame.
y_true = y_true.to_frame(name='RMSSE_model')

In [390]:
# Classifier predictions on the test holdout, scored with the test-split results.
exp_5 = give_results(
    raw_data=data.loc[data['split'] == 'test'],
    test=y_true,
    predict=pred,
    criteria=['RMSSE'],
)
exp_5

Unnamed: 0,metric,model,accuracy,lost_rate,RMSSE
0,RMSSE,,0.252874,0.011494,0.896407
