In [270]:
!pip install catboost



In [271]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [272]:
# Read the three CSV inputs used by the baseline experiments below.
train, test, data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/full_train.csv', './data/full_test.csv', './data/full.csv')
)

# Straightforward solutions
Let's assume that the best method for every time series is the method that achieves the best result on the largest number of series.
The accuracy we get from choosing that method for every time series, together with the MAE between the true best error and this method's error, will be taken as a baseline.

In [273]:
def straightforward(raw_data, best_data, criteria, models):
    """Score the baseline that predicts one fixed model for every time series.

    For each ``(criterion, model)`` pair the following values are reported:

    * ``accuracy``  - share of rows of ``best_data`` whose
      ``'<criterion>-model'`` entry equals ``model``;
    * ``lost_rate`` - share of rows of ``best_data`` whose series has no usable
      (non-NaN) ``criterion`` value for ``model`` in ``raw_data``;
    * ``MAE``       - mean absolute difference between the model's
      ``criterion`` value in ``raw_data`` and ``best_data[criterion]``,
      averaged only over the series where the two values differ. It is ``NaN``
      when there is no such series.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the columns ``'model_name'``, ``'naming_orig'`` and one
        column per criterion.
    best_data : pandas.DataFrame
        Must contain ``'naming_orig'`` and, per criterion, the columns
        ``'<criterion>'`` and ``'<criterion>-model'``.
    criteria : sequence of str
        Names of the criterion columns to evaluate.
    models : sequence of str
        Fixed model to evaluate for each criterion; must have the same shape
        as ``criteria``.

    Returns
    -------
    pandas.DataFrame
        One row per criterion (index ``0..len(criteria)-1``) with the columns
        ``metric``, ``model``, ``accuracy``, ``lost_rate`` and ``MAE``.

    Raises
    ------
    AssertionError
        If ``criteria`` and ``models`` do not have the same shape.
    """
    assert np.array(models).shape == np.array(criteria).shape

    columns = ['metric', 'model', 'accuracy', 'lost_rate', 'MAE']
    total = len(best_data)
    rows = []
    for criterion, model in zip(criteria, models):
        # Accuracy: how often the fixed model is the recorded best model.
        hits = (best_data[criterion + '-model'] == model).sum()

        # Value of the fixed model per series (the smallest one when a series
        # has several rows). NaN values are skipped, so a series without any
        # usable value has no entry and is counted as lost below.
        model_rows = raw_data[raw_data['model_name'] == model]
        model_value_by_series = (
            model_rows.groupby('naming_orig')[criterion].min().dropna().to_dict()
        )

        mismatches = 0
        abs_error = 0.0
        lost = 0
        for ts, best_value in zip(best_data['naming_orig'], best_data[criterion]):
            if ts not in model_value_by_series:
                lost += 1
                continue
            model_value = model_value_by_series[ts]
            if model_value != best_value:
                mismatches += 1
                abs_error += np.abs(model_value - best_value)

        rows.append({
            'metric': criterion,
            'model': model,
            # Guard the divisions: an empty table or a baseline without any
            # mismatch yields NaN instead of a ZeroDivisionError.
            'accuracy': hits / total if total else np.nan,
            'lost_rate': lost / total if total else np.nan,
            'MAE': abs_error / mismatches if mismatches else np.nan,
        })

    # Build the table once, after the loop, so that every criterion gets its
    # own row and the index is a plain 0..n-1 range.
    return pd.DataFrame(rows, columns=columns)

## Experiment I
We take the most frequent model from the **train** dataset and apply it to the whole **train** dataset as a prediction.

In [274]:
# Most frequent value of train['RMSSE-model'], evaluated as a fixed prediction
# on `train` against the raw results of the 'validation' split.
best_train_model = train['RMSSE-model'].mode().iloc[0]
train_baseline = straightforward(
    raw_data=data.loc[data['split'] == 'validation'],
    best_data=train,
    criteria=['RMSSE'],
    models=[best_train_model],
)
train_baseline

Unnamed: 0,metric,model,accuracy,lost_rate,MAE
0,RMSSE,TFTTuningObjective_gl,0.150402,0.12744,0.320506


## Experiment II
We take the most frequent model from the **test** dataset and apply it to the whole **test** dataset as a prediction.

In [275]:
# Most frequent value of test['RMSSE-model'], evaluated as a fixed prediction
# on `test`.
# NOTE(review): the raw results are filtered to the 'validation' split here,
# while Experiment III pairs `test` with the 'test' split - confirm that
# 'validation' is intended for this experiment.
best_test_model = test['RMSSE-model'].mode().iloc[0]
test_baseline = straightforward(
    raw_data=data.loc[data['split'] == 'validation'],
    best_data=test,
    criteria=['RMSSE'],
    models=[best_test_model],
)
test_baseline

Unnamed: 0,metric,model,accuracy,lost_rate,MAE
0,RMSSE,Prophet,0.214696,0.0,0.195196


## Experiment III
We take the most frequent model from the **train** dataset and apply it to the whole **test** dataset as a prediction.

In [276]:
# Apply the model selected on `train` (best_train_model) as a fixed prediction
# on `test`, using the raw results of the 'test' split.
cv_baseline = straightforward(
    raw_data=data.loc[data['split'] == 'test'],
    best_data=test,
    criteria=['RMSSE'],
    models=[best_train_model],
)
cv_baseline

Unnamed: 0,metric,model,accuracy,lost_rate,MAE
0,RMSSE,TFTTuningObjective_gl,0.097589,0.12744,0.300779


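## Experiment IV
We take the best model of each time series from the **train** (validation) dataset and use it as the prediction for the same time series in the **test** dataset.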

In [277]:
def validation2test(raw_data, val_data, test_data, criteria):
    """Compare the per-series best models of two tables for each criterion.

    For each criterion the following values are reported:

    * ``accuracy``  - share of rows where ``val_data['<criterion>-model']``
      equals ``test_data['<criterion>-model']`` (the two columns are compared
      row by row, so both tables must be identically indexed);
    * ``lost_rate`` - share of rows of ``test_data`` whose series has no
      usable (non-NaN) ``criterion`` value in ``raw_data``;
    * ``MAE``       - mean absolute difference between the smallest
      ``criterion`` value of the series in ``raw_data`` and
      ``test_data[criterion]``, averaged only over the series where the two
      values differ. It is ``NaN`` when there is no such series.

    NOTE(review): ``val_data`` only influences ``accuracy``; the MAE is
    computed from ``raw_data`` and ``test_data`` alone. Confirm that this is
    the intended definition.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain ``'naming_orig'`` and one column per criterion.
    val_data, test_data : pandas.DataFrame
        Tables of equal shape containing ``'<criterion>-model'`` per
        criterion; ``test_data`` must also contain ``'naming_orig'`` and
        ``'<criterion>'``.
    criteria : sequence of str
        Names of the criterion columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per criterion (index ``0..len(criteria)-1``) with the columns
        ``metric``, ``model`` (always ``'N/A'``), ``accuracy``, ``lost_rate``
        and ``MAE``.

    Raises
    ------
    AssertionError
        If ``val_data`` and ``test_data`` do not have the same shape.
    """
    assert np.array(val_data).shape == np.array(test_data).shape

    columns = ['metric', 'model', 'accuracy', 'lost_rate', 'MAE']
    total = len(val_data)
    rows = []
    for criterion in criteria:
        model_column = criterion + '-model'
        # Accuracy: rows where both tables name the same best model.
        hits = (val_data[model_column] == test_data[model_column]).sum()

        # Smallest criterion value per series in the raw results. NaN values
        # are skipped, so a series without any usable value has no entry and
        # is counted as lost below.
        best_raw_by_series = (
            raw_data.groupby('naming_orig')[criterion].min().dropna().to_dict()
        )

        mismatches = 0
        abs_error = 0.0
        lost = 0
        for ts, metric in zip(test_data['naming_orig'], test_data[criterion]):
            if ts not in best_raw_by_series:
                lost += 1
                continue
            true_value = best_raw_by_series[ts]
            if true_value != metric:
                mismatches += 1
                # Compare against the per-series metric value, not against the
                # result table.
                abs_error += np.abs(true_value - metric)

        rows.append({
            'metric': criterion,
            'model': 'N/A',
            # Guard the divisions: an empty table or a run without any
            # mismatch yields NaN instead of a ZeroDivisionError.
            'accuracy': hits / total if total else np.nan,
            'lost_rate': lost / total if total else np.nan,
            'MAE': abs_error / mismatches if mismatches else np.nan,
        })

    # Build the table once, after the loop, and actually return it.
    return pd.DataFrame(rows, columns=columns)

In [278]:
# Compare the best models recorded in `train` with those recorded in `test`,
# using the raw results of the 'validation' split.
val2test = validation2test(
    raw_data=data.loc[data['split'] == 'validation'],
    val_data=train,
    test_data=test,
    criteria=['RMSSE'],
)
val2test

# Classifier solutions

In [279]:
# Load the training table and index it by the time-series name.
train_data = pd.read_csv("./data/train_actual.csv").set_index('naming_orig')

# Target is the 'RMSSE_model' column; every other column is used as a feature.
y = train_data['RMSSE_model']
X = train_data.drop(columns='RMSSE_model')

# Hold out 20% of the rows for evaluation (fixed seed for the split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = CatBoostClassifier(iterations=18, depth=7, learning_rate=0.3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")

0:	learn: 2.8105740	total: 65ms	remaining: 1.1s
1:	learn: 2.6663137	total: 126ms	remaining: 1.01s
2:	learn: 2.5409175	total: 193ms	remaining: 964ms
3:	learn: 2.4399706	total: 257ms	remaining: 898ms
4:	learn: 2.3339324	total: 324ms	remaining: 844ms
5:	learn: 2.2503682	total: 388ms	remaining: 777ms
6:	learn: 2.1868034	total: 452ms	remaining: 710ms
7:	learn: 2.1218856	total: 514ms	remaining: 643ms
8:	learn: 2.0614959	total: 586ms	remaining: 586ms
9:	learn: 2.0110989	total: 652ms	remaining: 522ms
10:	learn: 1.9618075	total: 717ms	remaining: 456ms
11:	learn: 1.9064416	total: 782ms	remaining: 391ms
12:	learn: 1.8683643	total: 851ms	remaining: 327ms
13:	learn: 1.8296889	total: 917ms	remaining: 262ms
14:	learn: 1.7971875	total: 980ms	remaining: 196ms
15:	learn: 1.7691655	total: 1.04s	remaining: 131ms
16:	learn: 1.7428628	total: 1.11s	remaining: 65.5ms
17:	learn: 1.7055079	total: 1.18s	remaining: 0us
Model accuracy on test set: 0.2989


In [280]:
def index2name(dataset, y):
    """Replace the positions stored in column 0 of ``y`` by dataset values.

    Each ``y[row, 0]`` is read as a row position into ``dataset`` and is
    overwritten with the value found in the first column of that row.
    ``y`` is modified in place and also returned.
    """
    for row in range(len(y)):
        y[row, 0] = dataset.iloc[y[row, 0], 0]
    return y

def give_results(raw_data, test, predict, criteria):
    """Score per-series model predictions against the recorded best models.

    For each criterion the following values are reported:

    * ``accuracy``  - share of rows where ``test['<criterion>_model']`` equals
      ``predict['<criterion>_model']`` (compared row by row, so both tables
      must be identically indexed);
    * ``lost_rate`` - share of series in ``test.index`` for which
      ``raw_data`` holds no usable (non-NaN) ``criterion`` value at all, or no
      row for the predicted model;
    * ``MAE``       - mean absolute difference between the smallest
      ``criterion`` value of the series in ``raw_data`` and the value of the
      predicted model for that series, averaged only over the series where
      the two values differ. It is ``NaN`` when there is no such series.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain ``'naming_orig'``, ``'model_name'`` and one column per
        criterion.
    test, predict : pandas.DataFrame
        Tables of equal shape, indexed by the series name, containing a
        ``'<criterion>_model'`` column per criterion.
    criteria : sequence of str
        Names of the criterion columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per criterion (index ``0..len(criteria)-1``) with the columns
        ``metric``, ``model`` (always ``'N/A'``), ``accuracy``, ``lost_rate``
        and ``MAE``.

    Raises
    ------
    AssertionError
        If ``test`` and ``predict`` do not have the same shape.
    """
    assert np.array(test).shape == np.array(predict).shape

    columns = ['metric', 'model', 'accuracy', 'lost_rate', 'MAE']
    total = len(test)
    rows = []
    for criterion in criteria:
        model_column = criterion + '_model'
        # Accuracy: rows where the predicted model is the recorded best model.
        hits = (test[model_column] == predict[model_column]).sum()

        mismatches = 0
        abs_error = 0.0
        lost = 0
        for ts in test.index:
            pred_model = predict.loc[ts, model_column]
            batch = raw_data[raw_data['naming_orig'] == ts]
            series_values = batch[criterion].dropna()
            pred_values = batch.loc[batch['model_name'] == pred_model, criterion]
            # A series is "lost" when the raw results cannot provide both the
            # best value and the value of the predicted model.
            if series_values.empty or pred_values.empty:
                lost += 1
                continue
            true_value = series_values.min()
            pred_value = pred_values.iloc[0]
            if true_value != pred_value:
                mismatches += 1
                abs_error += np.abs(true_value - pred_value)

        rows.append({
            'metric': criterion,
            'model': 'N/A',
            # Guard the divisions: an empty table or a perfect prediction
            # yields NaN instead of a ZeroDivisionError.
            'accuracy': hits / total if total else np.nan,
            'lost_rate': lost / total if total else np.nan,
            'MAE': abs_error / mismatches if mismatches else np.nan,
        })

    # Build the table once, after the loop, so that every criterion gets its
    # own row and the index is a plain 0..n-1 range.
    return pd.DataFrame(rows, columns=columns)

In [281]:
# Put the predicted labels into a frame that shares the 'naming_orig' index
# of y_test, then turn y_test itself into a one-column frame.
pred = (
    y_test.reset_index(drop=False)
    .assign(RMSSE_model=y_pred[:, 0])
    .set_index('naming_orig')
)
y_test = y_test.to_frame('RMSSE_model')

In [282]:
# Score the hold-out predictions against the raw 'validation' split results.
exp_4 = give_results(
    raw_data=data.loc[data['split'] == 'validation'],
    test=y_test,
    predict=pred,
    criteria=['RMSSE'],
)
exp_4

Unnamed: 0,metric,model,accuracy,lost_rate,MAE
0,RMSSE,,0.298851,0.0,0.088534


In [283]:
# Load the training and test tables and index both by the time-series name.
train_data = pd.read_csv("./data/train_actual.csv").set_index('naming_orig')
test_data = pd.read_csv("./data/test_actual.csv").set_index('naming_orig')

# Target is the 'RMSSE_model' column; every other column is used as a feature.
y_train = train_data['RMSSE_model']
X_train = train_data.drop(columns='RMSSE_model')

y_true = test_data['RMSSE_model']
X_test = test_data.drop(columns='RMSSE_model')

# Fit on the whole training table and predict the separate test table.
model = CatBoostClassifier(iterations=18, depth=7, learning_rate=0.3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true, y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")
# Alternative kept for reference:
#model = XGBClassifier(n_estimators=7, max_depth=2, objective='multi:softmax', learning_rate = 0.1)

0:	learn: 2.7358668	total: 72.2ms	remaining: 1.23s
1:	learn: 2.5945471	total: 142ms	remaining: 1.14s
2:	learn: 2.4815120	total: 209ms	remaining: 1.05s
3:	learn: 2.3959946	total: 279ms	remaining: 976ms
4:	learn: 2.3102687	total: 352ms	remaining: 916ms
5:	learn: 2.2251430	total: 422ms	remaining: 845ms
6:	learn: 2.1600392	total: 493ms	remaining: 775ms
7:	learn: 2.0974175	total: 565ms	remaining: 706ms
8:	learn: 2.0402684	total: 635ms	remaining: 635ms
9:	learn: 2.0056483	total: 700ms	remaining: 560ms
10:	learn: 1.9657452	total: 773ms	remaining: 492ms
11:	learn: 1.9132387	total: 843ms	remaining: 422ms
12:	learn: 1.8759844	total: 910ms	remaining: 350ms
13:	learn: 1.8402355	total: 981ms	remaining: 280ms
14:	learn: 1.8133323	total: 1.05s	remaining: 210ms
15:	learn: 1.7774503	total: 1.12s	remaining: 140ms
16:	learn: 1.7440936	total: 1.19s	remaining: 70.1ms
17:	learn: 1.7163924	total: 1.26s	remaining: 0us
Model accuracy on test set: 0.2440


In [284]:
# Put the predicted labels into a frame that shares the 'naming_orig' index
# of y_true, then turn y_true itself into a one-column frame.
pred = (
    y_true.reset_index(drop=False)
    .assign(RMSSE_model=y_pred[:, 0])
    .set_index('naming_orig')
)
y_true = y_true.to_frame('RMSSE_model')

In [285]:
# Score the test-table predictions against the raw 'test' split results.
exp_5 = give_results(
    raw_data=data.loc[data['split'] == 'test'],
    test=y_true,
    predict=pred,
    criteria=['RMSSE'],
)
exp_5

Unnamed: 0,metric,model,accuracy,lost_rate,MAE
0,RMSSE,,0.243959,0.0,0.124881
