In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the preprocessed train/test tables and the three manual-feature tables.
# Forward slashes are used for every path: they resolve on Windows as well as
# on POSIX systems, and they avoid the invalid escape sequences ('\p', '\d',
# '\m') that backslash separators create inside ordinary string literals.
train = pd.read_csv('data/preprocessed_train.csv')
test = pd.read_csv('data/preprocessed_test.csv')
# index_col=0: the first CSV column becomes the row index of each feature table
features_danish = pd.read_csv('manual_features/danish_manual_features.csv', index_col=0)
features_mipt = pd.read_csv('manual_features/mipt_manual_features.csv', index_col=0)
features_nn = pd.read_csv('manual_features/nn_manual_features.csv', index_col=0)

In [5]:
# Report the column index and the number of columns of every feature table.
# The separator is carried per table because the features_nn line is printed
# without a comma after the column index.
for label, table, sep in (
    ('features_danish', features_danish, ','),
    ('features_mipt', features_mipt, ','),
    ('features_nn', features_nn, ''),
):
    print(f'{label} {table.columns}{sep} lenght: {len(table.columns)}')

features_danish Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 18
features_mipt Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'hurst', 'intervals_mean', 'lumpiness',
       'arch_lm', 'sparsity', 'stability', 'nperiods', 'seasonal_period',
       'trend', 'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 19
features_nn Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'hurst', 'intervals_mean', 'intervals_sd',
       'lumpiness', 'x_acf1', 'x_acf10', 'diff1_acf1', 'diff1_acf10',
       'diff2_acf1', 'diff2_acf10', 'arch_lm', 'x_pacf5', 'diff1x_pacf5',
       'diff2x_pacf5', 'sparsity', 'stability', 'nperiods', 'seas

In [6]:
# Columns that are NOT shared by all three feature tables: every column that
# appears in at least one table, minus the columns that appear in all of them.
# Unlike `nn - (danish | mipt)`, this also catches columns present in two
# tables but missing from the third (e.g. 'hurst', which only features_danish
# lacks), so no column name has to be added by hand.
column_sets = [set(frame.columns) for frame in (features_danish, features_mipt, features_nn)]
unique_list3 = set.union(*column_sets) - set.intersection(*column_sets)

In [7]:
# Remove every non-shared column from each feature table, in place, and log
# each removal together with the name of the table it was removed from.
frames_by_name = {
    'features_danish': features_danish,
    'features_mipt': features_mipt,
    'features_nn': features_nn,
}
for name, df in frames_by_name.items():
    for col in unique_list3:
        if col not in df.columns:
            continue  # this table never had the column
        df.drop(columns=col, inplace=True)
        print(f'Column {col} dropped from {name}')

Column hurst dropped from features_mipt
Column diff1_acf10 dropped from features_nn
Column hurst dropped from features_nn
Column diff2_acf10 dropped from features_nn
Column e_acf1 dropped from features_nn
Column intervals_sd dropped from features_nn
Column x_acf1 dropped from features_nn
Column diff1_acf1 dropped from features_nn
Column diff2_acf1 dropped from features_nn
Column x_acf10 dropped from features_nn
Column diff2x_pacf5 dropped from features_nn
Column x_pacf5 dropped from features_nn
Column diff1x_pacf5 dropped from features_nn
Column unitroot_pp dropped from features_nn
Column unitroot_kpss dropped from features_nn
Column e_acf10 dropped from features_nn


In [8]:
# Re-print the column index and column count of each feature table after the
# non-shared columns were dropped. The features_nn line is printed without a
# comma after the column index, hence the per-table separator.
for label, table, sep in (
    ('features_danish', features_danish, ','),
    ('features_mipt', features_mipt, ','),
    ('features_nn', features_nn, ''),
):
    print(f'{label} {table.columns}{sep} lenght: {len(table.columns)}')

features_danish Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 18
features_mipt Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 18
features_nn Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object') lenght: 18


In [9]:
# Stack the three feature tables row-wise into a single table, then report
# how many missing values the stacked table contains.
feature_tables = [features_danish, features_mipt, features_nn]
features = pd.concat(feature_tables, axis=0)
nan_total = np.sum(features.isna().sum())
print(f'NaNs in features {nan_total}')

NaNs in features 0


In [10]:
# Sort the stacked feature rows by index label (row axis, ascending order are
# the sort_index defaults).
# NOTE(review): the later visible cells keep working with `features`, not
# `features_sorted` - confirm whether this sorted copy is still needed.
features_sorted = features.sort_index()

In [11]:
# Drop the first column (the "Unnamed: 0" column per the original note) from
# both tables, in place. The drop is positional, so re-running this cell
# removes one more column each time.
for frame in (train, test):
    frame.drop(columns=frame.columns[0], inplace=True)

In [12]:
# Use 'naming_orig' as the row index of both tables (in place) ...
train.set_index('naming_orig', inplace=True)
test.set_index('naming_orig', inplace=True)
# ... and keep index-sorted copies for the later alignment with the features.
train_sorted = train.sort_index()
test_sorted = test.sort_index()

In [13]:
# Index labels that have a feature row but no matching row in the train /
# test table; these feature rows are removed in the next step.
# sorted() keeps the lists (and the printed output) deterministic, because
# set iteration order is not.
drop_from_train = sorted(set(features.index) - set(train_sorted.index))
print(f'Features to drop fetures list for train {drop_from_train}')
drop_from_test = sorted(set(features.index) - set(test_sorted.index))
# The label says "test" here: this line reports the test drop list.
print(f'Features to drop fetures list for test {drop_from_test}')

Features to drop fetures list for train ['danish_atm_daily_111', 'danish_atm_daily_110', 'danish_atm_daily_112']
Features to drop fetures list for train ['danish_atm_daily_111', 'danish_atm_daily_110', 'danish_atm_daily_112']


In [14]:
# Build one feature table per split by removing the rows whose index labels
# are listed in the corresponding drop list.
features_train = features.drop(index=drop_from_train)
features_test = features.drop(index=drop_from_test)

In [23]:
# Put the feature columns and the original columns side by side for each
# split; concatenating along axis=1 aligns the rows on their index labels.
train_conc, test_conc = (
    pd.concat([feature_rows, data_rows], axis=1)
    for feature_rows, data_rows in (
        (features_train, train_sorted),
        (features_test, test_sorted),
    )
)

In [24]:
# Remove the SMAPE / RMSE metric columns from both merged tables, in place.
metric_columns = ['SMAPE_model', 'RMSE', 'SMAPE', 'RMSE_model']
for merged in (train_conc, test_conc):
    merged.drop(columns=metric_columns, inplace=True)

In [26]:
# Label the row index of both merged tables as 'naming_orig'.
for merged in (train_conc, test_conc):
    merged.index.name = 'naming_orig'

In [27]:
# Check for NaNs: print the total number of missing values in each merged table.
for split, merged in (('train', train_conc), ('test', test_conc)):
    print(f'NaNs in {split} set {np.sum(merged.isna().sum())}')

NaNs in train set 0
NaNs in test set 0


In [28]:
# Write both merged tables to CSV (test first, then train), index included.
for path, merged in (
    ('data/ready_test.csv', test_conc),
    ('data/ready_train.csv', train_conc),
):
    merged.to_csv(path)