In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw train/test tables and the three manually engineered feature
# tables (one per data source). The feature tables use their first CSV column
# as the row index.
# All paths use forward slashes: backslash separators only resolve on Windows
# and depend on unrecognised escape sequences inside a normal string literal,
# which Python flags with a warning.
train = pd.read_csv('data/train_dataset.csv')
test = pd.read_csv('data/test_dataset.csv')
features_danish = pd.read_csv('manual_features/danish_manual_features.csv', index_col=0)
features_mipt = pd.read_csv('manual_features/mipt_manual_features.csv', index_col=0)
features_nn = pd.read_csv('manual_features/nn_manual_features.csv', index_col=0)

In [3]:
# Report the column labels and the column count of every feature table
# before the tables are aligned to a common schema.
n_danish = features_danish.shape[1]
n_mipt = features_mipt.shape[1]
n_nn = features_nn.shape[1]
print(f'features_danish {features_danish.columns}, lenght: {n_danish}')
print(f'features_mipt {features_mipt.columns}, lenght: {n_mipt}')
print(f'features_nn {features_nn.columns} lenght: {n_nn}')

features_danish Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 18
features_mipt Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'hurst', 'intervals_mean', 'lumpiness',
       'arch_lm', 'sparsity', 'stability', 'nperiods', 'seasonal_period',
       'trend', 'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 19
features_nn Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'hurst', 'intervals_mean', 'intervals_sd',
       'lumpiness', 'x_acf1', 'x_acf10', 'diff1_acf1', 'diff1_acf10',
       'diff2_acf1', 'diff2_acf10', 'arch_lm', 'x_pacf5', 'diff1x_pacf5',
       'diff2x_pacf5', 'sparsity', 'stability', 'nperiods', 'seas

In [4]:
# Columns that are NOT shared by all three feature tables.
# "Union minus intersection" also catches columns that appear in some tables
# but not in all of them (e.g. 'hurst'), so no column has to be added by hand.
# Despite its name, `unique_list3` is a set; the name is kept because the
# drop loop in the next cell reads it.
column_sets = [
    set(frame.columns)
    for frame in (features_danish, features_mipt, features_nn)
]
unique_list3 = set.union(*column_sets) - set.intersection(*column_sets)

In [5]:
# Remove every non-shared column from each feature table (in place) and
# log which column was removed from which table.
named_frames = {
    'features_danish': features_danish,
    'features_mipt': features_mipt,
    'features_nn': features_nn,
}
for name, df in named_frames.items():
    for col in unique_list3:
        if col not in df.columns:
            # Nothing to remove from this table for this column.
            continue
        df.drop(columns=[col], inplace=True)
        print(f'Column {col} dropped from {name}')

Column hurst dropped from features_mipt
Column x_acf10 dropped from features_nn
Column e_acf10 dropped from features_nn
Column x_acf1 dropped from features_nn
Column diff1_acf1 dropped from features_nn
Column unitroot_pp dropped from features_nn
Column hurst dropped from features_nn
Column diff2_acf10 dropped from features_nn
Column e_acf1 dropped from features_nn
Column diff2x_pacf5 dropped from features_nn
Column intervals_sd dropped from features_nn
Column diff1_acf10 dropped from features_nn
Column diff1x_pacf5 dropped from features_nn
Column unitroot_kpss dropped from features_nn
Column x_pacf5 dropped from features_nn
Column diff2_acf1 dropped from features_nn


In [6]:
# Report the column labels and the column count of every feature table
# again, after the non-shared columns were dropped.
n_danish = features_danish.shape[1]
n_mipt = features_mipt.shape[1]
n_nn = features_nn.shape[1]
print(f'features_danish {features_danish.columns}, lenght: {n_danish}')
print(f'features_mipt {features_mipt.columns}, lenght: {n_mipt}')
print(f'features_nn {features_nn.columns} lenght: {n_nn}')

features_danish Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 18
features_mipt Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object'), lenght: 18
features_nn Index(['alpha', 'beta', 'count_entropy', 'crossing_points', 'entropy',
       'flat_spots', 'frequency', 'intervals_mean', 'lumpiness', 'arch_lm',
       'sparsity', 'stability', 'nperiods', 'seasonal_period', 'trend',
       'spike', 'linearity', 'curvature'],
      dtype='object') lenght: 18


In [7]:
# Stack the three aligned feature tables row-wise into a single table and
# report the total number of missing values in it.
features = pd.concat((features_danish, features_mipt, features_nn), axis='index')
nan_total = features.isna().sum().sum()
print(f'NaNs in features {nan_total}')

NaNs in features 0


In [8]:
# Copy of the feature table with rows ordered by index label
# (sort_index defaults: row axis, ascending).
features_sorted = features.sort_index()

In [9]:
# Drop the leading column of each raw table (the "Unnamed: 0" column).
# NOTE: the drop is positional and in place, so running this cell a second
# time removes the next column as well.
for raw_frame in (train, test):
    raw_frame.drop(columns=raw_frame.columns[0], inplace=True)

In [10]:
# Use 'naming_orig' as the row index of both raw tables, then build
# index-sorted copies (ascending) for the merge with the feature tables.
train = train.set_index('naming_orig')
test = test.set_index('naming_orig')
train_sorted = train.sort_index()
test_sorted = test.sort_index()

In [11]:
# Preview the combined (unsorted) feature table.
features

Unnamed: 0,alpha,beta,count_entropy,crossing_points,entropy,flat_spots,frequency,intervals_mean,lumpiness,arch_lm,sparsity,stability,nperiods,seasonal_period,trend,spike,linearity,curvature
danish_atm_daily_0,1.419392e-01,0.005905,-171289.625528,115.0,0.832942,6.0,1.0,59.400000,0.003311,0.271016,0.026230,0.008920,0.0,1.0,0.048930,2.149307e-07,-0.132583,0.238538
danish_atm_daily_1,1.389748e-01,0.017725,-186506.124217,67.0,0.838796,108.0,1.0,100.666667,0.001346,0.000178,0.009836,0.001621,0.0,1.0,0.050821,8.532189e-08,0.174946,-0.168956
danish_atm_daily_10,3.349371e-01,0.000108,-126910.211280,101.0,0.865232,10.0,1.0,31.111111,0.008169,0.436217,0.081967,0.031092,0.0,1.0,0.409299,2.016390e-07,-0.086137,-0.062198
danish_atm_daily_100,1.490116e-08,0.000000,-47743.428902,114.0,0.826302,6.0,1.0,32.222222,0.006076,0.231757,0.049180,0.010172,0.0,1.0,0.043788,3.614013e-07,-0.127912,0.437014
danish_atm_daily_101,8.007617e-02,0.000004,-84083.905902,53.0,0.782084,126.0,1.0,49.500000,0.003420,0.207726,0.026230,0.004782,0.0,1.0,0.055317,2.114193e-07,0.109905,0.301157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nn5_95,4.765327e-02,0.000000,-49099.855304,277.0,0.705937,4.0,1.0,50.857143,0.003220,0.122940,0.025992,0.004174,0.0,1.0,0.016721,4.094732e-08,0.042712,0.128781
nn5_96,1.450796e-01,0.000000,-27296.508481,246.0,0.699711,6.0,1.0,41.882353,0.002516,0.057562,0.025992,0.003063,0.0,1.0,0.007398,4.184013e-08,0.078892,0.067768
nn5_97,2.427338e-02,0.000000,-46489.182478,298.0,0.724865,5.0,1.0,59.666667,0.002378,0.090045,0.020520,0.003044,0.0,1.0,0.021053,3.280696e-08,-0.073337,0.130115
nn5_98,6.303816e-02,0.000000,-27547.457264,233.0,0.772084,5.0,1.0,47.533333,0.002795,0.064954,0.024624,0.003550,0.0,1.0,0.011235,3.969010e-08,-0.013026,0.072570


In [12]:
# Index labels that exist in the feature table but not in the train / test
# tables; the matching feature rows are removed before the column-wise merge.
# Index.difference returns unique labels in a deterministic order, unlike
# list(set(...)), whose order is arbitrary.
drop_from_train = features.index.difference(train_sorted.index).tolist()
print(f'Features to drop fetures list for train {drop_from_train}')
drop_from_test = features.index.difference(test_sorted.index).tolist()
# This message describes the test split (it was previously labelled "train").
print(f'Features to drop fetures list for test {drop_from_test}')

Features to drop fetures list for train ['mipt_alpha_366', 'danish_atm_daily_110', 'danish_atm_daily_111', 'danish_atm_daily_112', 'mipt_alpha_368']
Features to drop fetures list for train ['danish_atm_daily_110', 'danish_atm_daily_111', 'danish_atm_daily_112']


In [13]:
# Per-split feature tables: remove the rows whose labels are missing from
# the corresponding train / test table.
features_train = features.drop(index=drop_from_train)
features_test = features.drop(index=drop_from_test)

In [14]:
# Join features and raw data side by side; pandas aligns the rows on the
# index labels of both tables.
train_conc = pd.concat((features_train, train_sorted), axis='columns')
test_conc = pd.concat((features_test, test_sorted), axis='columns')

In [15]:
# Checking NaNs: any missing value here would mean the row labels of the
# feature table and the raw table did not line up during the merge.
train_nan_total = train_conc.isna().sum().sum()
test_nan_total = test_conc.isna().sum().sum()
print(f'NaNs in train set {train_nan_total}')
print(f'NaNs in test set {test_nan_total}')

NaNs in train set 0
NaNs in test set 0


In [16]:
# Saving files: persist the merged tables (features + raw columns).
# Writing the raw `train` / `test` frames here would discard the feature
# merge performed above, so the merged `*_conc` frames are written instead.
# The key column is labelled explicitly so the CSV header does not depend on
# whether the merge kept the index name.
test_conc.to_csv('data/test.csv', index_label='naming_orig')
train_conc.to_csv('data/train.csv', index_label='naming_orig')