In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the train and test CSVs (train first, then test) from the data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
# Array later matched against test.index to filter rows; presumably the index
# labels of synthetic test samples, judging by the file name -- TODO confirm.
fake_ids = np.load('../data/synthetic_samples_indexes.npy')

In [3]:
# Keep only the test rows whose index label does NOT appear in fake_ids.
good_test = test.loc[
    np.logical_not(test.index.isin(fake_ids))
]

In [4]:
# Row-wise concatenation of train without its first two columns and the kept
# test rows without their first column (presumably id/target columns -- TODO
# confirm against the CSV headers). Original index labels are preserved, so
# labels may repeat between the two parts.
all_data = pd.concat(
    [
        train.iloc[:, 2:],
        good_test.iloc[:, 1:],
    ]
)

### Value counts on the unmodified data

In [5]:
# Object array with one (initially None) slot per feature column. The length is
# taken from the same column selection the fill loop iterates over, rather than
# a hard-coded 200, so a different feature count neither overflows the array
# nor leaves trailing None entries.
vc_dicts = np.full(len(train.columns[2:]), None, dtype=object)

In [6]:
# Slot i receives a {value: occurrence count} dict for the i-th feature column,
# counted over all_data (train plus the kept test rows).
for i, c in enumerate(train.columns[2:]):
    vc_dicts[i] = (
        all_data[c]
        .value_counts()
        .astype(np.int32)
        .to_dict()
    )

In [13]:
# Persist the per-feature count dicts. vc_dicts is an object array, so NumPy
# pickles its contents; reading it back requires np.load(..., allow_pickle=True).
np.save(
    file='../data/value_counters.npy',
    arr=vc_dicts,
)

### Count values across all features

In [34]:
# Pool every feature column into one long Series and count how often each
# distinct value occurs across the whole table.
all_counts = (
    all_data
    .stack()
    .value_counts()
)
# Saved as a one-element array wrapping the single {value: count} dict.
np.save(
    file='../data/value_counters_all.npy',
    arr=np.array([all_counts.to_dict()]),
)

### Value counts after `np.round` to 3 decimal places

In [61]:
# Copy of all_data with every value rounded to 3 decimal places.
all_data_rounded_3 = all_data.round(decimals=3)

In [62]:
# Object array with one (initially None) slot per feature column, for the
# counts of the 3-decimal rounded data. Sized from the column selection the
# fill loop iterates over instead of a hard-coded 200, so the container always
# matches the actual number of features.
vc_dicts_r_3 = np.full(len(train.columns[2:]), None, dtype=object)

In [63]:
# Slot i receives a {rounded value: occurrence count} dict for the i-th feature
# column of the 3-decimal rounded table.
for i, c in enumerate(train.columns[2:]):
    vc_dicts_r_3[i] = (
        all_data_rounded_3[c]
        .value_counts()
        .astype(np.int32)
        .to_dict()
    )

In [64]:
# Persist the per-feature count dicts computed on the 3-decimal rounded data.
np.save(
    file='../data/value_counters_rounded_3.npy',
    arr=vc_dicts_r_3,
)

### Value counts after `np.round` to 2 decimal places

In [34]:
# Per-feature value counts on a copy of all_data rounded to 2 decimal places.
all_data_rounded_2 = np.round(all_data, decimals=2)
# One (initially None) slot per feature column; sized from the column selection
# the loop below iterates over instead of a hard-coded 200, so the container
# always matches the actual number of features.
vc_dicts_r_2 = np.full(len(train.columns[2:]), None, dtype=object)

# Slot i receives a {rounded value: occurrence count} dict for the i-th feature.
for i, c in enumerate(train[train.columns[2:]]):
    vc_dicts_r_2[i] = all_data_rounded_2[c].value_counts().astype(np.int32).to_dict()

# Object array of dicts: NumPy pickles it on save.
np.save('../data/value_counters_rounded_2.npy', vc_dicts_r_2)

### Value counts for neighbour values 

In [67]:
# Turn the first feature's {value: count} dict into a one-column frame:
# the dict keys become the index, the counts go into the 'counts' column.
cnt_df = pd.DataFrame.from_dict(
    data=vc_dicts[0],
    orient='index',
    columns=['counts'],
)

In [68]:
# Preview the ten smallest values (index sorted ascending) with their counts.
cnt_df.sort_index().iloc[:10]

Unnamed: 0,counts
0.1887,1
0.4084,1
0.4528,1
0.5979,1
0.8935,1
0.922,1
0.9302,2
0.9817,1
1.01,1
1.0147,1


In [69]:
# Right neighbour: after sorting by value, each row shows the count of the
# next-larger value; the last row has no right neighbour and is filled with 1.
# For the left neighbour, shift the other way: shift(1, fill_value=1).
cnt_df.sort_index().shift(periods=-1, fill_value=1).iloc[:10]

Unnamed: 0,counts
0.1887,1
0.4084,1
0.4528,1
0.5979,1
0.8935,1
0.922,2
0.9302,1
0.9817,1
1.01,1
1.0147,1


### Rank generator 

In [25]:
# Object array with one (initially None) slot per feature column, to hold the
# value -> rank mappings. Sized from the column selection the fill loop iterates
# over instead of a hard-coded 200, so the container always matches the actual
# number of features.
ranks_dicts = np.full(len(train.columns[2:]), None, dtype=object)

In [26]:
# Slot i receives a {value: int(rank * 10)} dict for the i-th feature column.
# Both .unique() calls return entries in order of first appearance, and equal
# values share one rank (pandas' default tie handling averages ranks), so the
# two sequences pair up position by position. The x10 scaling is applied
# before int() truncates the rank.
for i, c in enumerate(train.columns[2:]):
    ranks_dicts[i] = {
        float(value): int(scaled_rank)
        for value, scaled_rank in zip(
            all_data[c].unique(),
            all_data[c].rank().unique() * 10,
        )
    }

In [27]:
# Persist the per-feature value -> scaled-rank dicts (object array, so pickled).
np.save(
    file='../data/ranks_mapping.npy',
    arr=ranks_dicts,
)