# LENCODER

Transformations of Python dataframes and arrays that I can understand.

## Install

```
pip install git+https://github.com/sbartek/lencoder
```

In [1]:
import numpy as np
import pandas as pd

items = ['a', 'b', 'c']

## Encoder

In [2]:
from lencoder import Encoder

enc = Encoder(items).create_dicts()
encoded = enc.encode(np.array(['a', 'b']))
print(encoded)

[1 2]


One number (here `0`) is reserved for `nan`s and for labels the encoder has not seen.

In [3]:
enc.item2num

{'<NAN>': 0, 'a': 1, 'b': 2, 'c': 3}

In [4]:
enc.encode(np.array(['not existing', 'b']))

array([0, 2])

If you do not want to reserve a number for `nan`s, pass `add_nan=False`:

In [5]:
enc = Encoder(items, add_nan=False).create_dicts()
enc.item2num

{'a': 0, 'b': 1, 'c': 2}

## One-Hot-Encoding

### One-Hot-Encoding adding nans

In [6]:
from lencoder import OneHotEncoder

ohenc = OneHotEncoder(items).create_dicts()
encoded = ohenc.encode(np.array(['a', 'b']))
print(encoded)

[[False  True False False]
 [False False  True False]]


#### Encoding new items gives the same result as nan

In [7]:
ohenc.encode(np.array(['something new', 'hehehe']))

array([[ True, False, False, False],
       [ True, False, False, False]])

### One-Hot-Encoding no nans

In [8]:
ohenc = OneHotEncoder(items, add_nan=False).create_dicts()
encoded = ohenc.encode(np.array(['a', 'b']))
print(encoded)

[[ True False False]
 [False  True False]]


In [9]:
ohenc.decode(encoded)

array(['a', 'b'], dtype='<U1')

#### Save and load from disk

In [10]:
ohenc.dump_dicts('ohe_nonans_')

In [11]:
ohenc_from_saved = OneHotEncoder.create_from_saved_dicts('ohe_nonans_')

In [12]:
ohenc_from_saved.encode(np.array(['a', 'b']))

array([[ True, False, False],
       [False,  True, False]])

### One-Hot-Encoding of columns, directly

In [13]:
items_df = pd.DataFrame({'col': items})

In [14]:
from lencoder import ColumnOneHotEncoder

In [15]:
cohenc = ColumnOneHotEncoder(items=items_df['col'], colname='col', add_nan=False)
cohenc.create_dicts().encode(items_df)

Unnamed: 0,col_0,col_1,col_2
0,True,False,False
1,False,True,False
2,False,False,True


In [16]:
items_df

Unnamed: 0,col
0,a
1,b
2,c


In [17]:
cohenc = ColumnOneHotEncoder(items=items_df['col'], colname='col')
cohenc.create_dicts().encode(items_df)

Unnamed: 0,col_0,col_1,col_2,col_3
0,False,True,False,False
1,False,False,True,False
2,False,False,False,True


#### Save and load from disk

In [18]:
cohenc.dump_dicts("column_ohe_with_nans")

In [19]:
saved_cohenc = ColumnOneHotEncoder.create_from_saved_dicts("column_ohe_with_nans")

In [20]:
saved_cohenc.encode(items_df)

Unnamed: 0,col_0,col_1,col_2,col_3
0,False,True,False,False
1,False,False,True,False
2,False,False,False,True


# Value Encoding

In [21]:
df = pd.DataFrame({
            'days': sorted(list(range(4)) * 4),
            'group': sorted(['A', 'B'] * 2) * 4,
            'value1': sorted(list(range(8)) * 2),
            'value2': list(range(8)) * 2
        })
df

Unnamed: 0,days,group,value1,value2
0,0,A,0,0
1,0,A,0,1
2,0,B,1,2
3,0,B,1,3
4,1,A,2,4
5,1,A,2,5
6,1,B,3,6
7,1,B,3,7
8,2,A,4,0
9,2,A,4,1


In [22]:
from lencoder.value_encoder import ValueEncoder
venc = ValueEncoder(
            df, ['days'], ['value1', 'value2'],
            aggregations=['mean', 'sum'])
encoded_df = venc.encode()
encoded_df

Unnamed: 0,days,days:value1:mean,days:value1:sum,days:value2:mean,days:value2:sum
0,0,0.5,2,1.5,6
1,1,2.5,10,5.5,22
2,2,4.5,18,1.5,6
3,3,6.5,26,5.5,22


In [23]:
df.value_encodigs(
            ['days'], ['value1', 'value2'],
            aggregations=['mean', 'sum'])

Unnamed: 0,days,days:value1:mean,days:value1:sum,days:value2:mean,days:value2:sum
0,0,0.5,2,1.5,6
1,1,2.5,10,5.5,22
2,2,4.5,18,1.5,6
3,3,6.5,26,5.5,22


## With lags

In [24]:
import lencoder.value_encoder_with_lags

df.value_encodigs_with_lags(
            'days', ['group'], ['value1', 'value2'],
            aggregations=['mean', 'sum'], lags=[1, 2])

Unnamed: 0,days,group,days_group:value1:mean,days_group:value1:sum,days_group:value2:mean,days_group:value2:sum,days_group:value1:mean_1,days_group:value1:sum_1,days_group:value2:mean_1,days_group:value2:sum_1,days_group:value1:mean_2,days_group:value1:sum_2,days_group:value2:mean_2,days_group:value2:sum_2
0,0,A,0,0,0.5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,B,1,2,2.5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,A,2,4,4.5,9,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0
3,1,B,3,6,6.5,13,1.0,2.0,2.5,5.0,0.0,0.0,0.0,0.0
4,2,A,4,8,0.5,1,2.0,4.0,4.5,9.0,0.0,0.0,0.5,1.0
5,2,B,5,10,2.5,5,3.0,6.0,6.5,13.0,1.0,2.0,2.5,5.0
6,3,A,6,12,4.5,9,4.0,8.0,0.5,1.0,2.0,4.0,4.5,9.0
7,3,B,7,14,6.5,13,5.0,10.0,2.5,5.0,3.0,6.0,6.5,13.0
