---

<a href="https://github.com/rraadd88/roux/blob/master/examples/roux_lib_df.ipynb"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## ⌗ Tables/Dataframes.

**Optional requirements**

In [40]:
# Lower the root logger threshold to INFO so that the logging messages
# emitted by the calls below show up in the cell outputs.
import logging
logging.getLogger(name=None).setLevel(level=logging.INFO)

## Import `r`oux-`d`ataframe attributes

In [41]:
import roux.lib.dfs as rd

## Basic data checks 

**Demo data**

In [42]:
import seaborn as sns
import numpy as np
## demo data: the iris dataset plus one column with missing values inserted
data=(
    sns.load_dataset('iris')
    .assign(
    **{
        ## insert missing values: every `sepal_length` above 5 is replaced with NaN
        ## (`Series.mask` is the vectorised form of the per-element check)
        "sepal_length with missing values":lambda df: df['sepal_length'].mask(df['sepal_length']>5),
    }
    )
)
data.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


### Missing values 

In [43]:
## check: percentage of missing values in the column (`perc=True`)
data.rd.check_na(subset=['sepal_length with missing values'],perc=True)

sepal_length with missing values    78.666667
dtype: float64

In [44]:
## check and return, for usage in chained operations:
## with `out=False` the count is logged and a dataframe comes back (hence the `.head`)
data.rd.check_na(subset=['sepal_length with missing values'],out=False).head(1)

INFO:root:sepal_length with missing values = 118


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


### Duplicates

In [45]:
## logs the share of rows duplicated on `sepal_length`; the output lists rows sharing a value
data.rd.check_dups(subset='sepal_length').head()

INFO:root:duplicate rows: 94.0% (141/150)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
38,4.4,3.0,1.3,0.2,setosa,4.4
42,4.4,3.2,1.3,0.2,setosa,4.4
8,4.4,2.9,1.4,0.2,setosa,4.4
3,4.6,3.1,1.5,0.2,setosa,4.6
47,4.6,3.2,1.4,0.2,setosa,4.6


In [46]:
# for testing: 150 rows; the 5 iris columns plus the added missing-values column
assert data.shape==(150, 6), data.shape

### Unique values

In [47]:
## number of unique values in each column of `subset`
data.rd.check_nunique(subset=['species'])

species    3
dtype: int64

In [48]:
## check and return, for usage in chained operations:
## with `out=False` the count is logged and a dataframe comes back (hence the `.head`)
data.rd.check_nunique(subset=['species'],out=False).head(1)

INFO:root:nunique: species = 3


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


## Validate

### Declarative data validations

In [49]:
## declarative checks with the `assert_*` attributes, chained directly onto the dataframe
_ = (
    data.drop_duplicates()
    .rd.assert_no_na(subset=['sepal_length'])  ## no missing values in this column
    .rd.assert_no_dups()  ## no duplicated rows
)

In [50]:
## `assert_dense` validates no missing values and no duplicates 'together', in a single call
_ = (
    data.drop_duplicates()
    ## the column that holds the inserted missing values is removed first
    .drop(columns=['sepal_length with missing values'])
    .rd.assert_dense()
)

## Logging
### Changes in the dataframe shapes

In [51]:
## `.log` accessor: runs the dataframe method and logs how the shape changed
_=data.log.drop_duplicates()

INFO:root:drop_duplicates: shape changed: (150, 6)->(149, 6), width constant


In [52]:
## the `.log` accessor within a method chain: each step logs and the chain continues
_ = (
    data.log.drop_duplicates()  ## change in shape
    .log('sepal_length')  ## shape and number of unique values
    .log('sepal_length', groupby='species')  ## number of unique values per species
)

INFO:root:drop_duplicates: shape changed: (150, 6)->(149, 6), width constant
INFO:root:shape = (149, 6) nunique: sepal_length = 35
INFO:root:shape = (149, 6) by "species", nunique "sepal_length": setosa = 15; versicolor = 21; virginica = 21


## Filter 
### Using a dictionary

In [53]:
## filter with a {column: value} dictionary; the shapes before and after are logged
out=data.rd.filter_rows({'species':'setosa'})
out.head(1)

INFO:root:(150, 6)
INFO:root:(50, 6)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


In [54]:
# for testing: 50 of the 150 rows are setosa, the columns are unchanged
assert out.shape==(50, 6), out.shape

## Merge

**Demo data**

In [55]:
## first row of each species, i.e. one row per `species` value (right-hand table for the merge)
data2=data.groupby('species').head(1)

### Validation of changes in table shapes

In [56]:
## merge through the `.log` accessor so that the change in shape is logged;
## `data2` holds one row per species, hence the `m:1` validation
out = data.log.merge(
    right=data2, how='inner', on='species',
    validate='m:1',
    validate_equal_length=True,  ## alternative option (left disabled): validate_no_decrease_length=True
)
out.head(1)

INFO:root:merge: shape changed: (150, 6)->(150, 11), length constant


Unnamed: 0,sepal_length_x,sepal_width_x,petal_length_x,petal_width_x,species,sepal_length with missing values_x,sepal_length_y,sepal_width_y,petal_length_y,petal_width_y,sepal_length with missing values_y
0,5.1,3.5,1.4,0.2,setosa,,5.1,3.5,1.4,0.2,


In [57]:
# for testing: rows unchanged; columns = 6 (left) + 6 (right) - 1 shared key (`species`)
assert out.shape==(150, 11), out.shape

## Melt

**Demo data with paired IDs and values**

In [58]:
## fix the seed: `np.random.choice` below draws from numpy's global random state, and
## `.sample(5)` is given no `random_state`, so it presumably does too
np.random.seed(1)
## NOTE: this cell rebinds `data`; re-running it on its own should fail at `rename(errors='raise')`
## because `sepal_length`/`sepal_width` are no longer present. Re-run from the top instead.
data=(
    data
    .assign(
    **{
        ## create paired ids: `id 2` is `id 1` in reverse
        "id 1":lambda df: range(len(df)),
        "id 2":lambda df: range(len(df))[::-1],
    }
    )
    .rename(
        columns={
            "sepal_length": "value 1",
            "sepal_width": "value 2",
            
        },
        errors='raise',
    )
    .loc[:,['id 1','id 2','value 1','value 2']]
).sample(5).reset_index(drop=True)
## example order: all ids from both columns, randomly permuted (`replace=False`)
order=list(set(data['id 1'].tolist()+data['id 2'].tolist()))
order=list(np.random.choice(order,len(order),replace=False))
data

Unnamed: 0,id 1,id 2,value 1,value 2
0,14,135,5.8,4.0
1,98,51,5.1,2.5
2,75,74,6.6,3.0
3,16,133,5.4,3.9
4,131,18,7.9,3.8


In [59]:
# for testing: 5 sampled rows; 2 id columns + 2 value columns
assert data.shape==(5,4), data.shape

### Tables with paired IDs

In [60]:
## stack the paired columns: `id 1`/`id 2` -> `id`, `value 1`/`value 2` -> `value`,
## with the original suffix kept in the `suffix` column (see output)
out=data.rd.melt_paired(suffixes=['1','2'])
out

Unnamed: 0,suffix,id,value
0,1,14,5.8
1,1,98,5.1
2,1,75,6.6
3,1,16,5.4
4,1,131,7.9
0,2,135,4.0
1,2,51,2.5
2,2,74,3.0
3,2,133,3.9
4,2,18,3.8


In [61]:
# for testing: 5 rows x 2 suffixes = 10 rows; columns `suffix`, `id`, `value`
assert out.shape==(10,3), out.shape

## Sort 

### Paired columns by values

In [62]:
## within each row, put the paired ids in ascending order and swap the value columns with them;
## in the output, `sorted` is True for the rows that were swapped
out=data.rd.sort_columns_by_values(['id 1','id 2'])
out

INFO:root:(equal, sorted) items: {(False, False): 2, (False, True): 3}


Unnamed: 0,equal,sorted,id 1,id 2,value 1,value 2
0,False,False,14,135,5.8,4.0
3,False,False,16,133,5.4,3.9
1,False,True,51,98,2.5,5.1
2,False,True,74,75,3.0,6.6
4,False,True,18,131,3.8,7.9


In [63]:
# for testing: the smaller id of each pair, in the row order of the output
assert out['id 1'].tolist()==[14, 16, 51, 74, 18], out['id 1'].tolist()

### Paired columns by order

In [64]:
## the example order of the ids, as generated together with the demo data
order

[133, 135, 14, 51, 131, 18, 75, 16, 98, 74]

In [65]:
## same sorting of the paired columns, but following the custom `order` of the ids;
## with `clean=True` the output carries no `equal`/`sorted` helper columns
out = data.rd.sort_columns_by_values(['id 1', 'id 2'], order=order, clean=True)
out

INFO:root:(equal, sorted) items: {(False, False): 2, (False, True): 3}


Unnamed: 0,id 1,id 2,value 1,value 2
2,75,74,6.6,3.0
4,131,18,7.9,3.8
0,135,14,4.0,5.8
1,51,98,2.5,5.1
3,133,16,3.9,5.4


In [66]:
# for testing: `id 1` after sorting the pairs by the custom order
assert out['id 1'].tolist()==[75, 131, 135, 51, 133], out['id 1'].tolist()

## Mapping between columns

**Demo data**

In [67]:
data=(
    data
    ## `data` holds only 5 rows at this point, so `head(10)` keeps all of them
    .loc[:,['id 1']].head(10)
    .assign(
    **{
        ## create the second ids by sampling `id 1` with replacement (numpy's default),
        ## i.e. not a permutation: some ids repeat, which yields the m:1 mappings
        "id 2":lambda df: np.random.choice(df['id 1'],len(df)),
    }
    )
)
data

Unnamed: 0,id 1,id 2
0,14,16
1,98,98
2,75,98
3,16,131
4,131,131


### Check counts

In [68]:
## count the mappings of each type (1:1, m:1, ...) between the two columns
out=data.rd.check_mappings(subset=['id 1','id 2'])
out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mappings count
mapping,id 1 count,id 2 count,Unnamed: 3_level_1
1:1,1,1,1
m:1,2,1,4


In [69]:
# for testing: one 1:1 mapping and four m:1 mappings
assert out['mappings count'].tolist()==[1, 4], out['mappings count'].tolist()

### Classify

In [70]:
## label every row with its mapping type; the per-id counts are added as columns
out=data.rd.classify_mappings(subset=['id 1','id 2'])
out

Unnamed: 0,Unnamed: 1,Unnamed: 2,id 1,id 2,id 1 count,id 2 count,mapping
0,0,0,14,16,1,1,1:1
1,2,2,75,98,2,1,m:1
1,3,1,98,98,2,1,m:1
2,1,3,16,131,2,1,m:1
2,4,4,131,131,2,1,m:1


In [71]:
# for testing: mapping label of each row, in the row order of the output
assert out['mapping'].tolist()==['1:1', 'm:1', 'm:1', 'm:1', 'm:1'], out['mapping'].tolist()

### Filter

In [72]:
## `clean=False` keeps the count columns in the output
out=data.rd.get_mappings(
    subset=['id 1','id 2'],
    keep="2:1", # keep 2:1 mappings (in the output: `id 1 count` is 2 and `id 2 count` is 1)
    clean=False,
)
out

INFO:root:query: shape changed: (5, 5)->(4, 5), width constant


Unnamed: 0,Unnamed: 1,Unnamed: 2,id 1,id 2,id 1 count,id 2 count,mapping
1,2,2,75,98,2,1,m:1
1,3,1,98,98,2,1,m:1
2,1,3,16,131,2,1,m:1
2,4,4,131,131,2,1,m:1


In [73]:
# for testing: 4 rows kept; 2 id columns + 2 count columns + `mapping`
assert out.shape==(4,5), out

In [74]:
## keep 1:1 mappings only; with `clean=True` the output carries no count columns
out = data.rd.get_mappings(subset=['id 1', 'id 2'], keep='1:1', clean=True)
out

INFO:root:query: shape changed: (5, 3)->(1, 3), width constant


Unnamed: 0,Unnamed: 1,Unnamed: 2,id 1,id 2,mapping
0,0,0,14,16,1:1


In [75]:
# for testing: a single row remains and it is labelled 1:1
assert out['mapping'].tolist()==['1:1'], out['mapping'].tolist()

### Validate

In [76]:
## recap of the mapping classes in the demo data, before validating
data.rd.classify_mappings(subset=['id 1','id 2'])

Unnamed: 0,Unnamed: 1,Unnamed: 2,id 1,id 2,id 1 count,id 2 count,mapping
0,0,0,14,16,1,1,1:1
1,2,2,75,98,2,1,m:1
1,3,1,98,98,2,1,m:1
2,1,3,16,131,2,1,m:1
2,4,4,131,131,2,1,m:1


In [77]:
# for testing: the mapping demo data still has its 5 rows
# (the observed length is reported on failure, like the other test asserts in this notebook)
assert len(data)==5, len(data)

In [78]:
## validate on the first row only: a single pair of ids cannot repeat, so it is a 1:1 mapping
## NOTE(review): the `Bad pipe message` text recorded after this cell looks like stray server
## output rather than a result of this call — TODO confirm and clear the cell output
data.head(1).rd.assert_1_1_mappings(subset=['id 1','id 2'])

Bad pipe message: %s [b"\x9fEJ\t1\x16\x84B\xcd\xb6\xdb\xb6\x85\x13\xac\xb4=7 C7\xb2v\xd1\xd6\xe7\x00\x93GC\xb2\x81\xd6\x167W\xa0I-\xdc\x98\xf7\x89\x03z\x95\x03;\x1d?Y\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \x82\xe8~\xd4\xea\xdc@?\xb1'<\x99\xd6\xaf\xd8"]
Bad pipe message: %s [b'\xcf\xc06\\\xec\xadqI\xd0\xad\xd9\x96\x94\xd9\x02\x0f\xac\xff\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0', b"S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\x

## Documentation
[`roux.lib.df`](https://github.com/rraadd88/roux#module-rouxlibdf)