---

<a href="https://github.com/rraadd88/roux/blob/master/examples/roux_lib_df.ipynb"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## ⌗ Tables/Dataframes.

**Optional requirements**

In [1]:
# to show logging messages
import logging
logging.getLogger().setLevel(logging.INFO)

## Import `r`oux-`d`ataframe attributes

In [2]:
import roux.lib.dfs as rd

## Basic data checks 

**Demo data**

In [3]:
import seaborn as sns
import numpy as np
data=sns.load_dataset('iris')
data=(
    data
    .assign(
    **{
        ## insert missing values 
        "sepal_length with missing values":lambda df: df['sepal_length'].apply(lambda x: np.nan if x>5 else x),
    }
    )
)
data.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


### Missing values 

In [4]:
## check
data.rd.check_na(subset=['sepal_length with missing values'])

sepal_length with missing values    118
dtype: int64

### Duplicates

In [5]:
data.rd.check_dups(subset='sepal_length')

INFO:root:duplicate rows: 94.0% (141/150)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
38,4.4,3.0,1.3,0.2,setosa,4.4
42,4.4,3.2,1.3,0.2,setosa,4.4
8,4.4,2.9,1.4,0.2,setosa,4.4
3,4.6,3.1,1.5,0.2,setosa,4.6
47,4.6,3.2,1.4,0.2,setosa,4.6
...,...,...,...,...,...,...
109,7.2,3.6,6.1,2.5,virginica,
135,7.7,3.0,6.1,2.3,virginica,
117,7.7,3.8,6.7,2.2,virginica,
118,7.7,2.6,6.9,2.3,virginica,


### Unique values

In [6]:
data.rd.check_nunique(subset=['species'])

species    3
dtype: int64

## Validate

### Declarative data validations

In [7]:
## validate no missing values and duplicates `assert_*` attributes
_=(data
   .drop_duplicates()
   
   ## validate no missing values
   .rd.assert_no_na(subset=['sepal_length'])
   ## validate no duplicates
   .rd.assert_no_dups()
  )

In [8]:
## validate no missing values and no duplicates 'together' using `assert_dense` attribute
_=(data
   .drop_duplicates()
   .drop(['sepal_length with missing values'],axis=1)
   
   .rd.assert_dense()
  )

## Logging
### Changes in the dataframe shapes

In [9]:
_=data.log.drop_duplicates()

INFO:root:drop_duplicates: shape changed: (150, 6)->(149, 6), width constant


In [10]:
## within pipes
_=(data
   .log.drop_duplicates()
   .log('sepal_length')
   .log('sepal_length',groupby='species')
  )

INFO:root:drop_duplicates: shape changed: (150, 6)->(149, 6), width constant
INFO:root:shape = (149, 6) nunique: sepal_length = 35
INFO:root:shape = (149, 6) by "species", nunique "sepal_length": setosa = 15; versicolor = 21; virginica = 21


## Filter 
### Using a dictionary

In [11]:
_=data.rd.filter_rows({'species':'setosa'})

INFO:root:(150, 6)
INFO:root:(50, 6)


## Merge

**Demo data**

In [12]:
data2=data.groupby('species').head(1)

### Validation of changes in table shapes

In [13]:
data.log.merge(
    right=data2,
    how='inner',
    on='species',
    validate='m:1',
    validate_equal_length=True,
    # validate_no_decrease_length=True,
    )
data.head(1)

INFO:root:merge: shape changed: (150, 6)->(150, 11), length constant


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


## Melt

**Demo data with paired IDs and values**

In [14]:
data=(
    data
    .assign(
    **{
        ## create paired ids
        "id 1":lambda df: range(len(df)),
        "id 2":lambda df: range(len(df))[::-1],
    }
    )
    .rename(
        columns={
            "sepal_length": "value 1",
            "sepal_width": "value 2",
            
        },
        errors='raise',
    )
    .loc[:,['id 1','id 2','value 1','value 2']]
).sample(5).reset_index(drop=True)
## example order
order=list(set(data['id 1'].tolist()+data['id 2'].tolist()))
order=list(np.random.choice(order,len(order),replace=False))
data

Unnamed: 0,id 1,id 2,value 1,value 2
0,104,45,6.5,3.0
1,122,27,7.7,2.8
2,41,108,4.5,2.3
3,135,14,7.7,3.0
4,92,57,5.8,2.6


### Tables with paired IDs

In [15]:
data.rd.melt_paired(suffixes=['1','2'])

Unnamed: 0,suffix,id,value
0,1,104,6.5
1,1,122,7.7
2,1,41,4.5
3,1,135,7.7
4,1,92,5.8
0,2,45,3.0
1,2,27,2.8
2,2,108,2.3
3,2,14,3.0
4,2,57,2.6


## Sort 

### Paired columns by values

In [16]:
data.rd.sort_columns_by_values(['id 1','id 2'])

INFO:root:(equal, sorted) items: {(False, False): 1, (False, True): 4}


Unnamed: 0,equal,sorted,id 1,id 2,value 1,value 2
2,False,False,41,108,4.5,2.3
0,False,True,45,104,3.0,6.5
1,False,True,27,122,2.8,7.7
3,False,True,14,135,3.0,7.7
4,False,True,57,92,2.6,5.8


### Paired columns by order

In [17]:
order

[27, 104, 135, 57, 122, 108, 14, 92, 41, 45]

In [18]:
data.rd.sort_columns_by_values(
    ['id 1','id 2'],
    order=order, # order of the ids
    clean=True,
)

INFO:root:(equal, sorted) items: {(False, False): 2, (False, True): 3}


Unnamed: 0,id 1,id 2,value 1,value 2
0,104,45,6.5,3.0
3,135,14,7.7,3.0
1,27,122,2.8,7.7
2,108,41,2.3,4.5
4,57,92,2.6,5.8


## Mapping between columns

**Demo data**

In [19]:
np.random.seed(0)
data=(
    data
    .loc[:,['id 1']].head(10)
    .assign(
    **{
        ## create shuffled ids
        "id 2":lambda df: np.random.choice(df['id 1'],len(df)),
    }
    )
)
data

Unnamed: 0,id 1,id 2
0,104,92
1,122,104
2,41,135
3,135,135
4,92,135


### Check counts

In [20]:
data.rd.check_mappings(subset=['id 1','id 2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mappings count
mapping,id 1 count,id 2 count,Unnamed: 3_level_1
1:1,1,1,2
m:1,3,1,3


### Classify

In [21]:
data.rd.classify_mappings(subset=['id 1','id 2'])

Unnamed: 0,id 1,id 2,id 1 count,id 2 count,mapping
0,104,92,1,1,1:1
1,122,104,1,1,1:1
2,41,135,3,1,m:1
3,135,135,3,1,m:1
4,92,135,3,1,m:1


### Filter

In [22]:
data.rd.get_mappings(
    subset=['id 1','id 2'],
    keep="3:1", # keep 3:1 mappings
    clean=False,
)

INFO:root:query: shape changed: (5, 5)->(3, 5), width constant


Unnamed: 0,id 1,id 2,id 1 count,id 2 count,mapping
2,41,135,3,1,m:1
3,135,135,3,1,m:1
4,92,135,3,1,m:1


In [23]:
data=data.rd.get_mappings(
    subset=['id 1','id 2'],
    keep='1:1', ## keep 1:1 mappings
    clean=True,
    )
data

INFO:root:query: shape changed: (5, 3)->(2, 3), width constant


Unnamed: 0,id 1,id 2,mapping
0,104,92,1:1
1,122,104,1:1


### Validate

In [24]:
data.rd.assert_1_1_mappings(subset=['id 1','id 2'])

## Documentation
[`roux.lib.df`](https://github.com/rraadd88/roux#module-rouxlibdf)