---

<a href="https://github.com/rraadd88/roux/blob/master/examples/roux_lib_df.ipynb"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

## ⌗ Tables/Dataframes.

**Optional requirements**

In [1]:
# raise the root logger to INFO so that logged messages are shown in the notebook
import logging
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

## Import `r`oux-`d`ataframe attributes

In [2]:
import roux.lib.dfs as rd

## Basic data checks 

**Demo data**

In [3]:
import seaborn as sns
import numpy as np
## Demo data: the iris dataset plus a copy of `sepal_length` in which
## every value greater than 5 is replaced by a missing value (NaN).
data=(
    sns.load_dataset('iris')
    .assign(
        **{
            ## insert missing values (vectorized mask instead of an element-wise `apply`)
            "sepal_length with missing values":lambda df: df['sepal_length'].mask(df['sepal_length']>5),
        }
    )
)
data.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


### Missing values 

In [4]:
## check: missing values in the selected column, reported as a percentage
data.rd.check_na(
    subset=['sepal_length with missing values'],
    perc=True,
)

sepal_length with missing values    78.666667
dtype: float64

In [7]:
## check and return the dataframe, for usage in chained operations
(
    data
    .rd.check_na(subset=['sepal_length with missing values'],out=False)
    .head(1)
)

INFO:root:sepal_length with missing values = 118


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


### Duplicates

In [9]:
## log the share of rows with a repeated `sepal_length` and preview those rows
(
    data
    .rd.check_dups(subset='sepal_length')
    .head()
)

INFO:root:duplicate rows: 94.0% (141/150)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
38,4.4,3.0,1.3,0.2,setosa,4.4
42,4.4,3.2,1.3,0.2,setosa,4.4
8,4.4,2.9,1.4,0.2,setosa,4.4
3,4.6,3.1,1.5,0.2,setosa,4.6
47,4.6,3.2,1.4,0.2,setosa,4.6


### Unique values

In [10]:
## count the unique values in the selected column
data.rd.check_nunique(
    subset=['species'],
)

species    3
dtype: int64

In [12]:
## check and return the dataframe, for usage in chained operations
(
    data
    .rd.check_nunique(subset=['species'],out=False)
    .head(1)
)

INFO:root:nunique: species = 3


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length with missing values
0,5.1,3.5,1.4,0.2,setosa,


## Validate

### Declarative data validations

In [None]:
## validate, using the `assert_*` attributes, that there are no missing values and no duplicates
_ = (
    data
    .drop_duplicates()
    ## validate no missing values
    .rd.assert_no_na(subset=['sepal_length'])
    ## validate no duplicates
    .rd.assert_no_dups()
)

In [None]:
## validate no missing values and no duplicates 'together' using the `assert_dense` attribute
## (the column that deliberately contains missing values is dropped first)
_ = (
    data
    .drop_duplicates()
    .drop(columns=['sepal_length with missing values'])
    .rd.assert_dense()
)

## Logging
### Changes in the dataframe shapes

In [None]:
## drop duplicates through the `log` accessor; the returned dataframe is discarded
_ = (
    data
    .log.drop_duplicates()
)

In [None]:
## the `log` accessor used inside a method chain
_ = (
    data
    .log.drop_duplicates()
    .log('sepal_length')
    .log('sepal_length', groupby='species')
)

## Filter 
### Using a dictionary

In [None]:
## filter rows with a {column: value} dictionary
_ = data.rd.filter_rows(
    dict(species='setosa'),
)

## Merge

**Demo data**

In [None]:
## second table for the merge demo: the first row of each species
data2 = (
    data
    .groupby('species')
    .head(1)
)

### Validation of changes in table shapes

In [None]:
## merge through the `log` accessor and preview the merged table itself
## (`data` is left unchanged; displaying `data.head(1)` here would show the unmerged table)
(
    data
    .log.merge(
        right=data2,
        how='inner',
        on='species',
        validate='m:1',
        validate_equal_length=True,
        # validate_no_decrease_length=True,
    )
    .head(1)
)

## Melt

**Demo data with paired IDs and values**

In [None]:
## NOTE: this cell overwrites `data`; re-running it without re-running the cells above
## raises, because `sepal_length` has already been renamed (`errors='raise'`).
## fix the seed so that the sampled rows and the shuffled order are reproducible
np.random.seed(0)
data=(
    data
    .assign(
        **{
            ## create paired ids
            "id 1":lambda df: range(len(df)),
            "id 2":lambda df: range(len(df))[::-1],
        }
    )
    .rename(
        columns={
            "sepal_length": "value 1",
            "sepal_width": "value 2",
        },
        errors='raise',
    )
    .loc[:,['id 1','id 2','value 1','value 2']]
    ## explicit `random_state` so the same 5 rows are drawn on every run
    .sample(5,random_state=0)
    .reset_index(drop=True)
)
## example order: the ids present in the sample, sorted for a deterministic starting point, then shuffled
order=sorted(set(data['id 1'].tolist()+data['id 2'].tolist()))
order=list(np.random.choice(order,len(order),replace=False))
data

### Tables with paired IDs

In [None]:
## the suffixes '1' and '2' match the endings of the demo columns ('id 1'/'id 2', 'value 1'/'value 2')
data.rd.melt_paired(
    suffixes=['1','2'],
)

## Sort 

### Paired columns by values

In [None]:
## sort the paired id columns by their values
data.rd.sort_columns_by_values(
    ['id 1','id 2'],
)

### Paired columns by order

In [None]:
## display the shuffled list of ids created together with the paired demo data
order

In [None]:
## sort the paired id columns according to the given order of the ids
data.rd.sort_columns_by_values(
    ['id 1', 'id 2'],
    order=order,
    clean=True,
)

## Mapping between columns

**Demo data**

In [None]:
## seed numpy so that the randomly drawn ids are reproducible
np.random.seed(0)
data = (
    data
    .loc[:, ['id 1']]
    .head(10)
    .assign(
        **{
            ## second id column: ids drawn at random from the first id column
            "id 2": lambda df: np.random.choice(df['id 1'], size=len(df)),
        }
    )
)
data

### Check counts

In [None]:
## check the mappings between the two id columns
data.rd.check_mappings(
    subset=['id 1','id 2'],
)

### Classify

In [None]:
## classify the mappings between the two id columns
data.rd.classify_mappings(
    subset=['id 1','id 2'],
)

### Filter

In [None]:
## keep 3:1 mappings
data.rd.get_mappings(
    subset=['id 1', 'id 2'],
    keep="3:1",
    clean=False,
)

In [None]:
## keep 1:1 mappings; `data` is overwritten with the filtered table
data = data.rd.get_mappings(
    subset=['id 1', 'id 2'],
    keep='1:1',
    clean=True,
)
data

### Validate

In [None]:
## validate the 1:1 mappings between the two id columns
data.rd.assert_1_1_mappings(
    subset=['id 1','id 2'],
)

## Documentation
[`roux.lib.df`](https://github.com/rraadd88/roux#module-rouxlibdf)