In [None]:
#| hide
# run the following line to clean up README.md file every time you change this index (otherwise dataframes are not rendered correctly)
# sed -i "/<style scoped>/,/<\/style>/d" README.md 

# pandasmore

> Extends pandas with common functions used in finance and economics research

The full documentation site is [here](https://ionmihai.github.io/pandasmore/), and the GitHub page is [here](https://github.com/ionmihai/pandasmore).

Here is a short description of some of the main functions (more details below and in the [documentation](https://ionmihai.github.io/pandasmore/core.html)):

- `setup_tseries`: cleans up dates and sets them as the index
- `setup_panel`: cleans up dates and panel IDs and sets them as the index (panel ID, period date)
- `lag`: robust lagging that accounts for panel structure, unsorted or duplicate dates, and gaps in the time series

## Install

```sh
pip install pandasmore
```

## How to use

First, we set up an example dataset to showcase the functions in this module.

In [None]:
import pandas as pd
import numpy as np
import pandasmore as pdr

In [None]:
# Build a small, deliberately messy panel to exercise the cleaning functions:
# 3 firm ids (one missing) x 5 dates (one missing, '2010-02' duplicated, and
# no '2010-03', which leaves a gap) = 15 rows.
# A fixed seed keeps columns A and B identical on every Restart & Run All,
# so the outputs shown in the README can be reproduced.
rng = np.random.default_rng(42)
raw = pd.DataFrame(rng.random((15, 2)),
                    columns=list('AB'),
                    index=pd.MultiIndex.from_product(
                        [[1, 2, np.nan], [np.nan, '2010-01', '2010-02', '2010-02', '2010-04']],
                        names=['firm_id', 'date'])
                  ).reset_index()
raw

Unnamed: 0,firm_id,date,A,B
0,1.0,,0.943132,0.981995
1,1.0,2010-01,0.328816,0.473158
2,1.0,2010-02,0.177921,0.835497
3,1.0,2010-02,0.928199,0.743025
4,1.0,2010-04,0.857208,0.742693
5,2.0,,0.14747,0.357477
6,2.0,2010-01,0.172676,0.978518
7,2.0,2010-02,0.391758,0.574734
8,2.0,2010-02,0.824737,0.86334
9,2.0,2010-04,0.847638,0.293925


In [None]:
# Time-series example: keep only firm 1 and let setup_tseries clean the
# 'date' strings (given as year-month) and use them as the index.
df = pdr.setup_tseries(
    raw.query('firm_id==1'),
    freq='M',
    time_var='date',
    time_var_format="%Y-%m",
)
df

Unnamed: 0_level_0,date,dtdate,firm_id,A,B
Mdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01,2010-01,2010-01-01,1.0,0.328816,0.473158
2010-02,2010-02,2010-02-01,1.0,0.928199,0.743025
2010-04,2010-04,2010-04-01,1.0,0.857208,0.742693


In [None]:
# Panel example: same date clean-up as above, but on the full frame, with
# 'firm_id' as the panel identifier; the result is indexed by (firm_id, Mdate).
df = pdr.setup_panel(
    raw,
    freq='M',
    time_var='date',
    time_var_format="%Y-%m",
    panel_ids='firm_id',
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,A,B
firm_id,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2010-01,2010-01,2010-01-01,0.328816,0.473158
1,2010-02,2010-02,2010-02-01,0.928199,0.743025
1,2010-04,2010-04,2010-04-01,0.857208,0.742693
2,2010-01,2010-01,2010-01-01,0.172676,0.978518
2,2010-02,2010-02,2010-02-01,0.824737,0.86334
2,2010-04,2010-04,2010-04-01,0.847638,0.293925


In [None]:
# Lag column A using the panel index of `df`. Per the function summary above,
# `lag` accounts for panel structure and gaps in the dates; here 2010-03 is
# absent, so the 2010-04 rows have no previous-period value.
col_a = df['A']
pdr.lag(col_a)

firm_id  Mdate  
1        2010-01         NaN
         2010-02    0.328816
         2010-04         NaN
2        2010-01         NaN
         2010-02    0.172676
         2010-04         NaN
Name: A_lag1, dtype: float64