# Reference Dataframe

A simple one index dataframe

In [6]:
import pandas as pd
d = {'a': [1, 2, 3], 'b': [10, 20, 30]}
df = pd.DataFrame(d, index=(['cat', 'mouse', 'elephant']))
df

Unnamed: 0,a,b
cat,1,10
mouse,2,20
elephant,3,30


A bit more complicated one with repeat rows for an index

In [14]:
import pandas as pd
import numpy as np   
idx = ['cat', 'dog', 'mouse']
cols = ['a', 'b']
n_rows = 2
idx = idx*n_rows
df = pd.DataFrame(3+2.5*np.random.randn(len(idx), len(cols)), 
                columns=cols, 
                    index=pd.Series(idx).sample(len(idx))).abs().astype(int)
df

Unnamed: 0,a,b
dog,3,5
mouse,3,4
cat,1,1
cat,0,5
dog,0,1
mouse,0,7


In [1]:
import pandas as pd
import numpy as np
import random
random.seed(1234)

i = ['dog', 'cat', 'rabbit', 'elephant'] * 20

type = pd.Series((random.choice(['X', 'Y']) for _ in range(len(i))), name='type')
dates = pd.Series(pd.date_range(start=pd.to_datetime('now', utc=True)-pd.DateOffset(len(i)-1), 
            end=pd.to_datetime('now', utc=True)), name='date').dt.date

df = pd.DataFrame(np.random.randn(len(i), 2), index=i, \
            columns=list('AB')).rename_axis('animal').reset_index()
            
df = pd.concat([dates, type, df], axis=1)
df

Unnamed: 0,date,type,animal,A,B
0,2022-03-15,Y,dog,0.730807,-0.255509
1,2022-03-16,X,cat,-0.550745,1.592094
2,2022-03-17,X,rabbit,-0.566575,-0.489147
3,2022-03-18,X,elephant,1.877894,-1.318074
4,2022-03-19,X,dog,1.277121,-0.385321
...,...,...,...,...,...
75,2022-05-29,Y,elephant,-0.929735,1.557274
76,2022-05-30,X,dog,2.422177,0.479677
77,2022-05-31,Y,cat,-0.135682,0.389643
78,2022-06-01,Y,rabbit,1.203847,-0.750442


Make a small, but complex dataframe

In [5]:
d = '''          date type    animal         A         B        flag
0   2022-03-15    Y       dog -1.558471 -0.592981   True
1   2022-03-16    X       cat -1.301200 -0.735802   False
2   2022-03-17    X    rabbit  nan  1.712587        False
3   2022-03-18    X  elephant -0.819591  1.745640   True
4   2022-03-19    X       dog -0.528120 -0.583105   True
5   2022-03-19    Y       dog nan         nan       False'''

import pandas as pd  
from io import StringIO

df = pd.read_csv(StringIO(d), sep='\s+')
df = df.assign(date = pd.to_datetime(df.date, infer_datetime_format=True)) # convert date to datetime format
df

Unnamed: 0,date,type,animal,A,B,flag
0,2022-03-15,Y,dog,-1.558471,-0.592981,True
1,2022-03-16,X,cat,-1.3012,-0.735802,False
2,2022-03-17,X,rabbit,,1.712587,False
3,2022-03-18,X,elephant,-0.819591,1.74564,True
4,2022-03-19,X,dog,-0.52812,-0.583105,True
5,2022-03-19,Y,dog,,,False


# Series
## Random date series between two dates

In [None]:
# Generate list of random dates
start = datetime.date(2018,1,1)
end = datetime.date(2018,12,31)
dates = [start + (end-start)* random.random() for i in range(6)]
dates

## Random int Series

In [None]:
pd.Series(np.random.randint(2, high=10, size=5), name='Value')

## Random Letter Series

In [None]:
import string, random
pd.Series(random.choice(string.ascii_uppercase) for _ in range(100))

## Random letter + number series
(for e.g. machine number generation)

In [None]:
['N'+ str(random.randint(i, i+5)) for i in range(6)]

## Sample dataframe and dictionary

In [None]:
import pandas as pd
import numpy as np

i = ['dog', 'cat', 'rabbit', 'elephant'] * 3

df = pd.DataFrame(np.random.randn(len(i), 2), index=i, columns=list('AB'))
df

In [None]:
b_dict = {'elephant': 2.0, 'dog': 5.0}
b_dict

# Generate fake OHLC data
One can use the following code to generate fake daily ohlc data

In [None]:
# Generate fake stock data
import pandas as pd
import numpy as np
import ta

np.random.seed(10)

# generate ohlc data
days = 500
dates = pd.date_range('20180101', periods=days)

# . generate close
steps = np.random.normal(loc=0, scale=1.0, size=days)
steps[0]=0 # set the first step to 0
c = 72 + np.cumsum(steps) # simulate daily close starting from 72

# . generate opens

# . change between close and open
c_o_change = np.random.uniform(low=-0.05, high=0.05, size=days-1)

o=c[:-1]*c_o_change+c[:-1] # open
o=np.append(72.3, o)

h_change = np.random.normal(loc=0, scale=0.008, size=days)
l_change = np.random.uniform(low=-0.008, high=-0.01, size=days)

# . high and low
h=pd.DataFrame([o, c]).max()*(1+abs(h_change))
l=pd.DataFrame([o, c]).min()*(1-abs(l_change))

# . generate the bollinger bands
bb=ta.volatility.BollingerBands(close=pd.Series(c), n=20, ndev=2)

bb_ma=bb.bollinger_mavg()
bb_hi=bb.bollinger_hband()
bb_lo=bb.bollinger_lband()

volume = abs(np.random.normal(loc=500000, scale=499999, size=days).astype('int'))

df=pd.DataFrame({'date': dates, 'open': o, 'high': h, 'low': l, 'close': c, 'volume': volume,
                 'bb_ma': bb_ma, 'bb_hi': bb_hi, 'bb_lo': bb_lo})

df        