# Setup

This is following [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html) for pandas 1.4.3

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# Series - pass a list to pd.Series
s1 = pd.Series([1, 2, 3, np.nan, 5])

In [3]:
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# create an index to use in the dataframe
# default frequency is D (day), so creates a DatetimeIndex of 10 days
dates = pd.date_range('20220619', periods=10)
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

just in case you are wondering, here's [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) for Datetime

In [5]:
# Dataframe - pass a Numpy array
# create a 10 row, 4 col random number array, index by dates, give some column names
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-21,2.587805,0.536143,0.30727,-1.24479
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-23,-0.863254,0.554632,1.45218,-0.478468
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607
2022-06-28,0.564749,0.323936,-0.839551,0.007879


In [6]:
# Create a dataframe by passing a dictionary of objects
# where each object can be converted into a series-like structure
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220619"),
        "C": pd.Series(1, index=list(range(6)), dtype="float32"),
        "D": np.array([3]*6, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train", "test", "train"]),
        "F": list("foofoo"),
        "G": "foo"
    }
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# the datatype of each of the columns would be different
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# top of the dataframe
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-21,2.587805,0.536143,0.30727,-1.24479
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-23,-0.863254,0.554632,1.45218,-0.478468


In [9]:
# bottom 3 records of the dataframe
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607
2022-06-28,0.564749,0.323936,-0.839551,0.007879


In [10]:
# index of the df
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# columns of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy gives a NumPy representation of the dataframe
# this is expensive if all columns are of different data type
df.to_numpy()

array([[-0.80537753, -0.75887588, -0.24470169, -0.3818888 ],
       [ 0.52473961, -1.71776914, -0.81112659,  1.41676827],
       [ 2.58780539,  0.53614261,  0.30726958, -1.24478999],
       [ 1.69973303,  0.67221026,  0.39741601, -0.39385081],
       [-0.86325438,  0.55463198,  1.45217997, -0.47846802],
       [-0.48008815, -0.47311416, -0.38518557, -1.58346434],
       [-0.57211371, -0.72576525, -0.20052069,  0.37710557],
       [ 0.83273194,  2.10088222,  0.33731367,  0.67339896],
       [-1.19737049,  0.41843375, -0.34430491, -1.0746068 ],
       [ 0.56474932,  0.32393601, -0.83955105,  0.00787862]])

In [13]:
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary stats
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.229156,0.093071,-0.033121,-0.268192
std,1.235304,1.050462,0.682623,0.920461
min,-1.19737,-1.717769,-0.839551,-1.583464
25%,-0.747062,-0.662602,-0.374965,-0.925572
50%,0.022326,0.371185,-0.222611,-0.38787
75%,0.765736,0.55001,0.329803,0.284799
max,2.587805,2.100882,1.45218,1.416768


1. **count** = Count number of non-NA/null observations
1. **max** = Maximum of the values in the object
1. **min** = Minimum of the values in the object
1. **mean** = Mean of the values
1. **std** = Standard deviation of the observations
1. **25%** = Default lower percentile
1. **50%** = 50 percentile - same as the median 
1. **75%** = Default upper percentile

In [15]:
# change the percentiles
df.describe(percentiles=[.1, .5, .9])

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.229156,0.093071,-0.033121,-0.268192
std,1.235304,1.050462,0.682623,0.920461
min,-1.19737,-1.717769,-0.839551,-1.583464
10%,-0.896666,-0.854765,-0.813969,-1.278657
50%,0.022326,0.371185,-0.222611,-0.38787
90%,1.78854,0.815077,0.502892,0.747736
max,2.587805,2.100882,1.45218,1.416768


In [16]:
# E, F, G in d2 are not numeric, so do not come up in describe()
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [17]:
# transpose the data
# turn rows to columns and vice versa
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,-0.805378,0.52474,2.587805,1.699733,-0.863254,-0.480088,-0.572114,0.832732,-1.19737,0.564749
B,-0.758876,-1.717769,0.536143,0.67221,0.554632,-0.473114,-0.725765,2.100882,0.418434,0.323936
C,-0.244702,-0.811127,0.30727,0.397416,1.45218,-0.385186,-0.200521,0.337314,-0.344305,-0.839551
D,-0.381889,1.416768,-1.24479,-0.393851,-0.478468,-1.583464,0.377106,0.673399,-1.074607,0.007879


### [Axis in pandas](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_axis.html?highlight=set_ax#pandas.DataFrame.set_axis)

```axis{0 or ‘index’, 1 or ‘columns’}```

Always remember that when you specify:
* ```axis=0``` or ```axis='index'``` you indicate that the operation should be along the _index_ (aka across the rows). More often than not indexes will go from top to bottom (vertically), but sometimes they may not (for e.g. if you use columns as index or when you have hierarchical or multi-index data).
* ```axis=1``` or ```axis='columns'``` indicates the operation is along the columns

When in doubt use the explicit version ```'index'``` or ```'columns'``` instead of ```0``` or ```1```

In [18]:
# sort along the axis - 1 = horizontal
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,-0.381889,-0.244702,-0.758876,-0.805378
2022-06-20,1.416768,-0.811127,-1.717769,0.52474
2022-06-21,-1.24479,0.30727,0.536143,2.587805
2022-06-22,-0.393851,0.397416,0.67221,1.699733
2022-06-23,-0.478468,1.45218,0.554632,-0.863254
2022-06-24,-1.583464,-0.385186,-0.473114,-0.480088
2022-06-25,0.377106,-0.200521,-0.725765,-0.572114
2022-06-26,0.673399,0.337314,2.100882,0.832732
2022-06-27,-1.074607,-0.344305,0.418434,-1.19737
2022-06-28,0.007879,-0.839551,0.323936,0.564749


In [19]:
# sort along the axis - 0 = vertical
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,0.564749,0.323936,-0.839551,0.007879
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464
2022-06-23,-0.863254,0.554632,1.45218,-0.478468
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-21,2.587805,0.536143,0.30727,-1.24479
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889


In [20]:
# sort ascending by values in a column
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607
2022-06-23,-0.863254,0.554632,1.45218,-0.478468
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-28,0.564749,0.323936,-0.839551,0.007879
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-21,2.587805,0.536143,0.30727,-1.24479


In [21]:
# sort by non-numerical values
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [22]:
# sort by two or more columns
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production prefer the following instead of other data access methods (typical python methods like ["col"] or [a:b] slices etc.):

```.at, .iat, .loc and .iloc.```

## Getting

In [23]:
# selecting a single column returns a Series object
df["A"]

2022-06-19   -0.805378
2022-06-20    0.524740
2022-06-21    2.587805
2022-06-22    1.699733
2022-06-23   -0.863254
2022-06-24   -0.480088
2022-06-25   -0.572114
2022-06-26    0.832732
2022-06-27   -1.197370
2022-06-28    0.564749
Freq: D, Name: A, dtype: float64

In [24]:
# selecting a slice
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-21,2.587805,0.536143,0.30727,-1.24479
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-23,-0.863254,0.554632,1.45218,-0.478468


## Selection by label

In [25]:
# selecting based on a label
df.loc[dates[0]]

A   -0.805378
B   -0.758876
C   -0.244702
D   -0.381889
Name: 2022-06-19 00:00:00, dtype: float64

In [26]:
# select on a multi-axis by lable
# I honestly do not know what the comment above means...help!
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,-0.805378,-0.758876
2022-06-20,0.52474,-1.717769
2022-06-21,2.587805,0.536143
2022-06-22,1.699733,0.67221
2022-06-23,-0.863254,0.554632
2022-06-24,-0.480088,-0.473114
2022-06-25,-0.572114,-0.725765
2022-06-26,0.832732,2.100882
2022-06-27,-1.19737,0.418434
2022-06-28,0.564749,0.323936


In [27]:
# specific index value results in reduction of dimensions
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    0.524740
B   -1.717769
C   -0.811127
D    1.416768
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [28]:
# get to a specific scalar:
#
# method one
df.loc[dates[0], "A"]

-0.8053775282311182

In [29]:
#
# method two (slightly faster than method one)
df.at[dates[0], "A"]

-0.8053775282311182

## Selection by position

In [30]:
df.iloc[2]

A    2.587805
B    0.536143
C    0.307270
D   -1.244790
Name: 2022-06-21 00:00:00, dtype: float64

In [31]:
# slices - similar to NumPy / Python - [row:slice, col:slice]
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,0.52474,-1.717769
2022-06-21,2.587805,0.536143
2022-06-22,1.699733,0.67221
2022-06-23,-0.863254,0.554632


In [32]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,-0.805378,-0.244702
2022-06-20,0.52474,-0.811127
2022-06-21,2.587805,0.30727
2022-06-25,-0.572114,-0.200521


In [33]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
# change the order of columns, repeact a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-0.244702,-0.758876,-0.805378,-0.244702
2022-06-20,-0.811127,-1.717769,0.52474,-0.811127
2022-06-21,0.30727,0.536143,2.587805,0.30727
2022-06-25,-0.200521,-0.725765,-0.572114,-0.200521


In [34]:
# slice rows explicitly, keep all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-21,2.587805,0.536143,0.30727,-1.24479


In [35]:
# slice columns, keep all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,-0.244702,-0.381889
2022-06-20,-0.811127,1.416768
2022-06-21,0.30727,-1.24479
2022-06-22,0.397416,-0.393851
2022-06-23,1.45218,-0.478468
2022-06-24,-0.385186,-1.583464
2022-06-25,-0.200521,0.377106
2022-06-26,0.337314,0.673399
2022-06-27,-0.344305,-1.074607
2022-06-28,-0.839551,0.007879


In [36]:
# everything, because you can
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-21,2.587805,0.536143,0.30727,-1.24479
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-23,-0.863254,0.554632,1.45218,-0.478468
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607
2022-06-28,0.564749,0.323936,-0.839551,0.007879


In [37]:
# get to a scalar (2 methods, just like before)
#
# method one: use iloc
df.iloc[1, 2]

-0.8111265943178737

In [38]:
#
# method two: use iat
df.iat[1, 2]

-0.8111265943178737

## Boolean Indexing

In [39]:
# use a value found in a single col to get data
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-06-20,0.52474,-1.717769,-0.811127,1.416768
2022-06-21,2.587805,0.536143,0.30727,-1.24479
2022-06-22,1.699733,0.67221,0.397416,-0.393851
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-28,0.564749,0.323936,-0.839551,0.007879


In [40]:
# boolean across the entire DF - vals that don't match go NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-19,,,,
2022-06-20,0.52474,,,1.416768
2022-06-21,2.587805,0.536143,0.30727,
2022-06-22,1.699733,0.67221,0.397416,
2022-06-23,,0.554632,1.45218,
2022-06-24,,,,
2022-06-25,,,,0.377106
2022-06-26,0.832732,2.100882,0.337314,0.673399
2022-06-27,,0.418434,,
2022-06-28,0.564749,0.323936,,0.007879


In [41]:
# add another column
df11 = df.copy()
df11["E"] = ["one", "two", "three", "four",
             "two", "five", "one", "two", "three", "four"]
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,one
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,two
2022-06-21,2.587805,0.536143,0.30727,-1.24479,three
2022-06-22,1.699733,0.67221,0.397416,-0.393851,four
2022-06-23,-0.863254,0.554632,1.45218,-0.478468,two
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464,five
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106,one
2022-06-26,0.832732,2.100882,0.337314,0.673399,two
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607,three
2022-06-28,0.564749,0.323936,-0.839551,0.007879,four


In [42]:
# the isin() query - basically the in clause
df11[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,two
2022-06-23,-0.863254,0.554632,1.45218,-0.478468,two
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464,five
2022-06-26,0.832732,2.100882,0.337314,0.673399,two


## Setting values

In [43]:
# matching indexes auto-aligns values
s1 = pd.Series(range(11, 21), index=pd.date_range('20220619', periods=10))
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [44]:
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,12
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14
2022-06-23,-0.863254,0.554632,1.45218,-0.478468,15
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464,16
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106,17
2022-06-26,0.832732,2.100882,0.337314,0.673399,18
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607,19
2022-06-28,0.564749,0.323936,-0.839551,0.007879,20


In [45]:
# setting values by label and position
# first let's make a quick copy
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,12
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14
2022-06-23,-0.863254,0.554632,1.45218,-0.478468,15
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464,16
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106,17
2022-06-26,0.832732,2.100882,0.337314,0.673399,18
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607,19
2022-06-28,0.564749,0.323936,-0.839551,0.007879,20


In [46]:
# set by label
df12.at[dates[0], "A"] = 0
# set by position
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.244702,-0.381889,11
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,12
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14
2022-06-23,-0.863254,0.554632,1.45218,-0.478468,15
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464,16
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106,17
2022-06-26,0.832732,2.100882,0.337314,0.673399,18
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607,19
2022-06-28,0.564749,0.323936,-0.839551,0.007879,20


In [47]:
# kinda bigger replacement
df12.loc[:, "D"] = np.array([5]*len(df))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.244702,5,11
2022-06-20,0.52474,-1.717769,-0.811127,5,12
2022-06-21,2.587805,0.536143,0.30727,5,13
2022-06-22,1.699733,0.67221,0.397416,5,14
2022-06-23,-0.863254,0.554632,1.45218,5,15
2022-06-24,-0.480088,-0.473114,-0.385186,5,16
2022-06-25,-0.572114,-0.725765,-0.200521,5,17
2022-06-26,0.832732,2.100882,0.337314,5,18
2022-06-27,-1.19737,0.418434,-0.344305,5,19
2022-06-28,0.564749,0.323936,-0.839551,5,20


In [48]:
# setting values using a boolean selection (aka where clause)
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.244702,-5,-11
2022-06-20,-0.52474,-1.717769,-0.811127,-5,-12
2022-06-21,-2.587805,-0.536143,-0.30727,-5,-13
2022-06-22,-1.699733,-0.67221,-0.397416,-5,-14
2022-06-23,-0.863254,-0.554632,-1.45218,-5,-15
2022-06-24,-0.480088,-0.473114,-0.385186,-5,-16
2022-06-25,-0.572114,-0.725765,-0.200521,-5,-17
2022-06-26,-0.832732,-2.100882,-0.337314,-5,-18
2022-06-27,-1.19737,-0.418434,-0.344305,-5,-19
2022-06-28,-0.564749,-0.323936,-0.839551,-5,-20


# Missing Data

### _reindex_
change/add/delete index on a specified axis, returns a new dataframe

In [49]:
df13 = df.reindex(index=dates[0:4], columns=list(df.columns)+["G"])
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11,
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,12,
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,


### handling missing data
1. _```np.nan```_
1. _```pandas.isna()```_
1. ```df.dropna()```
1. ```df.fillna()```

In [50]:
# missing data in pandas is np.nan
df13.iat[1, 0] = np.nan
df13.iloc[1:, 5] = np.random.randint(1)
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11,
2022-06-20,,-1.717769,-0.811127,1.416768,12,0.0
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


In [51]:
# get a boolean mask where values are NaN
df131 = pd.isna(df13)
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [52]:
# or just
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [53]:
# the original is still there
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11,
2022-06-20,,-1.717769,-0.811127,1.416768,12,0.0
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


In [54]:
# we are going to drop / replace values now, let's make a couple of copies of the dataframe
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11,
2022-06-20,,-1.717769,-0.811127,1.416768,12,0.0
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


In [55]:
# drop rows/columns that have missing data
# by default it returns a new dataframe, you may want to specify inplace=True for modifying current dataframe:
df_no_na = df132.dropna(how="any")
# how=‘any’ : If any NA values are present, drop that row or column.
# how=‘all’ : If all values are NA, drop that row or column.

In [56]:
# all rows/cols with missing data stripped
df_no_na

Unnamed: 0,A,B,C,D,F,G
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


In [57]:
# original still intact
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11,
2022-06-20,,-1.717769,-0.811127,1.416768,12,0.0
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


In [58]:
# drop missing data from original
df132.dropna(how="any", inplace=True)
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


In [59]:
# fill missing data
df133 = df13.copy()
df133.fillna(np.pi*1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11,3141.592654
2022-06-20,3141.592654,-1.717769,-0.811127,1.416768,12,0.0
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13,0.0
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14,0.0


# Align and Join

There needs to be a bigger notebook for this topic.
You need to know there's "joins" in Pandas just like in the SQL world, 
like join and left join and right join and inner and outer and all that...

## Create two datasets with _mismatched_ indexes

In [60]:
# date range indexes
idx1 = pd.date_range('2022-01-01',periods = 10)
# 2022-01-01', '2022-01-02' don't exist in idx2
# '2022-01-11', '2022-01-12' don't exist in idx1
idx2 = pd.date_range('2022-01-03',periods = 10)

In [61]:
# dataframes from indexes
d1 = pd.DataFrame(index = idx1, data = 
    {
        'A': np.random.rand(10),
        'B': np.random.randint(1, high=25, size = 10)
    }
)

d2 = pd.DataFrame(index = idx2, data = 
    {
        'A': np.random.rand(10),
        'B': np.random.randint(1, high=25, size = 10)
    }
)

## Quick Aside: Rendering two dataframes side-by-side

The trick of rendering 2 dataframes side-by-side was from [this](https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side) stackoverflow question.

In [62]:
# we gon need to display both data frames side by side, so...
from IPython.display import display_html 
def render_df_side_by_side(a, b, a_title = "", b_title = ""):
    a_styler = a.style.set_table_attributes("style='display:inline'").set_caption(a_title)
    b_styler = b.style.set_table_attributes("style='display:inline'").set_caption(b_title)
    display_html(a_styler._repr_html_()+b_styler._repr_html_(), raw=True)

In [63]:
render_df_side_by_side(d1,d2)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.835113,3
2022-01-02 00:00:00,0.172323,6
2022-01-03 00:00:00,0.409749,9
2022-01-04 00:00:00,0.485313,11
2022-01-05 00:00:00,0.511067,8
2022-01-06 00:00:00,0.937232,13
2022-01-07 00:00:00,0.811251,18
2022-01-08 00:00:00,0.996338,24
2022-01-09 00:00:00,0.019221,14
2022-01-10 00:00:00,0.252766,23

Unnamed: 0,A,B
2022-01-03 00:00:00,0.557622,7
2022-01-04 00:00:00,0.1488,19
2022-01-05 00:00:00,0.830873,17
2022-01-06 00:00:00,0.087515,6
2022-01-07 00:00:00,0.90028,1
2022-01-08 00:00:00,0.861023,14
2022-01-09 00:00:00,0.638989,3
2022-01-10 00:00:00,0.12277,6
2022-01-11 00:00:00,0.478184,17
2022-01-12 00:00:00,0.199753,24


Table rendering/styling options is [a bigger discussion](https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html#) to be tackled later.

## .add, .sub, .mul, .div etc.

flexible wrappers (```add, sub, mul, div, mod, pow```) to arithmetic operators: ```+, -, *, /, //, %, **```

In [64]:
# the +, -, *, /, //, %, ** operations align indexes to create a UNION of indexes
d3 = d1+d2
d4 = d1.sub(d2) #equivalent to d1-d2
d5 = d1*d2
d6 = d1.div(d2)

In [65]:
# indexs not present in the other dataframe get a NaN
d3.style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.96737,16.0
2022-01-04 00:00:00,0.634113,30.0
2022-01-05 00:00:00,1.34194,25.0
2022-01-06 00:00:00,1.024747,19.0
2022-01-07 00:00:00,1.711531,19.0
2022-01-08 00:00:00,1.857361,38.0
2022-01-09 00:00:00,0.658209,17.0
2022-01-10 00:00:00,0.375536,29.0


In [66]:
# sub
d4.style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,-0.147873,2.0
2022-01-04 00:00:00,0.336513,-8.0
2022-01-05 00:00:00,-0.319806,-9.0
2022-01-06 00:00:00,0.849717,7.0
2022-01-07 00:00:00,-0.089028,17.0
2022-01-08 00:00:00,0.135315,10.0
2022-01-09 00:00:00,-0.619768,11.0
2022-01-10 00:00:00,0.129997,17.0


In [67]:
# mul
d5.style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.228485,63.0
2022-01-04 00:00:00,0.072214,209.0
2022-01-05 00:00:00,0.424632,136.0
2022-01-06 00:00:00,0.082022,78.0
2022-01-07 00:00:00,0.730353,18.0
2022-01-08 00:00:00,0.85787,336.0
2022-01-09 00:00:00,0.012282,42.0
2022-01-10 00:00:00,0.031032,138.0


In [68]:
# div
d6.style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.734815,1.285714
2022-01-04 00:00:00,3.261521,0.578947
2022-01-05 00:00:00,0.615097,0.470588
2022-01-06 00:00:00,10.709431,2.166667
2022-01-07 00:00:00,0.90111,18.0
2022-01-08 00:00:00,1.157156,1.714286
2022-01-09 00:00:00,0.03008,4.666667
2022-01-10 00:00:00,2.058867,3.833333


### removing the NaN values from the resultant dataframe

In [69]:
# the + operation is UNION of indexes
d31 = d1+d2
# indexs not present in the other dataframe get a NaN
# do you can obvs remove nan values like before
d3.dropna().style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-03 00:00:00,0.96737,16.0
2022-01-04 00:00:00,0.634113,30.0
2022-01-05 00:00:00,1.34194,25.0
2022-01-06 00:00:00,1.024747,19.0
2022-01-07 00:00:00,1.711531,19.0
2022-01-08 00:00:00,1.857361,38.0
2022-01-09 00:00:00,0.658209,17.0
2022-01-10 00:00:00,0.375536,29.0


## df.align()

Pandas [align](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.align.html) works to join the dataframes in interesting ways.

In [70]:
# align returns two dataframes - left and right as a result of a join
# default join is outer - the union of indices will be used as index for left and right
left, right = d1.align(d2, join='outer')
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.835113,3.0
2022-01-02 00:00:00,0.172323,6.0
2022-01-03 00:00:00,0.409749,9.0
2022-01-04 00:00:00,0.485313,11.0
2022-01-05 00:00:00,0.511067,8.0
2022-01-06 00:00:00,0.937232,13.0
2022-01-07 00:00:00,0.811251,18.0
2022-01-08 00:00:00,0.996338,24.0
2022-01-09 00:00:00,0.019221,14.0
2022-01-10 00:00:00,0.252766,23.0

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.557622,7.0
2022-01-04 00:00:00,0.1488,19.0
2022-01-05 00:00:00,0.830873,17.0
2022-01-06 00:00:00,0.087515,6.0
2022-01-07 00:00:00,0.90028,1.0
2022-01-08 00:00:00,0.861023,14.0
2022-01-09 00:00:00,0.638989,3.0
2022-01-10 00:00:00,0.12277,6.0


In [71]:
# inner join
left, right = d1.align(d2, join='inner')
render_df_side_by_side(d1,d2)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.835113,3
2022-01-02 00:00:00,0.172323,6
2022-01-03 00:00:00,0.409749,9
2022-01-04 00:00:00,0.485313,11
2022-01-05 00:00:00,0.511067,8
2022-01-06 00:00:00,0.937232,13
2022-01-07 00:00:00,0.811251,18
2022-01-08 00:00:00,0.996338,24
2022-01-09 00:00:00,0.019221,14
2022-01-10 00:00:00,0.252766,23

Unnamed: 0,A,B
2022-01-03 00:00:00,0.557622,7
2022-01-04 00:00:00,0.1488,19
2022-01-05 00:00:00,0.830873,17
2022-01-06 00:00:00,0.087515,6
2022-01-07 00:00:00,0.90028,1
2022-01-08 00:00:00,0.861023,14
2022-01-09 00:00:00,0.638989,3
2022-01-10 00:00:00,0.12277,6
2022-01-11 00:00:00,0.478184,17
2022-01-12 00:00:00,0.199753,24


In [72]:
# we can fill the NaNs with a certain value if needed
# let's fill with -100 so we can see it clearly
left, right = d1.align(d2, join='outer', fill_value = -100) 
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.835113,3
2022-01-02 00:00:00,0.172323,6
2022-01-03 00:00:00,0.409749,9
2022-01-04 00:00:00,0.485313,11
2022-01-05 00:00:00,0.511067,8
2022-01-06 00:00:00,0.937232,13
2022-01-07 00:00:00,0.811251,18
2022-01-08 00:00:00,0.996338,24
2022-01-09 00:00:00,0.019221,14
2022-01-10 00:00:00,0.252766,23

Unnamed: 0,A,B
2022-01-01 00:00:00,-100.0,-100
2022-01-02 00:00:00,-100.0,-100
2022-01-03 00:00:00,0.557622,7
2022-01-04 00:00:00,0.1488,19
2022-01-05 00:00:00,0.830873,17
2022-01-06 00:00:00,0.087515,6
2022-01-07 00:00:00,0.90028,1
2022-01-08 00:00:00,0.861023,14
2022-01-09 00:00:00,0.638989,3
2022-01-10 00:00:00,0.12277,6


In [73]:
# left join: left index is added to right, but not the other way
left, right = d1.align(d2, join='left') 
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.835113,3
2022-01-02 00:00:00,0.172323,6
2022-01-03 00:00:00,0.409749,9
2022-01-04 00:00:00,0.485313,11
2022-01-05 00:00:00,0.511067,8
2022-01-06 00:00:00,0.937232,13
2022-01-07 00:00:00,0.811251,18
2022-01-08 00:00:00,0.996338,24
2022-01-09 00:00:00,0.019221,14
2022-01-10 00:00:00,0.252766,23

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,0.557622,7.0
2022-01-04 00:00:00,0.1488,19.0
2022-01-05 00:00:00,0.830873,17.0
2022-01-06 00:00:00,0.087515,6.0
2022-01-07 00:00:00,0.90028,1.0
2022-01-08 00:00:00,0.861023,14.0
2022-01-09 00:00:00,0.638989,3.0
2022-01-10 00:00:00,0.12277,6.0


In [74]:
# right join: right index is added to left, but not the other way
left, right = d1.align(d2, join='right') 
render_df_side_by_side(left, right)

Unnamed: 0,A,B
2022-01-03 00:00:00,0.409749,9.0
2022-01-04 00:00:00,0.485313,11.0
2022-01-05 00:00:00,0.511067,8.0
2022-01-06 00:00:00,0.937232,13.0
2022-01-07 00:00:00,0.811251,18.0
2022-01-08 00:00:00,0.996338,24.0
2022-01-09 00:00:00,0.019221,14.0
2022-01-10 00:00:00,0.252766,23.0
2022-01-11 00:00:00,,
2022-01-12 00:00:00,,

Unnamed: 0,A,B
2022-01-03 00:00:00,0.557622,7
2022-01-04 00:00:00,0.1488,19
2022-01-05 00:00:00,0.830873,17
2022-01-06 00:00:00,0.087515,6
2022-01-07 00:00:00,0.90028,1
2022-01-08 00:00:00,0.861023,14
2022-01-09 00:00:00,0.638989,3
2022-01-10 00:00:00,0.12277,6
2022-01-11 00:00:00,0.478184,17
2022-01-12 00:00:00,0.199753,24


There's more to explore in align, and it can get confusing, so beginners be careful, take time and try to work out the result before you execute to build intuition.

# Operations on data

## Stats

Operations in general *exclude* missing data

In [75]:
# arithmetic mean, for each column (axis = 0)
df.mean()

A     0.229156
B     0.093071
C    -0.033121
D    -0.268192
F    15.500000
dtype: float64

In [76]:
# mean across a row (axis = 1)
df.mean(1)

2022-06-19    1.761831
2022-06-20    2.282522
2022-06-21    3.037286
2022-06-22    3.275102
2022-06-23    3.133018
2022-06-24    2.615630
2022-06-25    3.175741
2022-06-26    4.388865
2022-06-27    3.360430
2022-06-28    4.011403
Freq: D, dtype: float64

## Apply

In [77]:
# apply custom lambdas
df.apply(lambda x: x.max()-x.min())

A    3.785176
B    3.818651
C    2.291731
D    3.000233
F    9.000000
dtype: float64

In [78]:
df

Unnamed: 0,A,B,C,D,F
2022-06-19,-0.805378,-0.758876,-0.244702,-0.381889,11
2022-06-20,0.52474,-1.717769,-0.811127,1.416768,12
2022-06-21,2.587805,0.536143,0.30727,-1.24479,13
2022-06-22,1.699733,0.67221,0.397416,-0.393851,14
2022-06-23,-0.863254,0.554632,1.45218,-0.478468,15
2022-06-24,-0.480088,-0.473114,-0.385186,-1.583464,16
2022-06-25,-0.572114,-0.725765,-0.200521,0.377106,17
2022-06-26,0.832732,2.100882,0.337314,0.673399,18
2022-06-27,-1.19737,0.418434,-0.344305,-1.074607,19
2022-06-28,0.564749,0.323936,-0.839551,0.007879,20


In [79]:
# each subsequent value is a sum of all values before it in the respective column
cumsum_df = df.apply(np.cumsum)
render_df_side_by_side(df, cumsum_df, "OG Dataframe", "Cumulative Sum")

Unnamed: 0,A,B,C,D,F
2022-06-19 00:00:00,-0.805378,-0.758876,-0.244702,-0.381889,11
2022-06-20 00:00:00,0.52474,-1.717769,-0.811127,1.416768,12
2022-06-21 00:00:00,2.587805,0.536143,0.30727,-1.24479,13
2022-06-22 00:00:00,1.699733,0.67221,0.397416,-0.393851,14
2022-06-23 00:00:00,-0.863254,0.554632,1.45218,-0.478468,15
2022-06-24 00:00:00,-0.480088,-0.473114,-0.385186,-1.583464,16
2022-06-25 00:00:00,-0.572114,-0.725765,-0.200521,0.377106,17
2022-06-26 00:00:00,0.832732,2.100882,0.337314,0.673399,18
2022-06-27 00:00:00,-1.19737,0.418434,-0.344305,-1.074607,19
2022-06-28 00:00:00,0.564749,0.323936,-0.839551,0.007879,20

Unnamed: 0,A,B,C,D,F
2022-06-19 00:00:00,-0.805378,-0.758876,-0.244702,-0.381889,11
2022-06-20 00:00:00,-0.280638,-2.476645,-1.055828,1.034879,23
2022-06-21 00:00:00,2.307167,-1.940502,-0.748559,-0.209911,36
2022-06-22 00:00:00,4.0069,-1.268292,-0.351143,-0.603761,50
2022-06-23 00:00:00,3.143646,-0.71366,1.101037,-1.082229,65
2022-06-24 00:00:00,2.663558,-1.186774,0.715852,-2.665694,81
2022-06-25 00:00:00,2.091444,-1.91254,0.515331,-2.288588,98
2022-06-26 00:00:00,2.924176,0.188343,0.852645,-1.615189,116
2022-06-27 00:00:00,1.726806,0.606776,0.50834,-2.689796,135
2022-06-28 00:00:00,2.291555,0.930712,-0.331211,-2.681917,155


## Histogramming

Frequencies, 'nuff said

```value_counts()```

In [80]:
series1 = pd.Series(np.random.randint(0,5, size = 25))
series1.value_counts()

3    9
1    5
2    4
4    4
0    3
dtype: int64

## String Methods

In [81]:
# build a series
some_string = "SERIes and Index are EqUIppeD WITh A seT Of stRINg PROCESSING METHoDs In tHe ```sTR``` attrIBute THAT MakE IT eASy To oPerATE ON EACh ELEmeNt OF The aRrAY. NOtE thAt PAtTERN-maTching in ```sTR``` gENeralLY UsES reGUlAr eXpreSsiONs bY DEfault (aND In Some cases AlwayS uSEs tHeM)."
str_series = pd.Series(some_string.split()) # split splits on whitespace by default

In [82]:
# manipulating the case - lower, upper etc.
# also the length of each string, split, replace, yada yada...
# very important when cleaning data (column names may have stupid whitespace, bad case, spacing etc.)
low = str_series.str.lower()
up = str_series.str.upper()

low_up_df = pd.DataFrame(
    {
        'og': str_series,
        'length': str_series.str.len(),
        'low': low,
        'up': up,
        'split_low_on_i': low.str.split('i', expand = False), # expand = True will break this usecase, try elsewhere
        'replace_a_with_star_in_low': low.str.replace('a','*', case=False, regex=True) # yeah, you can mess with regular expns
    })

low_up_df.tail(10)

Unnamed: 0,og,length,low,up,split_low_on_i,replace_a_with_star_in_low
36,eXpreSsiONs,11,expressions,EXPRESSIONS,"[express, ons]",expressions
37,bY,2,by,BY,[by],by
38,DEfault,7,default,DEFAULT,[default],def*ult
39,(aND,4,(and,(AND,[(and],(*nd
40,In,2,in,IN,"[, n]",in
41,Some,4,some,SOME,[some],some
42,cases,5,cases,CASES,[cases],c*ses
43,AlwayS,6,always,ALWAYS,[always],*lw*ys
44,uSEs,4,uses,USES,[uses],uses
45,tHeM).,6,them).,THEM).,[them).],them).


See [here](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-string-methods) for more string operations.

# Merge

Combine series and dataframes, use set logic for indexes and relational algebra for joins/merges

## Concat

Chill, we've done this before. The same join = inner/outer/left/right drill. 

In [83]:
# let's make some more dataframes
concatDF = pd.DataFrame(np.random.randn(10,4)) # 10 rows and 4 columns of random numbers
pieces = [concatDF[:3], concatDF[3:7], concatDF[7:]]
reconstructedDF = pd.concat(pieces)
display('broken DFs: ',pieces)
render_df_side_by_side(concatDF, reconstructedDF, "OG", "Concatenated")

'broken DFs: '

[          0         1         2         3
 0 -2.697185  1.216515  0.853398 -0.113237
 1 -1.435410 -0.158387 -2.430388 -0.648166
 2  0.759050  0.094282  1.244369  1.218700,
           0         1         2         3
 3 -1.014651  1.496874  0.030681 -0.322639
 4  1.009126 -0.279747  0.327220  1.546151
 5 -1.291796 -0.961866  0.531144 -1.032049
 6 -0.522130  0.207556 -0.635611 -1.026961,
           0         1         2         3
 7 -1.342438 -0.692129 -1.668482 -0.560060
 8 -1.792864  0.042686 -0.602903  0.209034
 9 -0.896600  0.497787 -2.203414 -1.063591]

Unnamed: 0,0,1,2,3
0,-2.697185,1.216515,0.853398,-0.113237
1,-1.43541,-0.158387,-2.430388,-0.648166
2,0.75905,0.094282,1.244369,1.2187
3,-1.014651,1.496874,0.030681,-0.322639
4,1.009126,-0.279747,0.32722,1.546151
5,-1.291796,-0.961866,0.531144,-1.032049
6,-0.52213,0.207556,-0.635611,-1.026961
7,-1.342438,-0.692129,-1.668482,-0.56006
8,-1.792864,0.042686,-0.602903,0.209034
9,-0.8966,0.497787,-2.203414,-1.063591

Unnamed: 0,0,1,2,3
0,-2.697185,1.216515,0.853398,-0.113237
1,-1.43541,-0.158387,-2.430388,-0.648166
2,0.75905,0.094282,1.244369,1.2187
3,-1.014651,1.496874,0.030681,-0.322639
4,1.009126,-0.279747,0.32722,1.546151
5,-1.291796,-0.961866,0.531144,-1.032049
6,-0.52213,0.207556,-0.635611,-1.026961
7,-1.342438,-0.692129,-1.668482,-0.56006
8,-1.792864,0.042686,-0.602903,0.209034
9,-0.8966,0.497787,-2.203414,-1.063591
