# Setup

This notebook follows the [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) guide for pandas 1.4.2.

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# Series - pass a list of values to pd.Series; a default integer index is created
# np.nan is a float, so the integers are stored as float64 alongside it
s1 = pd.Series([1, 2, 3, np.nan, 5])

In [3]:
# show the Series: index labels on the left, values on the right, dtype at the bottom
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# Row index for the DataFrame built below: a DatetimeIndex of 10 consecutive
# calendar days starting on 2022-06-19.
# freq="D" (daily) is what date_range uses by default; it is spelled out here.
dates = pd.date_range(start="20220619", periods=10, freq="D")
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

Just in case you are wondering, here are [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) that can be used as the frequency of a date range.

In [5]:
# DataFrame - pass a NumPy array
# Draw a 10 row x 4 col array of standard-normal samples, index the rows by
# `dates`, and name the columns A, B, C, D.
# The generator is seeded so that "Restart Kernel -> Run All" rebuilds exactly
# the same table; unseeded, every run produced different numbers.
# NOTE: re-run the notebook once after this change so the stored outputs
# match the seeded values.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((10, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-19,-1.49484,1.566663,-0.751969,-0.694111
2022-06-20,-0.497755,-0.114442,-0.293294,-0.097524
2022-06-21,-0.452266,0.106795,-2.573409,0.402769
2022-06-22,-0.772274,-0.771758,1.120627,0.290086
2022-06-23,1.274137,0.72152,-1.433394,-0.9464
2022-06-24,-0.30578,0.69134,-1.248361,-1.339002
2022-06-25,0.929974,-0.860579,-0.313065,-0.861611
2022-06-26,-0.068679,0.578892,-0.966537,0.340315
2022-06-27,0.241065,1.493878,-0.601492,1.290213
2022-06-28,0.33693,0.517254,1.151801,-0.355385


In [6]:
# Build a DataFrame from a mapping of column name -> column content.
# Scalars (A, B, G) are repeated for every row; the Series, array,
# Categorical and list each supply 6 values, one per row.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20220619"),
        C=pd.Series(1, index=list(range(6)), dtype="float32"),
        D=np.full(6, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 3),
        F=list("foofoo"),
        G="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# each column of a DataFrame keeps its own dtype; the string columns show up as object
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# top of the dataframe - with no argument head() shows the first 5 rows
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,-1.49484,1.566663,-0.751969,-0.694111
2022-06-20,-0.497755,-0.114442,-0.293294,-0.097524
2022-06-21,-0.452266,0.106795,-2.573409,0.402769
2022-06-22,-0.772274,-0.771758,1.120627,0.290086
2022-06-23,1.274137,0.72152,-1.433394,-0.9464


In [9]:
# bottom of the dataframe - the last 3 rows, in their original order
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-26,-0.068679,0.578892,-0.966537,0.340315
2022-06-27,0.241065,1.493878,-0.601492,1.290213
2022-06-28,0.33693,0.517254,1.151801,-0.355385


In [10]:
# row labels (index) of df - the DatetimeIndex `dates` it was built with
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy() gives a NumPy representation of the dataframe (index and column labels are dropped)
# df holds only float columns, so the result is a plain float array;
# when the columns have different dtypes the conversion is more expensive,
# because every value has to be cast to one common dtype (see df2.to_numpy() in the next cell)
df.to_numpy()

array([[-1.49483956,  1.56666266, -0.75196901, -0.6941111 ],
       [-0.49775472, -0.11444177, -0.29329403, -0.09752365],
       [-0.45226587,  0.10679452, -2.57340924,  0.40276931],
       [-0.77227427, -0.77175773,  1.12062667,  0.29008612],
       [ 1.27413664,  0.72152036, -1.43339435, -0.9463999 ],
       [-0.30577987,  0.69134006, -1.24836076, -1.33900198],
       [ 0.9299737 , -0.8605787 , -0.31306493, -0.86161143],
       [-0.06867932,  0.57889206, -0.96653657,  0.34031529],
       [ 0.24106495,  1.49387768, -0.60149203,  1.29021322],
       [ 0.33692997,  0.51725378,  1.1518015 , -0.35538477]])

In [13]:
# df2 mixes floats, timestamps, ints, categories and strings,
# so the only common dtype NumPy can use for the array is object
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary stats per column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,-0.080949,0.392956,-0.590909,-0.197065
std,0.814852,0.823534,1.124062,0.794919
min,-1.49484,-0.860579,-2.573409,-1.339002
25%,-0.486383,-0.059133,-1.177905,-0.819736
50%,-0.18723,0.548073,-0.676731,-0.226454
75%,0.312964,0.713975,-0.298237,0.327758
max,1.274137,1.566663,1.151801,1.290213


In [15]:
# on the mixed-dtype df2 the summary in this output covers only the numeric columns A, C and D;
# B, E, F and G are left out
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [16]:
# transpose the data
# turn rows to columns and vice versa: the dates become the column labels, A-D the row labels
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,-1.49484,-0.497755,-0.452266,-0.772274,1.274137,-0.30578,0.929974,-0.068679,0.241065,0.33693
B,1.566663,-0.114442,0.106795,-0.771758,0.72152,0.69134,-0.860579,0.578892,1.493878,0.517254
C,-0.751969,-0.293294,-2.573409,1.120627,-1.433394,-1.248361,-0.313065,-0.966537,-0.601492,1.151801
D,-0.694111,-0.097524,0.402769,0.290086,-0.9464,-1.339002,-0.861611,0.340315,1.290213,-0.355385


In [17]:
# sort by the labels along axis 1 (the column names), not by the values:
# ascending=False reorders the columns to D, C, B, A and leaves each column's data untouched
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,-0.694111,-0.751969,1.566663,-1.49484
2022-06-20,-0.097524,-0.293294,-0.114442,-0.497755
2022-06-21,0.402769,-2.573409,0.106795,-0.452266
2022-06-22,0.290086,1.120627,-0.771758,-0.772274
2022-06-23,-0.9464,-1.433394,0.72152,1.274137
2022-06-24,-1.339002,-1.248361,0.69134,-0.30578
2022-06-25,-0.861611,-0.313065,-0.860579,0.929974
2022-06-26,0.340315,-0.966537,0.578892,-0.068679
2022-06-27,1.290213,-0.601492,1.493878,0.241065
2022-06-28,-0.355385,1.151801,0.517254,0.33693


In [18]:
# sort by the labels along axis 0 (the row index):
# ascending=False puts the latest date first
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,0.33693,0.517254,1.151801,-0.355385
2022-06-27,0.241065,1.493878,-0.601492,1.290213
2022-06-26,-0.068679,0.578892,-0.966537,0.340315
2022-06-25,0.929974,-0.860579,-0.313065,-0.861611
2022-06-24,-0.30578,0.69134,-1.248361,-1.339002
2022-06-23,1.274137,0.72152,-1.433394,-0.9464
2022-06-22,-0.772274,-0.771758,1.120627,0.290086
2022-06-21,-0.452266,0.106795,-2.573409,0.402769
2022-06-20,-0.497755,-0.114442,-0.293294,-0.097524
2022-06-19,-1.49484,1.566663,-0.751969,-0.694111


In [19]:
# sort the rows by the values in column A, smallest first (ascending is the default)
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2022-06-19,-1.49484,1.566663,-0.751969,-0.694111
2022-06-22,-0.772274,-0.771758,1.120627,0.290086
2022-06-20,-0.497755,-0.114442,-0.293294,-0.097524
2022-06-21,-0.452266,0.106795,-2.573409,0.402769
2022-06-24,-0.30578,0.69134,-1.248361,-1.339002
2022-06-26,-0.068679,0.578892,-0.966537,0.340315
2022-06-27,0.241065,1.493878,-0.601492,1.290213
2022-06-28,0.33693,0.517254,1.151801,-0.355385
2022-06-25,0.929974,-0.860579,-0.313065,-0.861611
2022-06-23,1.274137,0.72152,-1.433394,-0.9464


In [20]:
# sort by non-numerical values: strings are compared alphabetically,
# so ascending=False puts the "o" rows before the "f" rows
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [21]:
# sort by two or more columns: F is the primary key, E breaks ties between rows with the same F
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production code, prefer the following pandas accessors over plain Python-style access (such as `["col"]` or `[a:b]` slices):

`.at`, `.iat`, `.loc` and `.iloc`

## Getting

In [22]:
# selecting a single column with [] returns a Series object
# (named after the column, sharing the index of df)
df["A"]

2022-06-19   -1.494840
2022-06-20   -0.497755
2022-06-21   -0.452266
2022-06-22   -0.772274
2022-06-23    1.274137
2022-06-24   -0.305780
2022-06-25    0.929974
2022-06-26   -0.068679
2022-06-27    0.241065
2022-06-28    0.336930
Freq: D, Name: A, dtype: float64

In [23]:
# a slice inside [] selects rows by position:
# rows 1 to 4 here, the end of the slice (5) is excluded
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,-0.497755,-0.114442,-0.293294,-0.097524
2022-06-21,-0.452266,0.106795,-2.573409,0.402769
2022-06-22,-0.772274,-0.771758,1.120627,0.290086
2022-06-23,1.274137,0.72152,-1.433394,-0.9464


## Selection by label

In [24]:
# selecting a row by its index label; a single label returns that row as a Series
df.loc[dates[0]]

A   -1.494840
B    1.566663
C   -0.751969
D   -0.694111
Name: 2022-06-19 00:00:00, dtype: float64

In [28]:
# select on both axes at once by label: .loc[row_labels, column_labels]
# ":" keeps every row, ["A", "B"] keeps only those two columns
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,-1.49484,1.566663
2022-06-20,-0.497755,-0.114442
2022-06-21,-0.452266,0.106795
2022-06-22,-0.772274,-0.771758
2022-06-23,1.274137,0.72152
2022-06-24,-0.30578,0.69134
2022-06-25,0.929974,-0.860579
2022-06-26,-0.068679,0.578892
2022-06-27,0.241065,1.493878
2022-06-28,0.33693,0.517254


In [37]:
# a single index label (rather than a slice or a list) drops one dimension:
# the result is a 1-D Series holding that row, not a 2-D DataFrame like df
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ",res.shape," vs. df.shape = ", df.shape)

A   -0.497755
B   -0.114442
C   -0.293294
D   -0.097524
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [43]:
# get a single scalar value from a row label and a column label
# method one: .loc with both labels
df.loc[dates[0], "A"]

-1.4948395627125766

In [44]:
# method two: .at, the accessor for a single value
# (slightly faster than method one)
df.at[dates[0],"A"]

-1.4948395627125766

## Selection by position

In [45]:
# select by integer position: the row at position 2 (the third row), returned as a Series
df.iloc[2]

A   -0.452266
B    0.106795
C   -2.573409
D    0.402769
Name: 2022-06-21 00:00:00, dtype: float64

In [47]:
# positional slices - similar to NumPy / Python - [row_slice, col_slice]
# rows 1 to 4 and columns 0 to 1 (A, B); the end of each slice is excluded
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,-0.497755,-0.114442
2022-06-21,-0.452266,0.106795
2022-06-22,-0.772274,-0.771758
2022-06-23,1.274137,0.72152


In [51]:
# by lists of integer positions - [[row positions], [column positions]]
# only the listed rows and columns are returned
df.iloc[[0,1,2,6],[0,2]]

Unnamed: 0,A,C
2022-06-19,-1.49484,-0.751969
2022-06-20,-0.497755,-0.293294
2022-06-21,-0.452266,-2.573409
2022-06-25,0.929974,-0.313065


In [53]:
# position lists may be given in any order and may repeat a position:
# the columns come back as C, B, A and then C a second time
df.iloc[[0,1,2,6],[2,1,0,2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-0.751969,1.566663,-1.49484,-0.751969
2022-06-20,-0.293294,-0.114442,-0.497755,-0.293294
2022-06-21,-2.573409,0.106795,-0.452266,-2.573409
2022-06-25,-0.313065,-0.860579,0.929974,-0.313065
