# Setup

This notebook follows the [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) guide for pandas 1.4.2.

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# Series - built from a plain Python list; np.nan stands in for a missing value
s1 = pd.Series(data=[1, 2, 3, np.nan, 5])

In [3]:
# show the Series: a default 0..4 integer index, and the integers appear as
# floats (dtype float64) alongside the NaN entry
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# Row labels for the DataFrame built below: 10 consecutive calendar days.
# freq='D' (daily) is spelled out here; it is also what date_range falls back
# to when only a start and a number of periods are given.
dates = pd.date_range(start='20220619', periods=10, freq='D')
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In case you are wondering, here are [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) that can be passed as the `freq` argument of `pd.date_range`.

In [5]:
# DataFrame - built from a 2-D NumPy array
# 10 rows x 4 columns of standard-normal draws, rows labelled by `dates`,
# columns named A-D
# (no seed is set in the cells shown here, so the values change on every run)
df = pd.DataFrame(
    data=np.random.standard_normal(size=(10, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2022-06-19,-1.003089,0.612023,0.134382,0.533077
2022-06-20,0.772191,0.363386,-0.889165,1.30501
2022-06-21,1.116749,0.842817,0.887066,0.048783
2022-06-22,0.503601,0.006906,1.553999,-0.546315
2022-06-23,1.787706,-1.674877,-0.619534,0.111838
2022-06-24,-0.779693,-0.654378,1.411015,-0.144691
2022-06-25,-1.346452,-1.117059,0.567356,-0.296802
2022-06-26,2.367685,-0.42723,-0.21991,1.454369
2022-06-27,-1.015488,0.91091,0.305691,0.422114
2022-06-28,0.47676,-1.676741,0.309359,-1.129816


In [6]:
# DataFrame from a dict: every key becomes a column name and every value is
# something pandas can turn into a column (scalars are repeated for each row,
# array-likes supply one value per row).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20220619"),
        C=pd.Series(1, index=list(range(6)), dtype="float32"),
        D=np.full(6, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 3),
        F=list("foofoo"),
        G="foo",
    )
)

df2



Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [8]:
# .dtypes lists the dtype of every column; in df2 each column keeps the dtype
# of the object it was built from, so they differ from column to column
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [11]:
# first rows of the frame - 5 is the count head() uses when none is given
df.head(n=5)

Unnamed: 0,A,B,C,D
2022-06-19,-1.003089,0.612023,0.134382,0.533077
2022-06-20,0.772191,0.363386,-0.889165,1.30501
2022-06-21,1.116749,0.842817,0.887066,0.048783
2022-06-22,0.503601,0.006906,1.553999,-0.546315
2022-06-23,1.787706,-1.674877,-0.619534,0.111838


In [12]:
# last 3 rows of the frame
df.tail(n=3)

Unnamed: 0,A,B,C,D
2022-06-26,2.367685,-0.42723,-0.21991,1.454369
2022-06-27,-1.015488,0.91091,0.305691,0.422114
2022-06-28,0.47676,-1.676741,0.309359,-1.129816


In [13]:
# row labels of df - the DatetimeIndex that was passed as `index` when df was built
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# column labels of df, returned as an Index object
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
# .to_numpy() returns the values of the DataFrame as one NumPy array
# (row and column labels are not part of the result).
# A NumPy array has a single dtype, so a frame whose columns have different
# dtypes has to be converted to a common one first, which can be expensive.
# df is all float64, so no such conversion is needed here.
df.to_numpy()

array([[-1.00308895,  0.61202335,  0.13438248,  0.53307705],
       [ 0.77219087,  0.36338641, -0.88916529,  1.30501015],
       [ 1.11674867,  0.84281653,  0.88706622,  0.04878309],
       [ 0.50360131,  0.00690604,  1.55399934, -0.54631541],
       [ 1.787706  , -1.67487744, -0.61953421,  0.11183847],
       [-0.77969347, -0.65437758,  1.41101473, -0.14469117],
       [-1.346452  , -1.11705895,  0.56735615, -0.29680221],
       [ 2.3676853 , -0.42722975, -0.21990968,  1.4543687 ],
       [-1.01548792,  0.91091039,  0.305691  ,  0.42211426],
       [ 0.47676009, -1.67674129,  0.30935876, -1.1298161 ]])

In [16]:
# df2 mixes floats, timestamps, ints, categories and strings, so the array
# comes back with dtype=object
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [17]:
# quick summary statistics per column: count, mean, std, min, quartiles, max
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.287997,-0.281424,0.344026,0.175757
std,1.280407,0.984886,0.800078,0.793764
min,-1.346452,-1.676741,-0.889165,-1.129816
25%,-0.94724,-1.001389,-0.131337,-0.258774
50%,0.490181,-0.210162,0.307525,0.080311
75%,1.030609,0.549864,0.807139,0.505336
max,2.367685,0.91091,1.553999,1.454369


In [18]:
# on the mixed-dtype frame the summary only covers the numeric columns (A, C, D)
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [20]:
# transpose: the dates become the column labels and A-D become the row labels
df.transpose()

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,-1.003089,0.772191,1.116749,0.503601,1.787706,-0.779693,-1.346452,2.367685,-1.015488,0.47676
B,0.612023,0.363386,0.842817,0.006906,-1.674877,-0.654378,-1.117059,-0.42723,0.91091,-1.676741
C,0.134382,-0.889165,0.887066,1.553999,-0.619534,1.411015,0.567356,-0.21991,0.305691,0.309359
D,0.533077,1.30501,0.048783,-0.546315,0.111838,-0.144691,-0.296802,1.454369,0.422114,-1.129816


In [26]:
# sort by the column labels (axis 1), in descending order: D, C, B, A
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,0.533077,0.134382,0.612023,-1.003089
2022-06-20,1.30501,-0.889165,0.363386,0.772191
2022-06-21,0.048783,0.887066,0.842817,1.116749
2022-06-22,-0.546315,1.553999,0.006906,0.503601
2022-06-23,0.111838,-0.619534,-1.674877,1.787706
2022-06-24,-0.144691,1.411015,-0.654378,-0.779693
2022-06-25,-0.296802,0.567356,-1.117059,-1.346452
2022-06-26,1.454369,-0.21991,-0.42723,2.367685
2022-06-27,0.422114,0.305691,0.91091,-1.015488
2022-06-28,-1.129816,0.309359,-1.676741,0.47676


In [27]:
# sort by the row labels (axis 0), in descending order: latest date first
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,0.47676,-1.676741,0.309359,-1.129816
2022-06-27,-1.015488,0.91091,0.305691,0.422114
2022-06-26,2.367685,-0.42723,-0.21991,1.454369
2022-06-25,-1.346452,-1.117059,0.567356,-0.296802
2022-06-24,-0.779693,-0.654378,1.411015,-0.144691
2022-06-23,1.787706,-1.674877,-0.619534,0.111838
2022-06-22,0.503601,0.006906,1.553999,-0.546315
2022-06-21,1.116749,0.842817,0.887066,0.048783
2022-06-20,0.772191,0.363386,-0.889165,1.30501
2022-06-19,-1.003089,0.612023,0.134382,0.533077


In [28]:
# order the rows by the values in column A, smallest first
df.sort_values(by="A", ascending=True)

Unnamed: 0,A,B,C,D
2022-06-25,-1.346452,-1.117059,0.567356,-0.296802
2022-06-27,-1.015488,0.91091,0.305691,0.422114
2022-06-19,-1.003089,0.612023,0.134382,0.533077
2022-06-24,-0.779693,-0.654378,1.411015,-0.144691
2022-06-28,0.47676,-1.676741,0.309359,-1.129816
2022-06-22,0.503601,0.006906,1.553999,-0.546315
2022-06-20,0.772191,0.363386,-0.889165,1.30501
2022-06-21,1.116749,0.842817,0.887066,0.048783
2022-06-23,1.787706,-1.674877,-0.619534,0.111838
2022-06-26,2.367685,-0.42723,-0.21991,1.454369


In [29]:
# order the rows by the string column F in descending order: 'o' rows before 'f' rows
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
