## Introduction

This notebook is a walk-along companion to the pandas documentation.

It is used to test/practice all the examples presented in the documentation. 

(c) Soyinka Sowoolu 2023

### 10 mins to Pandas

In [37]:
import numpy as np
import pandas as pd

**Basic data structures in Pandas**

In [2]:
# Series
# No index is passed, so pandas creates a default `RangeIndex`;
# the NaN entry makes the resulting dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# DataFrame, step 1: build the row labels.
# `date_range()` returns a DatetimeIndex of six consecutive days starting on
# 2023-01-01; it is used as the index of the DataFrame created from a NumPy array.
dates = pd.date_range(start='20230101', periods=6, freq='D')
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Build a 6x4 frame of standard-normal samples, indexed by `dates`, with columns A-D.
# A locally seeded Generator makes the values identical on every "Restart & Run All";
# an unseeded `np.random.randn` call would give a different table on each run.
# NOTE: outputs saved from earlier unseeded runs will differ until the notebook is re-executed.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.068748,-0.779788,0.914161,0.201815
2023-01-02,-0.227731,-0.859499,0.906474,-0.526006
2023-01-03,1.423977,0.299134,0.059327,1.281677
2023-01-04,-1.116002,0.118301,1.61781,-0.226803
2023-01-05,2.455807,-0.381374,-0.746268,0.7929
2023-01-06,0.841858,2.168082,-0.71777,0.415155


In [6]:
# Build a DataFrame from a dict: each key becomes a column label and each value
# supplies that column's data. Scalar values ('A', 'B', 'F') are repeated on every row.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20230102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-01-02,1.0,3,test,foo
1,1.0,2023-01-02,1.0,3,train,foo
2,1.0,2023-01-02,1.0,3,test,foo
3,1.0,2023-01-02,1.0,3,train,foo


In [17]:
# Show the dtype of each column; every column of df2 has a different one
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [20]:
# View the first rows of the data (five is the default)
df2.head(n=5)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-01-02,1.0,3,test,foo
1,1.0,2023-01-02,1.0,3,train,foo
2,1.0,2023-01-02,1.0,3,test,foo
3,1.0,2023-01-02,1.0,3,train,foo


In [22]:
# View the last two rows of the data
df2.tail(n=2)

Unnamed: 0,A,B,C,D,E,F
2,1.0,2023-01-02,1.0,3,test,foo
3,1.0,2023-01-02,1.0,3,train,foo


In [23]:
# Display the row index (the DatetimeIndex passed as `index=dates`)
df.index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [24]:
# Display the column labels
df.columns


Index(['A', 'B', 'C', 'D'], dtype='object')

In [26]:
# Return the underlying NumPy data.
# pandas will find the single NumPy dtype that can hold all the dtypes in the
# DataFrame; for the mixed-type df2 that is `object`.
df2.to_numpy()

array([[1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [28]:
# Quick statistical summary of each column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.551526,0.094142,0.338956,0.323123
std,1.284086,1.11742,0.965503,0.661232
min,-1.116002,-0.859499,-0.746268,-0.526006
25%,-0.187986,-0.680185,-0.523496,-0.119649
50%,0.386555,-0.131537,0.482901,0.308485
75%,1.278447,0.253926,0.912239,0.698464
max,2.455807,2.168082,1.61781,1.281677


In [29]:
# Transpose the data: the column labels become the index and the dates become columns
df.transpose()

Unnamed: 0,2023-01-01,2023-01-02,2023-01-03,2023-01-04,2023-01-05,2023-01-06
A,-0.068748,-0.227731,1.423977,-1.116002,2.455807,0.841858
B,-0.779788,-0.859499,0.299134,0.118301,-0.381374,2.168082
C,0.914161,0.906474,0.059327,1.61781,-0.746268,-0.71777
D,0.201815,-0.526006,1.281677,-0.226803,0.7929,0.415155


In [31]:
# Sort along the column axis: order the columns by label, descending (D, C, B, A)
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2023-01-01,0.201815,0.914161,-0.779788,-0.068748
2023-01-02,-0.526006,0.906474,-0.859499,-0.227731
2023-01-03,1.281677,0.059327,0.299134,1.423977
2023-01-04,-0.226803,1.61781,0.118301,-1.116002
2023-01-05,0.7929,-0.746268,-0.381374,2.455807
2023-01-06,0.415155,-0.71777,2.168082,0.841858


In [32]:
# Sort along the row axis: order the rows by index label, latest date first
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2023-01-06,0.841858,2.168082,-0.71777,0.415155
2023-01-05,2.455807,-0.381374,-0.746268,0.7929
2023-01-04,-1.116002,0.118301,1.61781,-0.226803
2023-01-03,1.423977,0.299134,0.059327,1.281677
2023-01-02,-0.227731,-0.859499,0.906474,-0.526006
2023-01-01,-0.068748,-0.779788,0.914161,0.201815


In [36]:
# Sort the rows by the values in column B, largest first
df.sort_values("B", ascending=False)

Unnamed: 0,A,B,C,D
2023-01-06,0.841858,2.168082,-0.71777,0.415155
2023-01-03,1.423977,0.299134,0.059327,1.281677
2023-01-04,-1.116002,0.118301,1.61781,-0.226803
2023-01-05,2.455807,-0.381374,-0.746268,0.7929
2023-01-01,-0.068748,-0.779788,0.914161,0.201815
2023-01-02,-0.227731,-0.859499,0.906474,-0.526006


**Selection**

In [None]:
The optimised Pandas data access methods are `Dataframe.at()`, `DataFrame.iat()`, D