In [1]:
import pandas as pd
import numpy as np

In [2]:
# Six consecutive daily timestamps beginning on 13 January 2022.
my_dates_index = pd.date_range(start='2022-01-13', periods=6, freq='D')
my_dates_index

DatetimeIndex(['2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
               '2022-01-17', '2022-01-18'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# The integers 0..23 laid out as 6 rows x 4 columns. np.arange already returns
# an ndarray, so no extra np.array() wrapper (and the copy it makes) is needed.
sample_numpy_data = np.arange(24).reshape(6, 4)
sample_numpy_data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [5]:
# Wrap the array in a DataFrame: the dates label the rows, the letters A-D label the columns.
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=my_dates_index,
    columns=['A', 'B', 'C', 'D'],
)
sample_df

Unnamed: 0,A,B,C,D
2022-01-13,0,1,2,3
2022-01-14,4,5,6,7
2022-01-15,8,9,10,11
2022-01-16,12,13,14,15
2022-01-17,16,17,18,19
2022-01-18,20,21,22,23


##### selection using column name

In [6]:
# Bracket indexing with a column label returns that single column (a Series named C).
sample_df['C']

2022-01-13     2
2022-01-14     6
2022-01-15    10
2022-01-16    14
2022-01-17    18
2022-01-18    22
Freq: D, Name: C, dtype: int32

##### selection using slice
- remember: up to, but not including second index

In [7]:
# Positional row slice: rows at positions 1, 2 and 3 (the stop position 4 is excluded).
sample_df[1:4]

Unnamed: 0,A,B,C,D
2022-01-14,4,5,6,7
2022-01-15,8,9,10,11
2022-01-16,12,13,14,15


##### selection using date time index
- note: last index is included

In [8]:
# Slice rows by date strings against the DatetimeIndex; both endpoints are included.
sample_df['2022-01-13':'2022-01-15']

Unnamed: 0,A,B,C,D
2022-01-13,0,1,2,3
2022-01-14,4,5,6,7
2022-01-15,8,9,10,11


### Selection by label

`.loc` is the label-location based indexer, used for selection by label.

In [9]:
# Start 1, stop 5, step 2: the index entries at positions 1 and 3.
my_dates_index[1:5:2]

DatetimeIndex(['2022-01-14', '2022-01-16'], dtype='datetime64[ns]', freq='2D')

In [10]:
# Pass those index labels to .loc to pick the matching rows.
sample_df.loc[my_dates_index[1:5:2]]

Unnamed: 0,A,B,C,D
2022-01-14,4,5,6,7
2022-01-16,12,13,14,15


##### Selecting using multi-axis by label


In [11]:
# All rows (bare ":"), restricted to columns A and B.
sample_df.loc[:,['A','B']]

Unnamed: 0,A,B
2022-01-13,0,1
2022-01-14,4,5
2022-01-15,8,9
2022-01-16,12,13
2022-01-17,16,17
2022-01-18,20,21


In [12]:
# Rows for the two selected dates, columns A and B only.
sample_df.loc[my_dates_index[1:5:2],['A','B']]

Unnamed: 0,A,B
2022-01-14,4,5
2022-01-16,12,13


##### Reduce number of dimensions for returned object
- notice order of 'D' and 'B'

In [16]:
# A single row label plus a list of columns gives a 1-D result,
# in the requested column order (D first, then B).
sample_df.loc['2022-01-15',['D','B']]

D    11
B     9
Name: 2022-01-15 00:00:00, dtype: int32

##### using result

In [14]:
# Take the first element of the selection (the 'D' value) by position and scale it.
# .iloc[0] is explicit about positional access; a bare [0] on this label-indexed
# Series relies on a deprecated integer-as-position fallback.
sample_df.loc['2022-01-15', ['D', 'B']].iloc[0] * 4

44

### Selection by Position

`.iloc` is the integer-location based indexer, used for selection by position.

In [17]:
# Row at position 3 of the underlying NumPy array, for comparison with the DataFrame's .iloc[3].
sample_numpy_data[3]

array([12, 13, 14, 15])

In [18]:
# Row at integer position 3, returned as a Series labelled by column name.
sample_df.iloc[3]

A    12
B    13
C    14
D    15
Name: 2022-01-16 00:00:00, dtype: int32

##### integer slices


In [19]:
# Rows at positions 1-2 and columns at positions 2-3 (stop positions are excluded).
sample_df.iloc[1:3, 2:4]

Unnamed: 0,C,D
2022-01-14,6,7
2022-01-15,10,11


##### lists of integers

In [20]:
# Explicit position lists: rows 0, 1, 3 and columns 0, 2.
sample_df.iloc[[0,1,3], [0,2]]

Unnamed: 0,A,C
2022-01-13,0,2
2022-01-14,4,6
2022-01-16,12,14


##### slicing rows explicitly
the bare `:` selects all columns

In [21]:
# Rows at positions 1-2; the bare ":" keeps every column.
sample_df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2022-01-14,4,5,6,7
2022-01-15,8,9,10,11


##### slicing columns explicitly
the bare `:` selects all rows

In [22]:
# The bare ":" keeps every row; columns at positions 1-2.
sample_df.iloc[:, 1:3]

Unnamed: 0,B,C
2022-01-13,1,2
2022-01-14,5,6
2022-01-15,9,10
2022-01-16,13,14
2022-01-17,17,18
2022-01-18,21,22


### Boolean Indexing
##### test based upon one column's data

In [23]:
# Boolean Series: True for each row whose C value is at least 14.
sample_df['C'] >= 14

2022-01-13    False
2022-01-14    False
2022-01-15    False
2022-01-16     True
2022-01-17     True
2022-01-18     True
Freq: D, Name: C, dtype: bool

In [24]:
# Keep only the rows where the C-column mask is True.
sample_df[sample_df['C'] >= 14]

Unnamed: 0,A,B,C,D
2022-01-16,12,13,14,15
2022-01-17,16,17,18,19
2022-01-18,20,21,22,23


##### test based upon entire data set

In [25]:
# Element-wise comparison across the whole frame yields a boolean DataFrame of the same shape.
sample_df >= 11

Unnamed: 0,A,B,C,D
2022-01-13,False,False,False,False
2022-01-14,False,False,False,False
2022-01-15,False,False,False,True
2022-01-16,True,True,True,True
2022-01-17,True,True,True,True
2022-01-18,True,True,True,True


In [26]:
# Masking with a boolean frame keeps the shape; cells that fail the test
# come back as missing values.
sample_df[sample_df >= 11]

Unnamed: 0,A,B,C,D
2022-01-13,,,,
2022-01-14,,,,
2022-01-15,,,,11.0
2022-01-16,12.0,13.0,14.0,15.0
2022-01-17,16.0,17.0,18.0,19.0
2022-01-18,20.0,21.0,22.0,23.0


##### isin() method

In [27]:
# True where column C holds one of the listed values.
sample_df['C'].isin([14,13,12,11,10])

2022-01-13    False
2022-01-14    False
2022-01-15     True
2022-01-16     True
2022-01-17    False
2022-01-18    False
Freq: D, Name: C, dtype: bool

In [28]:
# Use the isin() mask to keep only the matching rows.
sample_df[sample_df['C'].isin([14,13,12,11,10])]

Unnamed: 0,A,B,C,D
2022-01-15,8,9,10,11
2022-01-16,12,13,14,15


### Sorting

##### sort by axis

In [29]:
# axis=1 sorts the column labels; ascending=False puts them in reverse order (D, C, B, A).
sample_df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-01-13,3,2,1,0
2022-01-14,7,6,5,4
2022-01-15,11,10,9,8
2022-01-16,15,14,13,12
2022-01-17,19,18,17,16
2022-01-18,23,22,21,20


##### sort by data within a column

In [30]:
# Order the rows by the values in column B, largest first.
sample_df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2022-01-18,20,21,22,23
2022-01-17,16,17,18,19
2022-01-16,12,13,14,15
2022-01-15,8,9,10,11
2022-01-14,4,5,6,7
2022-01-13,0,1,2,3


### Select, Add, Delete, Columns

##### build a DataFrame from a dictionary (string keys become the column labels)

In [31]:
# Small three-column frame used for the column add/delete examples.
cookbook_df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)
cookbook_df

Unnamed: 0,AAA,BBB,CCC
0,4,10,100
1,5,20,50
2,6,30,-30
3,7,40,-50


##### arithmetic vectorized operation using string indices

In [32]:
# Element-wise product of columns BBB and CCC.
cookbook_df['BBB'] * cookbook_df['CCC']

0    1000
1    1000
2    -900
3   -2000
dtype: int64

##### column deletion 

In [33]:
# Remove column BBB from the frame in place, then display what is left.
del cookbook_df['BBB']
cookbook_df

Unnamed: 0,AAA,CCC
0,4,100
1,5,50
2,6,-30
3,7,-50


In [34]:
# pop() removes column CCC from the frame and returns it.
last_column = cookbook_df.pop('CCC')
last_column

0    100
1     50
2    -30
3    -50
Name: CCC, dtype: int64

In [35]:
# Only AAA remains after the del and pop above.
cookbook_df

Unnamed: 0,AAA
0,4
1,5
2,6
3,7


##### add a new column using a Python list

In [36]:
# Assign a Python list as a new column; mixing ints with a string gives the column dtype object.
cookbook_df['DDD'] = [32, 21, 43, 'hike']
cookbook_df

Unnamed: 0,AAA,DDD
0,4,32
1,5,21
2,6,43
3,7,hike


In [37]:
# Per-column dtypes of the frame.
cookbook_df.dtypes

AAA     int64
DDD    object
dtype: object

##### insert function

In [38]:
# Insert a column named "new column" at position 1, i.e. between AAA and DDD.
cookbook_df.insert(1, "new column", [3,4,5,6])
cookbook_df

Unnamed: 0,AAA,new column,DDD
0,4,3,32
1,5,4,21
2,6,5,43
3,7,6,hike


In [39]:
# Drop column DDD by rebinding to the returned frame rather than mutating with
# inplace=True; columns= names the axis directly.
cookbook_df = cookbook_df.drop(columns='DDD')
cookbook_df

Unnamed: 0,AAA,new column
0,4,3
1,5,4
2,6,5
3,7,6
