# Pandas Accessing

|Operation	|Syntax	|Result
|------------|-------|--------|
Select column	|df[col]	|Series
Select row by label	|df.loc[label]	|Series
Select row by integer location	|df.iloc[loc]	|Series
Slice rows	|df[5:10]	|DataFrame
Select rows by boolean vector	|df[bool_vec]	|DataFrame


In [1]:
import pandas as pd
import numpy as np

In [3]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [4]:
df2 = pd.DataFrame({'A': 1.0,
                        'B': pd.Timestamp('20130102'),
                        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                        'D': np.array([3] * 4, dtype='int32'),
                        'E': pd.Categorical(["test", "train", "test", "train"]),
                        'F': 'foo'})

In [5]:
df ['A']

2013-01-01   -0.097688
2013-01-02   -0.563900
2013-01-03   -0.540298
2013-01-04   -0.185317
2013-01-05    0.540873
2013-01-06   -0.101498
Freq: D, Name: A, dtype: float64

In [6]:
df.A

2013-01-01   -0.097688
2013-01-02   -0.563900
2013-01-03   -0.540298
2013-01-04   -0.185317
2013-01-05    0.540873
2013-01-06   -0.101498
Freq: D, Name: A, dtype: float64

In [7]:
# Selecting via [], which slices the rows.

In [8]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.097688,1.494417,0.993487,-0.303951
2013-01-02,-0.5639,1.533546,0.312261,0.596319
2013-01-03,-0.540298,0.627034,0.906777,0.782531


In [9]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.5639,1.533546,0.312261,0.596319
2013-01-03,-0.540298,0.627034,0.906777,0.782531
2013-01-04,-0.185317,0.661106,-1.056625,0.519773


In [10]:
df.loc["2013-01-01"]

A   -0.097688
B    1.494417
C    0.993487
D   -0.303951
Name: 2013-01-01 00:00:00, dtype: float64

In [13]:
# Select more than one column by their column names.
df.loc[:, ['A', 'B']]
#.  : represents that we want to take all the rows as well as the list, ['A', 'B'], which represents the columns.

Unnamed: 0,A,B
2013-01-01,-0.097688,1.494417
2013-01-02,-0.5639,1.533546
2013-01-03,-0.540298,0.627034
2013-01-04,-0.185317,0.661106
2013-01-05,0.540873,-0.695804
2013-01-06,-0.101498,-0.50291


### Selection by Position
We can also select the actual position in a data frame

In [16]:
df.iloc[3] # This command returns the 4th row

A   -0.185317
B    0.661106
C   -1.056625
D    0.519773
Name: 2013-01-04 00:00:00, dtype: float64

### Selection by dtype
- The select_dtypes() method implements subsetting of columns based on their dtype. By subsetting, we mean taking only the selection of columns based on their dtype.

In [17]:
 df3 = pd.DataFrame({'string': list('abc'),
                       'int64': list(range(1, 4)),
                       'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                       'bool1': [True, False, True],
                       'bool2': [False, True, False],
                       'dates': pd.date_range('now', periods=3),
                       'category': pd.Series(list("ABC")).astype('category')})

In [18]:
df3

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-04-16 10:04:34.278580,A
1,b,2,4,5.0,False,True,2023-04-17 10:04:34.278580,B
2,c,3,5,6.0,True,False,2023-04-18 10:04:34.278580,C


In [19]:
# only select bool columns
df.select_dtypes(include=[bool])

2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06


### Boolean Indexing

In [20]:
# Usinf column values to filter data

In [25]:
df3

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-04-16 10:04:34.278580,A
1,b,2,4,5.0,False,True,2023-04-17 10:04:34.278580,B
2,c,3,5,6.0,True,False,2023-04-18 10:04:34.278580,C


In [27]:
df3[df3['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2023-04-17 10:04:34.278580,B
2,c,3,5,6.0,True,False,2023-04-18 10:04:34.278580,C


In [30]:
# can also use function isin() for filtering.
df4 = df3.copy()

In [35]:
df4['E'] = ['one', 'two', 'three'] # Here we create a new column 'E' and add the values 'one' 'two' 'three'

In [32]:
#use function isin() to take only rows where E is two or four.

In [34]:
df4[df4['E'].isin(['one','two'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2023-04-16 10:04:34.278580,A,one
1,b,2,4,5.0,False,True,2023-04-17 10:04:34.278580,B,two
