In [2]:
#### Introduction to Data Wrangling with Pandas ####
## Page 3 ##

In [3]:
#### Subsetting a dataframe ####

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
# Base sequence 1..5; every other row is this sequence scaled by a constant.
num = range(1, 6)
mul2 = [2 * x for x in num]
mul3 = [3 * x for x in num]
mul4 = [4 * x for x in num]
# NOTE(review): despite the name, mul5 scales by 35, not 5 -- confirm this is intended.
mul5 = [35 * x for x in num]
# One inner sequence per row of the sample table.
data = [num, mul2, mul3, mul4, mul5]

In [3]:
# Each inner sequence of `data` becomes one row; rows are labelled v..z and columns A..E.
df1 = pd.DataFrame(
    data,
    index=list('vwxyz'),
    columns=list('ABCDE'),
)

In [4]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [8]:
# 0. Column names
# 1. By labels (loc)
# 2. By integer position (iloc)
# 3. Combination of both

In [5]:
#### Get Columns ####

In [6]:
df1['A']

v     1
w     2
x     3
y     4
z    35
Name: A, dtype: int64

In [7]:
df1.A

v     1
w     2
x     3
y     4
z    35
Name: A, dtype: int64

In [8]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [10]:
df1[['A', 'E', 'C']]

Unnamed: 0,A,E,C
v,1,5,3
w,2,10,6
x,3,15,9
y,4,20,12
z,35,175,105


In [11]:
#### Selecting data using Labels ####

In [12]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [13]:
df1.loc['z']

A     35
B     70
C    105
D    140
E    175
Name: z, dtype: int64

In [16]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [14]:
df1.loc[['w','z']] #only rows

Unnamed: 0,A,B,C,D,E
w,2,4,6,8,10
z,35,70,105,140,175


In [16]:
df1.loc[['w','z'],['A','B']] # rows and columns

Unnamed: 0,A,B
w,2,4
z,35,70


In [17]:
df1.loc['w','A'] # like value = array[row_index][column_index]

2

In [18]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [18]:
df1.loc['x':, :] # rows from label 'x' onwards, all columns

Unnamed: 0,A,B,C,D,E
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [19]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [19]:
df1.loc[:, 'C':'E'] # all rows; columns 'C' through 'E' (label slices include both endpoints)

Unnamed: 0,C,D,E
v,3,4,5
w,6,8,10
x,9,12,15
y,12,16,20
z,105,140,175


In [20]:
df1.loc[:,:] # equal to printing the df

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [25]:
df1.loc[:, 'B':'C']%2==0 # or any other operation that yields a boolean result

Unnamed: 0,B,C
v,True,False
w,True,True
x,True,False
y,True,True
z,True,False


In [21]:
#### Using index values (integer positions) ####

In [22]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [21]:
df1.iloc[2] # the row at position 2, i.e. the third row (positions start from zero)

A     3
B     6
C     9
D    12
E    15
Name: x, dtype: int64

In [22]:
df1.iloc[2:] #row onwards

Unnamed: 0,A,B,C,D,E
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [23]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [24]:
df1.iloc[1:3, 1:3] #subset rows and columns, iloc subsets from start to stop-1

Unnamed: 0,B,C
w,4,6
x,6,9


In [25]:
df1.iloc[:, 1:3] # all rows; columns at positions 1 and 2 (stop position 3 is excluded)

Unnamed: 0,B,C
v,2,3
w,4,6
x,6,9
y,8,12
z,70,105


In [25]:
df1

Unnamed: 0,A,B,C,D,E
v,1,2,3,4,5
w,2,4,6,8,10
x,3,6,9,12,15
y,4,8,12,16,20
z,35,70,105,140,175


In [26]:
df1.iloc[[1, 3], [2, 4]] # list the row and column number to display

Unnamed: 0,C,E
w,6,10
y,12,20


In [27]:
df_result = df1.iloc[:, [2, 4]] # all rows; columns at positions 2 and 4
df_result # an assignment displays nothing, so end the cell with the name to render the frame

Unnamed: 0,C,E
v,3,5
w,6,10
x,9,15
y,12,20
z,105,175


In [28]:
df1.iloc[0,0] #value = array[row_index][col_index]

1

In [34]:
# .ix is deprecated (removed in pandas 1.0): it allowed a combination of labels and positions; chain a label lookup with iloc instead, as below

In [29]:
df1['A'].iloc[0]

1