In [2]:
import numpy as np
import pandas as pd

# Syntax - creating dataframes

In [5]:
# Column-oriented construction: every dict key becomes a column and
# `index` supplies the row labels.
df = pd.DataFrame(
    data={
        "a": [4, 5, 6],
        "b": [7, 8, 9],
        "c": [10, 11, 12],
    },
    index=[1, 2, 3],
)

In [4]:
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [6]:
df["c"]

1    10
2    11
3    12
Name: c, dtype: int64

In [10]:
df[["a", "c"]]

Unnamed: 0,a,c
1,4,10
2,5,11
3,6,12


In [13]:
df.[1]   # Error

SyntaxError: invalid syntax (<ipython-input-13-25a8941f1944>, line 1)

In [12]:
df.loc[1]    # .loc selects by index label; a single label returns that row as a Series

a     4
b     7
c    10
Name: 1, dtype: int64

In [14]:
df.loc[1, "a"] # .loc with a row label and a column name returns the single value at that position

4

In [15]:
# Passing lists of row labels and column names selects a sub-DataFrame
df.loc[[1,2], ["a", "b"]]

Unnamed: 0,a,b
1,4,7
2,5,8


In [16]:
# Row-oriented construction: each inner list is one row; the column names
# and row labels are passed separately.
df = pd.DataFrame(
    data=[
        [4, 7, 10],
        [5, 8, 11],
        [6, 9, 12],
    ],
    columns=["a", "b", "c"],
    index=[1, 2, 3],
)

In [17]:
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [19]:
# DataFrame whose rows are labelled by a two-level MultiIndex
# (levels named "n" and "v"), built from a list of tuples.
df = pd.DataFrame(
    data={
        "a": [4, 5, 6],
        "b": [7, 8, 9],
        "c": [10, 11, 12],
    },
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2)],
        names=["n", "v"],
    ),
)

In [20]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


In [25]:
# With a MultiIndex, a row is addressed by a tuple holding one value per level
df.loc[('d',1), ["a","c"]]

a     4
c    10
Name: (d, 1), dtype: int64

# Subset observations (rows)

In [28]:
# Boolean indexing: keep only the rows where the mask df.b > 7 is True
df[df.b > 7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5,8,11
e,2,6,9,12


In [31]:
df.b > 7  # The comparison on its own yields a boolean Series sharing df's index

n  v
d  1    False
   2     True
e  2     True
Name: b, dtype: bool

In [33]:
# Bracket access works for any column name. Attribute access (df.b) only works
# when the name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method; otherwise use df['columnname'].
df['b'] > 7

n  v
d  1    False
   2     True
e  2     True
Name: b, dtype: bool

In [42]:
# Four-row MultiIndex frame; the last two rows hold identical values
# (6, 9, 12), so they are duplicates of each other.
df = pd.DataFrame(
    data={
        "a": [4, 5, 6, 6],
        "b": [7, 8, 9, 9],
        "c": [10, 11, 12, 12],
    },
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2), ("e", 3)],
        names=["n", "v"],
    ),
)

In [43]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [47]:
# keep='first' retains the first row of each group of duplicates, here ('e', 2)
df.drop_duplicates(keep='first')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


In [49]:
df.drop_duplicates(keep='last') # Which duplicate row survives depends on keep: with 'last' it is ('e', 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,3,6,9,12


In [50]:
df[df['b'] != 7]  # Filter rows with a condition evaluated on each row's value of b

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [54]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [53]:
# isin() tests element by element whether each value of a appears in the given list
df.a.isin([5])

n  v
d  1    False
   2     True
e  2    False
   3    False
Name: a, dtype: bool

In [57]:
# Frame with missing values: column a is missing in row ('e', 3) and
# columns b and c are missing in row ('e', 2). The np.nan entries make
# every column float-valued. numpy is imported in the notebook's import
# cell at the top rather than here, so all dependencies are visible in one place.
df = pd.DataFrame(
    data={
        "a": [4, 5, 6, np.nan],
        "b": [7, 8, np.nan, 9],
        "c": [10, 11, np.nan, 12],
    },
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2), ("e", 3)],
        names=["n", "v"],
    ),
)

In [58]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,
e,3,,9.0,12.0


In [59]:
pd.isnull(df)  # Element-wise mask: True wherever a cell holds a missing value

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,False,False,False
d,2,False,False,False
e,2,False,True,True
e,3,True,False,False


In [64]:
df.a.isnull()

n  v
d  1    False
   2    False
e  2    False
   3     True
Name: a, dtype: bool

In [66]:
# Summing a boolean mask counts its True values, i.e. the number of missing entries in a
df.a.isnull().sum()

1

In [70]:
pd.notnull(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,True,True,True
d,2,True,True,True
e,2,True,False,False
e,3,False,True,True


In [73]:
# Per-column count of non-missing values
pd.notnull(df).sum()

a    3
b    3
c    3
dtype: int64

In [74]:
# Summing the per-column counts gives the total number of non-missing cells
pd.notnull(df).sum().sum()

9

In [75]:
df.a.notnull()

n  v
d  1     True
   2     True
e  2     True
   3    False
Name: a, dtype: bool

* Element-wise logical operators on pandas objects: `&`, `|`, `~`, `^`, `df.any()`, `df.all()`
* They mean, respectively: and, or, not, xor, any, all

In [76]:
df.a.notnull()

n  v
d  1     True
   2     True
e  2     True
   3    False
Name: a, dtype: bool

In [77]:
# ~ inverts a boolean mask element-wise, so this gives the same result as df.a.isnull()
~df.a.notnull()

n  v
d  1    False
   2    False
e  2    False
   3     True
Name: a, dtype: bool

In [79]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,
e,3,,9.0,12.0


In [84]:
df[df.b==7] 

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0


In [85]:
df[df.a==5]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5.0,8.0,11.0


In [87]:
# Combine masks with | (element-wise or). Each comparison needs its own
# parentheses because | binds more tightly than ==.
df[(df.b==7) | (df.a==5)]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
