## Syntax – Creating DataFrames
* https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

In [1]:
import pandas as pd

In [6]:
import numpy as np
df = pd.DataFrame(
        {"a" : [4 ,5, 6, 6],
        "b" : [7, 8, 9, 9],
        "c" : [10, 11, 12, 12]},
        index = pd.MultiIndex.from_tuples(
        [('d',1),('d',2),('e',2),('e',3)],
        names=['n','v']))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


## Subset Observations (Rows)
* 전체 데이터프레임에서 일부만 가져오기

In [7]:
df[df['a'] < 7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [8]:
df[df['c'] >= 7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [9]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [10]:
# 중복 행을 제거
# keep 파라미터의 값; first == 중복행 첫번째만 남김
# last == 중복행의 마지막만 남김
df = df.drop_duplicates(keep='last')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,3,6,9,12


In [11]:
df["a"] != 7

n  v
d  1    True
   2    True
e  3    True
Name: a, dtype: bool

In [13]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,3,6,9,12


In [12]:
# 7이 아닌 값만 인덱싱하여 보여줌
df[df["b"] != 7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5,8,11
e,3,6,9,12


In [None]:
df.column.isin(values) # 특정 컬럼 안에 파라미터 밸류가 있는지 확인

In [14]:
df['a'].isin([5])

n  v
d  1    False
   2     True
e  3    False
Name: a, dtype: bool

In [16]:
df = pd.DataFrame(
        {"a" : [4 ,5, 6, 6, np.nan],
        "b" : [7, 8, np.nan, 9, 9],
        "c" : [10, 11, 12, np.nan, 12]},
        index = pd.MultiIndex.from_tuples(
        [('d',1),('d',2),('e',2),('e',3), ('e', 4)],
        names=['n','v']))

In [17]:
pd.isnull(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,False,False,False
d,2,False,False,False
e,2,False,True,False
e,3,False,False,True
e,4,True,False,False


In [19]:
# null이 있는 경우의 개수
df['a'].isnull().sum()

1

In [20]:
pd.notnull(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,True,True,True
d,2,True,True,True
e,2,True,False,True
e,3,True,True,False
e,4,False,True,True


In [21]:
df.notnull().sum()

a    4
b    4
c    4
dtype: int64

In [22]:
df.a.notnull()

n  v
d  1     True
   2     True
e  2     True
   3     True
   4    False
Name: a, dtype: bool

## Logic in Python (and pandas)
* &,|,~,^,df.any(),df.all() 
* and, or, not, xor, any, all

In [23]:
df.any()

a    True
b    True
c    True
dtype: bool

In [24]:
df[df.b == 7] & df[df.a == 5]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,,,
d,2,,,


In [25]:
df[df.b == 7] | df[df.a == 5]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,,,
d,2,,,


In [26]:
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0


In [27]:
df.sample(frac=0.3)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5.0,8.0,11.0
e,3,6.0,9.0,


In [28]:
df.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
e,3,6.0,9.0,
e,4,,9.0,12.0
d,2,5.0,8.0,11.0
e,2,6.0,,12.0


In [29]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [30]:
df.iloc[-2:]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,6.0,9.0,
e,4,,9.0,12.0


In [31]:
df = pd.DataFrame({'a': [1, 10, 8, 11, -1],
                'b': list('abdce'),
                'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
df

Unnamed: 0,a,b,c
0,1,a,1.0
1,10,b,2.0
2,8,d,
3,11,c,3.0
4,-1,e,4.0


In [32]:
df.nlargest(1, 'a')

Unnamed: 0,a,b,c
3,11,c,3.0


In [33]:
df.nsmallest(3, 'a')

Unnamed: 0,a,b,c
4,-1,e,4.0
0,1,a,1.0
2,8,d,
