# pandas 특징

- DataFrame, Series, Index 를 사용하여 데이터 처리 및 분석을 수행한다
    + 다양한 형태의 file, 생성자를 사용하여 DataFrame을 만든다
    + 위의 객체들은 <font color=red>ndarray를 기반</font>으로 데이터의 처리, 분석을 효율적으로 할 수 있다
    
![nn](./images/pandas-01.png)

# pandas 학습 내용

![nn](./images/pandas-02.png)

# CSV 파일 읽어 DataFrame으로 가져오기

- DataFrame 의 구조 : index, columns, values
- index(row에 대한) 와 columns(column에 대한) -> Index Type, 즉 DataFramedms 2개의 Index 사용
- values -> ndarray type

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("./data/easySample.csv", index_col="ID")
data.head(2)

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18030201,James Kim,1990-01-23,Education,1.0,1.0,
18030202,Rose Hwang,1992-10-11,Marketing,,2.0,


In [3]:
data.index

Int64Index([18030201, 18030202, 19030401, 19070101, 19070102, 19070103], dtype='int64', name='ID')

In [4]:
data.columns

Index(['pname', 'birth', 'dept', 'english', 'japanese', 'chinese'], dtype='object')

In [5]:
print(type(data.values), data.values.shape)
data.values

<class 'numpy.ndarray'> (6, 6)


array([['James Kim', '1990-01-23', 'Education', 1.0, 1.0, nan],
       ['Rose Hwang', '1992-10-11', 'Marketing', nan, 2.0, nan],
       ['Sam Park', '1995-07-02', 'Education', 1.0, nan, nan],
       ['Chris Jang', '1990-11-23', 'Education', nan, nan, 3.0],
       ['Grace Lee', '1993-02-01', 'Marketing', nan, nan, nan],
       ['Juile Yoon', '1992-07-16', 'Education', nan, nan, 1.0]],
      dtype=object)

In [6]:
data = pd.read_csv("./data/easySample_woHeader.csv", 
                   header=None,
                   names=['ID','name','birth', 'dept', 'english', 'japanese', 'chinese'],
                   index_col='ID')

In [7]:
data.head()

Unnamed: 0_level_0,name,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18030201,James Kim,1990-01-23,Education,1.0,1.0,
18030202,Rose Hwang,1992-10-11,Marketing,,2.0,
19030401,Sam Park,1995-07-02,Education,1.0,,
19070101,Chris Jang,1990-11-23,Education,,,3.0
19070102,Grace Lee,1993-02-01,Marketing,,,


In [8]:
data

Unnamed: 0_level_0,name,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18030201,James Kim,1990-01-23,Education,1.0,1.0,
18030202,Rose Hwang,1992-10-11,Marketing,,2.0,
19030401,Sam Park,1995-07-02,Education,1.0,,
19070101,Chris Jang,1990-11-23,Education,,,3.0
19070102,Grace Lee,1993-02-01,Marketing,,,
19070103,Juile Yoon,1992-07-16,Education,,,1.0


# DataFrame과 Series의 주요 구성 요소

## DataFrame 속성 확인하기

- `.index`, `.columns`, `.index.name` : Read-Write
- `.values`, `.index.values`, `.columns.values` : Read Only

In [9]:
def printAttr(w):
    print(type(w), w, sep="\n")
    print("-" * 75)

In [10]:
data = pd.read_csv("./data/easySample.csv", index_col=0)

In [11]:
data.head(3)

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18030201,James Kim,1990-01-23,Education,1.0,1.0,
18030202,Rose Hwang,1992-10-11,Marketing,,2.0,
19030401,Sam Park,1995-07-02,Education,1.0,,


In [12]:
data.tail(3)

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19070101,Chris Jang,1990-11-23,Education,,,3.0
19070102,Grace Lee,1993-02-01,Marketing,,,
19070103,Juile Yoon,1992-07-16,Education,,,1.0


In [13]:
printAttr(data.index)
printAttr(data.columns)

<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([18030201, 18030202, 19030401, 19070101, 19070102, 19070103], dtype='int64', name='ID')
---------------------------------------------------------------------------
<class 'pandas.core.indexes.base.Index'>
Index(['pname', 'birth', 'dept', 'english', 'japanese', 'chinese'], dtype='object')
---------------------------------------------------------------------------


In [14]:
printAttr(data.values)
printAttr(data.index.values)
printAttr(data.columns.values)
printAttr(data.index.name)

<class 'numpy.ndarray'>
[['James Kim' '1990-01-23' 'Education' 1.0 1.0 nan]
 ['Rose Hwang' '1992-10-11' 'Marketing' nan 2.0 nan]
 ['Sam Park' '1995-07-02' 'Education' 1.0 nan nan]
 ['Chris Jang' '1990-11-23' 'Education' nan nan 3.0]
 ['Grace Lee' '1993-02-01' 'Marketing' nan nan nan]
 ['Juile Yoon' '1992-07-16' 'Education' nan nan 1.0]]
---------------------------------------------------------------------------
<class 'numpy.ndarray'>
[18030201 18030202 19030401 19070101 19070102 19070103]
---------------------------------------------------------------------------
<class 'numpy.ndarray'>
['pname' 'birth' 'dept' 'english' 'japanese' 'chinese']
---------------------------------------------------------------------------
<class 'str'>
ID
---------------------------------------------------------------------------


In [15]:
data.index = list("abcdef")
data.columns = list("ABCDEF")
data.index.name = "myIndex"

In [16]:
data

Unnamed: 0_level_0,A,B,C,D,E,F
myIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,James Kim,1990-01-23,Education,1.0,1.0,
b,Rose Hwang,1992-10-11,Marketing,,2.0,
c,Sam Park,1995-07-02,Education,1.0,,
d,Chris Jang,1990-11-23,Education,,,3.0
e,Grace Lee,1993-02-01,Marketing,,,
f,Juile Yoon,1992-07-16,Education,,,1.0


In [17]:
import numpy as np
# data.index.values = np.array(list("abcdef"))

## Series 속성 확인하기

- Series : DataFrame 에서 하나의 Column 
- DataFrame 의 Index 를 포함

In [18]:
data = pd.read_csv("./data/easySample.csv", index_col=0)

In [19]:
data.columns

Index(['pname', 'birth', 'dept', 'english', 'japanese', 'chinese'], dtype='object')

In [20]:
eng = data['english']
printAttr(eng)

<class 'pandas.core.series.Series'>
ID
18030201    1.0
18030202    NaN
19030401    1.0
19070101    NaN
19070102    NaN
19070103    NaN
Name: english, dtype: float64
---------------------------------------------------------------------------


In [21]:
printAttr(eng.index)
printAttr(eng.values)
printAttr(eng.index.values)

# DataFrame과 동일하게 .index 는 Read-Write
# DataFrame과 동일하게 .values, .index.values 는 Read Only

<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([18030201, 18030202, 19030401, 19070101, 19070102, 19070103], dtype='int64', name='ID')
---------------------------------------------------------------------------
<class 'numpy.ndarray'>
[ 1. nan  1. nan nan nan]
---------------------------------------------------------------------------
<class 'numpy.ndarray'>
[18030201 18030202 19030401 19070101 19070102 19070103]
---------------------------------------------------------------------------


# Index 타입의 종류 - RangeIndex, Int64Index, Index, DatetimeIndex

In [22]:
def printObj(df):
    print(df.index)
    print(type(df.index))
    print("-" * 50)

In [23]:
import pandas as pd
import numpy as np

In [24]:
a = pd.RangeIndex(10)
b = pd.RangeIndex(10000)

import sys
print(sys.getsizeof(a), sys.getsizeof(b))
print(a, b)

144 144
RangeIndex(start=0, stop=10, step=1) RangeIndex(start=0, stop=10000, step=1)


In [25]:
df1 = pd.read_csv('./data/easySample.csv')
printObj(df1)

RangeIndex(start=0, stop=6, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>
--------------------------------------------------


In [26]:
df1.head()

Unnamed: 0,ID,pname,birth,dept,english,japanese,chinese
0,18030201,James Kim,1990-01-23,Education,1.0,1.0,
1,18030202,Rose Hwang,1992-10-11,Marketing,,2.0,
2,19030401,Sam Park,1995-07-02,Education,1.0,,
3,19070101,Chris Jang,1990-11-23,Education,,,3.0
4,19070102,Grace Lee,1993-02-01,Marketing,,,


In [27]:
df2 = pd.read_csv('./data/easySample.csv', index_col="ID")
printObj(df2)

Int64Index([18030201, 18030202, 19030401, 19070101, 19070102, 19070103], dtype='int64', name='ID')
<class 'pandas.core.indexes.numeric.Int64Index'>
--------------------------------------------------


In [28]:
df3 = pd.read_csv('./data/easySample.csv', index_col="pname")
printObj(df3)

Index(['James Kim', 'Rose Hwang', 'Sam Park', 'Chris Jang', 'Grace Lee',
       'Juile Yoon'],
      dtype='object', name='pname')
<class 'pandas.core.indexes.base.Index'>
--------------------------------------------------


In [29]:
df4 = pd.read_csv('./data/easySample.csv', index_col="birth")
df4.index = pd.to_datetime(df4.index)
printObj(df4)

DatetimeIndex(['1990-01-23', '1992-10-11', '1995-07-02', '1990-11-23',
               '1993-02-01', '1992-07-16'],
              dtype='datetime64[ns]', name='birth', freq=None)
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
--------------------------------------------------


In [30]:
print(issubclass(pd.Int64Index, pd.Index))
print(issubclass(pd.RangeIndex, pd.Index))
print(issubclass(pd.DatetimeIndex, pd.Index))

True
True
True


  print(issubclass(pd.Int64Index, pd.Index))


# index 및 column의 데이터 타입(dtype) 확인

- 올바른 dtype 의 중요성
1) 적합한 연산, Method 사용을 할 수 있음
2) 메모리의 사용을 절약할 수 있음 

In [31]:
data = pd.read_csv("./data/easySample.csv", index_col="ID")
print(data.columns)

Index(['pname', 'birth', 'dept', 'english', 'japanese', 'chinese'], dtype='object')


In [32]:
print(data.dtypes, end="\n\n")
print(data.index.dtype, end="\n\n")
print(data['pname'].dtype, end="\n\n")
print(data.memory_usage(index=True, deep=True), end="\n\n")

pname        object
birth        object
dept         object
english     float64
japanese    float64
chinese     float64
dtype: object

int64

object

Index        48
pname       398
birth       402
dept        396
english      48
japanese     48
chinese      48
dtype: int64



# dtype 변경 - Series.astype(), pd.to_datetime()

- https://pandas.pydata.org/docs/reference/api/pandas.Series.astype.html
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html
- https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

In [33]:
data.dtypes

pname        object
birth        object
dept         object
english     float64
japanese    float64
chinese     float64
dtype: object

In [34]:
data

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18030201,James Kim,1990-01-23,Education,1.0,1.0,
18030202,Rose Hwang,1992-10-11,Marketing,,2.0,
19030401,Sam Park,1995-07-02,Education,1.0,,
19070101,Chris Jang,1990-11-23,Education,,,3.0
19070102,Grace Lee,1993-02-01,Marketing,,,
19070103,Juile Yoon,1992-07-16,Education,,,1.0


In [35]:
data['dept'] = data['dept'].astype('category')
data['birth'] = pd.to_datetime(data['birth'])
data.loc[:, 'english':'chinese'] = data.loc[:, 'english':'chinese'].astype('float32')

  data.loc[:, 'english':'chinese'] = data.loc[:, 'english':'chinese'].astype('float32')


In [36]:
print(data.dtypes, end='\n\n')
print(data.memory_usage(index=True, deep=True))

pname               object
birth       datetime64[ns]
dept              category
english            float32
japanese           float32
chinese            float32
dtype: object

Index        48
pname       398
birth        48
dept        246
english      24
japanese     24
chinese      24
dtype: int64


# DataFrame의 저장 - to_csv(), shelve

- csv 로 저정하면 dtype 이 유지되지 않을 수 있다
- datetime, category 등이 object로 저장됨
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
- dtype을 유지하면서 파일로 저장하기
    + shelve 바이너리로 파일 저장

In [37]:
data.to_csv('./data/mydata.csv')

In [38]:
dfx = pd.read_csv('./data/mydata.csv', index_col='ID')
dfx

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18030201,James Kim,1990-01-23,Education,1.0,1.0,
18030202,Rose Hwang,1992-10-11,Marketing,,2.0,
19030401,Sam Park,1995-07-02,Education,1.0,,
19070101,Chris Jang,1990-11-23,Education,,,3.0
19070102,Grace Lee,1993-02-01,Marketing,,,
19070103,Juile Yoon,1992-07-16,Education,,,1.0


In [39]:
dfx.dtypes

pname        object
birth        object
dept         object
english     float64
japanese    float64
chinese     float64
dtype: object

In [40]:
import shelve

with shelve.open('./data/mysample') as ms:
    # mysample 파일에 sample 을 key 로 하여 data 를 저장
    ms['sample'] = data
    # mysample 파일에 sample 을 key 로 하여 저장된 객체 가져오기
    dfx = ms['sample']
    print(dfx.dtypes)

pname               object
birth       datetime64[ns]
dept              category
english            float32
japanese           float32
chinese            float32
dtype: object


In [41]:
data = pd.read_csv('./data/easySampleLong.csv', index_col='ID')
print(data.shape)
print(data.dtypes, end='\n\n')
print(data.memory_usage(index=True, deep=True), end='\n\n')
data.head()

(1000, 3)
pname    object
birth    object
dept     object
dtype: object

Index     8000
pname    66364
birth    67000
dept     66000
dtype: int64



Unnamed: 0_level_0,pname,birth,dept
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18030201,James Kim,1990-01-23,Education
18030202,Rose Hwang,1992-10-11,Marketing
19030401,Sam Park,1995-07-02,Education
19070101,Chris Jang,1990-11-23,Education
19070102,Grace Lee,1993-02-01,Marketing


In [42]:
data['dept'] = data['dept'].astype('category')
data['birth'] = pd.to_datetime(data['birth'])

print(data.shape)
print(data.dtypes, end='\n\n')
print(data.memory_usage(index=True, deep=True), end='\n\n')
data.head()

(1000, 3)
pname            object
birth    datetime64[ns]
dept           category
dtype: object

Index     8000
pname    66364
birth     8000
dept      1240
dtype: int64



Unnamed: 0_level_0,pname,birth,dept
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18030201,James Kim,1990-01-23,Education
18030202,Rose Hwang,1992-10-11,Marketing
19030401,Sam Park,1995-07-02,Education
19070101,Chris Jang,1990-11-23,Education
19070102,Grace Lee,1993-02-01,Marketing


# Series 생성, 메서드, 연산

## Series 생성

- `pd.Series(data=None, index=None, dtype=None, name=None, copy=False, ...)`
1) data: array-like, iterable, dictionary or scalar value 등을 사용할 수 있음
2) index : array-like or index(1D)
    + index, values 는 hashable 객체이어야 하며, datadhk 같은 길이이어야 함
    + 생략 시 RangeIndex 가 사용됨 (0 부터 1씩 증가하는 숫자 values)
    + data가 dictionary 객체일 떄 index 가 생략되면, dict 객체의 key 를 index 로 사용
    + dict 객체의 key 와 index 가 중복되면 index 를 따름 (dict의 key가 갱신됨)
3) dtype : Series.values 의 데이터 타입
    + str, numpy.dtype, ExtensionDtype 등을 사용할 수 있음
4) name : Seriess.name 으로 사용할 것으로 문자열로 지정함
5) copy : 입력 데이터를 복사하는지에 대한 여부 (ndarray에만 해당)
    + True 인 경우 복사 (다른 메모리 사용), False 라도 dtype이 다르면 복사함

In [43]:
import pandas as pd
import numpy as np

def printObj(*a):
    for x in a:
        print(x)
        print("-" * 20)

In [44]:
score = (80, 90, 100, 95)
name = ["Kim", "Yoon", "Choi", "Park"]

s1 = pd.Series(score, dtype='<i4')  # np.int32, 'int', '<i4'
s1 = s1.astype(np.int64)
s2 = pd.Series(name)
s3 = pd.Series(np.arange(50, 54))

printObj(s1, s2, s3)

0     80
1     90
2    100
3     95
dtype: int64
--------------------
0     Kim
1    Yoon
2    Choi
3    Park
dtype: object
--------------------
0    50
1    51
2    52
3    53
dtype: int64
--------------------


In [45]:
score = (80, 90, 100, 95)
name = ["Kim", "Yoon", "Choi", "Park"]

s1 = pd.Series(score, index=name, name="score")
s2 = pd.Series(np.random.randint(70, 90, 4), index=s1.index)

printObj(s1, s2)

Kim      80
Yoon     90
Choi    100
Park     95
Name: score, dtype: int64
--------------------
Kim     75
Yoon    75
Choi    76
Park    88
dtype: int64
--------------------


In [46]:
s2.name = "score"
s2.index = pd.RangeIndex(len(s2))

printObj(s1, s2)

Kim      80
Yoon     90
Choi    100
Park     95
Name: score, dtype: int64
--------------------
0    75
1    75
2    76
3    88
Name: score, dtype: int64
--------------------


In [47]:
score = (80, 90, 100, 95)
name = ["Kim", "Yoon", "Choi", "Park"]

# Dictionary 는 순서를 보장하지 않음
mydata = dict(zip(name, score))

mydata

{'Kim': 80, 'Yoon': 90, 'Choi': 100, 'Park': 95}

In [48]:
name_p = ["Song", "Kim", "Lee", "Choi", 'Park']

s3 = pd.Series(mydata, dtype=np.int32)
# name_p 에 있으나, mydata 의 name 에는 없는 것은 NaN 으로 값이 채워짐
s4 = pd.Series(mydata, index=name_p)

printObj(s3, s4)

Kim      80
Yoon     90
Choi    100
Park     95
dtype: int32
--------------------
Song      NaN
Kim      80.0
Lee       NaN
Choi    100.0
Park     95.0
dtype: float64
--------------------


## Series 의 메서드

- pandas 의 함수/메서드들은 일반적으로 NaN에 대해 제외하고 처리한다 (numpy와 다름)
- `s.sum()`: NaN 를 제외한 데이터 합계 구하기, float 반환
- `s.count()`: NaN 를 제외한 데이터 갯수 구하기, int 반환
- `s.mean()`: NaN 를 제외한 데이터 평균 구하기, float 반환
- `s.unique()`: 중복 데이터를 제외한 데이터의 ndarray로 변환 (NaN 포함)
- `s.value_counts()`: NaN 를 제외한 데이터의 갯수(정수)의 Series 반환
- `s.head(숫자)`, : `s.tail(숫자)` : 데이터 상위/하위 숫자 개만큼의 Series 반환
- `s.to_list()`: s.values 를 list 객체로 반환
- `s.to_numpy([dtype, copy])`: s.values 를 ndarray 객체로 반환

In [49]:
import pandas as pd
import numpy as np

def printObj(*a):
    for x in a:
        print(type(x))
        print(x)
        print("-" * 25)

In [50]:
arr = np.array([4, 2, 2, np.nan, np.nan, 6, 7, 6, 7], dtype=np.float64)
s = pd.Series(arr)
s

0    4.0
1    2.0
2    2.0
3    NaN
4    NaN
5    6.0
6    7.0
7    6.0
8    7.0
dtype: float64

In [51]:
a = s.sum()
b = s.count()
c = s.mean()

printObj(a, b, c, a/b)

<class 'numpy.float64'>
34.0
-------------------------
<class 'numpy.int64'>
7
-------------------------
<class 'numpy.float64'>
4.857142857142857
-------------------------
<class 'numpy.float64'>
4.857142857142857
-------------------------


In [52]:
d = s.unique()
e = s.value_counts()

printObj(d, e)

<class 'numpy.ndarray'>
[ 4.  2. nan  6.  7.]
-------------------------
<class 'pandas.core.series.Series'>
2.0    2
6.0    2
7.0    2
4.0    1
dtype: int64
-------------------------


In [53]:
printObj(s.to_list(), s.to_numpy())

<class 'list'>
[4.0, 2.0, 2.0, nan, nan, 6.0, 7.0, 6.0, 7.0]
-------------------------
<class 'numpy.ndarray'>
[ 4.  2.  2. nan nan  6.  7.  6.  7.]
-------------------------


## Series 의 연산

- index 를 기준으로 연산된다
- Series 와 Series 의 연산은 같은 index 의 value 끼리 연산된다
    + 같은 index 가 없는 경우, 추가되며 결과는 NaN 임
- Series 와 스칼라의 연산은 각 원소별로 스칼라와 연산된다
    + 스칼라가 broadcasting 되어 사용된다

In [54]:
import pandas as pd
import numpy as np

def printObj(*a):
    for x in a:
        print(type(x))
        print(x)
        print("-" * 25)

In [55]:
a = np.arange(1, 6)
b = np.arange(6, 11)

s1 = pd.Series(a)
s2 = pd.Series(b)

printObj(s1, s2, s1 + s2)

<class 'pandas.core.series.Series'>
0    1
1    2
2    3
3    4
4    5
dtype: int64
-------------------------
<class 'pandas.core.series.Series'>
0     6
1     7
2     8
3     9
4    10
dtype: int64
-------------------------
<class 'pandas.core.series.Series'>
0     7
1     9
2    11
3    13
4    15
dtype: int64
-------------------------


In [56]:
a = np.arange(1, 6)
b = np.arange(6, 10)
n1 = ['A','B','C','D','E']
n2 = ['A','X','Y','D']

s1 = pd.Series(a, index=n1)
s2 = pd.Series(b, index=n2)
s3 = s1.add(s2, fill_value=0).astype(np.int32)

printObj(s1, s2, s1 + s2, s3)

<class 'pandas.core.series.Series'>
A    1
B    2
C    3
D    4
E    5
dtype: int64
-------------------------
<class 'pandas.core.series.Series'>
A    6
X    7
Y    8
D    9
dtype: int64
-------------------------
<class 'pandas.core.series.Series'>
A     7.0
B     NaN
C     NaN
D    13.0
E     NaN
X     NaN
Y     NaN
dtype: float64
-------------------------
<class 'pandas.core.series.Series'>
A     7
B     2
C     3
D    13
E     5
X     7
Y     8
dtype: int32
-------------------------


In [58]:
a = np.arange(1, 6)
n1 = ['A','B','C','D','E']

s1 = pd.Series(a, index=n1)

printObj(s1, s1 + 2)

<class 'pandas.core.series.Series'>
A    1
B    2
C    3
D    4
E    5
dtype: int64
-------------------------
<class 'pandas.core.series.Series'>
A    3
B    4
C    5
D    6
E    7
dtype: int64
-------------------------


# DataFrame 생성, 메서드, 연산

## DataFrame 생성

- `pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)`
- data: ndarray(structured or homogeneous), iterable, dict, DataFrame
    + dict 사용 시 columns 가 지정되지 않으면 dict 의 key 가 columns 로 사용됨
    + DataFrame 사용 시 data, index, columns 가 복사됨
- index, columns: array-like or index (1D)
    + 생략 시 RangeIndex 가 사용됨 (0부터 1씩 증가하는 숫자 values)
    + index.values 는 hashable 객체이어야 함
    + data로 dict 사용 시, dict의 key와 columns가 중복될 경우 columns 에 따름
    + data로 ndarray 사용 시 index.values 갯수 == data 행 갯수
    + data로 ndarray 사용 시 columns.values 갯수 == data 열 갯수
- dtype: 직접 데이터 타입을 지정하며, 1개의 타입 지정만 가능함
- copy: ndarray(2D), DataFrame을 data로 사용 시 copy=False는 연결을 의미함
    + ndarray의 dtype이나, DataFrame의 index, columns 등이 변경이 되면 copy=True로 동작함

In [59]:
import pandas as pd
import numpy as np

def printObj(*a):
    for x in a:
        print(type(x))
        print(x)
        print("-" * 25)

In [60]:
ID = [1900101, 1900102, 1900103, 1900104]
name = ['Kim', 'Yoon', 'Choi', 'Park']
data = {'name': name, 
        'english': [80, 90, 100, 95], 
        'chinese': [100, 80, 70, 85], 
        'korean' : [95, 100, 80, 60], }

In [61]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,english,chinese,korean
0,Kim,80,100,95
1,Yoon,90,80,100
2,Choi,100,70,80
3,Park,95,85,60


In [70]:
df = pd.DataFrame(data, index=ID, columns= ['name', 'english', 'temp'])
df

Unnamed: 0,name,english,temp
1900101,Kim,80,
1900102,Yoon,90,
1900103,Choi,100,
1900104,Park,95,


In [71]:
df.index.name = 'ID'
df['korean'] = [100] * 4
df['english'] = [90, 80, 100, 100]

In [73]:
df

Unnamed: 0_level_0,name,english,temp,korean
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900101,Kim,90,,100
1900102,Yoon,80,,100
1900103,Choi,100,,100
1900104,Park,100,,100


In [77]:
a = np.random.randint(80, 100, size=(4, 3)).astype(np.float16)

df = pd.DataFrame(a, 
                  index=ID, 
                  columns=['english', 'japanese', 'chineses'], 
                  copy=True)
df.index.name = 'ID'
a[0] = [100] * 3

printObj(a, df)

<class 'numpy.ndarray'>
[[100. 100. 100.]
 [ 92.  80.  97.]
 [ 94.  95.  80.]
 [ 83.  95.  81.]]
-------------------------
<class 'pandas.core.frame.DataFrame'>
         english  japanese  chineses
ID                                  
1900101     85.0      87.0      85.0
1900102     92.0      80.0      97.0
1900103     94.0      95.0      80.0
1900104     83.0      95.0      81.0
-------------------------


## DataFrame 의 연산

- DataFrame 끼리의 연산은 index 와 column 을 모두 대상으로 한다
- df + df 의 경우 match 되는 index 나 column 이 없는 경우 NaN 으로 결과가 표시된다

In [78]:
import pandas as pd
import numpy as np

def printObj(*a):
    for x in a:
        print(type(x))
        print(x)
        print("-" * 25)

In [82]:
df1 = pd.DataFrame(np.full((2,3), 1), 
                   index=list("AB"),
                   columns=list("abc"))
df2 = pd.DataFrame(np.full((2,3), 2), 
                   index=list("AB"),
                   columns=list("abc"))

printObj(df1, df2, df1 + df2)

<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  1  1  1
B  1  1  1
-------------------------
<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  2  2  2
B  2  2  2
-------------------------
<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  3  3  3
B  3  3  3
-------------------------


In [85]:
df1 = pd.DataFrame(np.full((2,4), 1), 
                   index=list("CB"),
                   columns=list("abcd"))
df2 = pd.DataFrame(np.full((2,3), 2), 
                   index=list("AB"),
                   columns=list("acd"))

df3 = df1 + df2
df4 = df1.add(df2, fill_value=0)

printObj(df1, df2, df3, df4)

<class 'pandas.core.frame.DataFrame'>
   a  b  c  d
C  1  1  1  1
B  1  1  1  1
-------------------------
<class 'pandas.core.frame.DataFrame'>
   a  c  d
A  2  2  2
B  2  2  2
-------------------------
<class 'pandas.core.frame.DataFrame'>
     a   b    c    d
A  NaN NaN  NaN  NaN
B  3.0 NaN  3.0  3.0
C  NaN NaN  NaN  NaN
-------------------------
<class 'pandas.core.frame.DataFrame'>
     a    b    c    d
A  2.0  NaN  2.0  2.0
B  3.0  1.0  3.0  3.0
C  1.0  1.0  1.0  1.0
-------------------------


![nn](./images/pandas-03.png)

- DataFrame + Series 은 DataFrame의 column, Series 의 index 에 맞춰 연산된다
- Series 가 DataFrame의 row 갯수에 맞춰 broadcasting 되어 사용된다
- match 되는 index 가 없는 경우 결과는 NaN 임

In [89]:
df = pd.DataFrame(np.full((2,3), 2), 
                  index=list("AB"),
                  columns=list("abc"))
s1 = pd.Series(np.full(3,1),
               index=list("abc"))
s2 = pd.Series(np.full(3,3),
               index=list("abx"))

In [95]:
df2 = df + s1
printObj(df, s1, df2)

<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  2  2  2
B  2  2  2
-------------------------
<class 'pandas.core.series.Series'>
a    1
b    1
c    1
dtype: int64
-------------------------
<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  3  3  3
B  3  3  3
-------------------------


In [96]:
df3 = df + s2
printObj(df, s2, df3)

<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  2  2  2
B  2  2  2
-------------------------
<class 'pandas.core.series.Series'>
a    3
b    3
x    3
dtype: int64
-------------------------
<class 'pandas.core.frame.DataFrame'>
     a    b   c   x
A  5.0  5.0 NaN NaN
B  5.0  5.0 NaN NaN
-------------------------


- DataFrame + Scalar 는 각 원소별로 스칼라와 연산된다
- 스칼라가 broadcasting 되어 사용된다

In [97]:
df1 = df + 2
printObj(df, df1)

<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  2  2  2
B  2  2  2
-------------------------
<class 'pandas.core.frame.DataFrame'>
   a  b  c
A  4  4  4
B  4  4  4
-------------------------


# DataFrame, Series의 삽입, 삭제, 갱신

## Series 추가, 갱신, 삭제

In [98]:
import pandas as pd

In [99]:
def printObj(*a):
    for x in a:
        print(type(x))
        print(x)
        print("-" * 25)

In [107]:
ID = [1900101, 1900102, 1900103, 1900104]
name = ['Kim','Yoon','Choi','Park']

In [110]:
s = pd.Series(ID, index=name)
s

Kim     1900101
Yoon    1900102
Choi    1900103
Park    1900104
dtype: int64

In [111]:
s['Yoon'] = 1900100
s['Lee'] = 1900200

del s['Kim']
#del s.Kim

printObj(s, s['Park'], s.Choi)

<class 'pandas.core.series.Series'>
Yoon    1900100
Choi    1900103
Park    1900104
Lee     1900200
dtype: int64
-------------------------
<class 'numpy.int64'>
1900104
-------------------------
<class 'numpy.int64'>
1900103
-------------------------


## DataFrame 추가, 갱신, 삭제

- 추가, 갱신: 항의 갯수가 같아야 함
    + column 단위로 추가, 갱신 작업
- df['column_label'] = 1D array
    + column_label 에 해당하는 column 갱신 또는 추가
    + read/write 용으로 모두 사용
- df.column_label = 1D array
    + read 용으로 사용

In [112]:
ID = [1900101, 1900102, 1900103, 1900104]
name = ['Kim','Yoon','Choi','Park']
data = {'name': name, 
        'english': [80,90,100,95],
        'korean': [95,100,80,60]}

In [114]:
df = pd.DataFrame(data, index=ID)
df.index.name = 'ID'
df

Unnamed: 0_level_0,name,english,korean
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1900101,Kim,80,95
1900102,Yoon,90,100
1900103,Choi,100,80
1900104,Park,95,60


In [116]:
eng1 = df['english']
eng2 = df.english

df['korean'] = [100] * 4
#df.chinese = [200] * 4
df['chinese'] = [100, 80, 70, 85]

printObj(eng1 is eng2, df)

<class 'bool'>
True
-------------------------
<class 'pandas.core.frame.DataFrame'>
         name  english  korean  chinese
ID                                     
1900101   Kim       80     100      100
1900102  Yoon       90     100       80
1900103  Choi      100     100       70
1900104  Park       95     100       85
-------------------------


In [117]:
del df['chinese']

printObj(df)

<class 'pandas.core.frame.DataFrame'>
         name  english  korean
ID                            
1900101   Kim       80     100
1900102  Yoon       90     100
1900103  Choi      100     100
1900104  Park       95     100
-------------------------


In [126]:
s = pd.Series([1,2,3], index=['A','B','sum'])
# s.sum = 10
s['sum'] = 10
print(s.sum())
display(s)

13


A       1
B       2
sum    10
dtype: int64

# Index 조작

## set_index

- `df.set_index(keys, drop=True, append=False, inplace=False)`
- 선택된 columns 의 일부로 새로 구성한 index 를 갖는 DataFrame 반환
- keys : 새로운 index 생성에 사용할 column/column 목록 (columns labels 로 작성)
- drop : keys 를 columns 로 부터 제거 여부 (default True -> 제거함)
- append : 기존 index 를 유지하면서 keys 를 추가 여부 (default False -> 유지하지 않음)

In [128]:
df = pd.read_csv('./data/easySampleIndex.csv', index_col='pname')
df.head()

Unnamed: 0_level_0,dept,gender,age,salary
pname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
James Kim,Education,Female,36,4700
Rose Hwang,Marketing,Male,35,4320
Sam Park,Education,Female,29,5600
Chris Jang,Education,Female,33,4500
Grace Lee,Marketing,Male,30,3150


In [131]:
df1 = df.set_index('age')
df1.head()

Unnamed: 0_level_0,dept,gender,salary
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36,Education,Female,4700
35,Marketing,Male,4320
29,Education,Female,5600
33,Education,Female,4500
30,Marketing,Male,3150


In [132]:
df2 = df.set_index('age', drop=False)
df2.head()

Unnamed: 0_level_0,dept,gender,age,salary
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
36,Education,Female,36,4700
35,Marketing,Male,35,4320
29,Education,Female,29,5600
33,Education,Female,33,4500
30,Marketing,Male,30,3150


In [133]:
df3 = df.set_index('age', append=True)
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,dept,gender,salary
pname,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
James Kim,36,Education,Female,4700
Rose Hwang,35,Marketing,Male,4320
Sam Park,29,Education,Female,5600
Chris Jang,33,Education,Female,4500
Grace Lee,30,Marketing,Male,3150


In [138]:
df4 = df.set_index(['dept', 'gender'])
df4.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,salary
dept,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Education,Female,36,4700
Marketing,Male,35,4320
Education,Female,29,5600
Education,Female,33,4500
Marketing,Male,30,3150


In [139]:
df5 = df.set_index(['dept', 'gender'], append=True)
df5.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,salary
pname,dept,gender,Unnamed: 3_level_1,Unnamed: 4_level_1
James Kim,Education,Female,36,4700
Rose Hwang,Marketing,Male,35,4320
Sam Park,Education,Female,29,5600
Chris Jang,Education,Female,33,4500
Grace Lee,Marketing,Male,30,3150


## reset_index

- `df.reset_index(level=None, drop=False, ...)`
- 현 index를 columns 에 포함하여 새로 구성한 index를 갖는 DataFrame 반환
- level : int, str, tuple or list 로 작성 (None 이면 모든 index 를 대상으로 함)
    + 현 index 중 columns 로 포함할 대상을 번호 또는 name 으로 지정함
    + 모든 index 가 columns 로 포함되면 RangeIndex 가 새로운 index 로 생성됨
- drop : 현 index 를 columns 로 포함하지 않을 지의 여부 (drop=True : 포함 안 함)

In [140]:
df = pd.read_csv('./data/easySampleIndex.csv', index_col='pname')
df.head()

Unnamed: 0_level_0,dept,gender,age,salary
pname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
James Kim,Education,Female,36,4700
Rose Hwang,Marketing,Male,35,4320
Sam Park,Education,Female,29,5600
Chris Jang,Education,Female,33,4500
Grace Lee,Marketing,Male,30,3150


In [143]:
df1 = df.reset_index(0)
display(df1.head())
display(df1.index)

Unnamed: 0,pname,dept,gender,age,salary
0,James Kim,Education,Female,36,4700
1,Rose Hwang,Marketing,Male,35,4320
2,Sam Park,Education,Female,29,5600
3,Chris Jang,Education,Female,33,4500
4,Grace Lee,Marketing,Male,30,3150


RangeIndex(start=0, stop=10, step=1)

In [144]:
df2 = df.reset_index('pname')
display(df2.head())
display(df2.index)

Unnamed: 0,pname,dept,gender,age,salary
0,James Kim,Education,Female,36,4700
1,Rose Hwang,Marketing,Male,35,4320
2,Sam Park,Education,Female,29,5600
3,Chris Jang,Education,Female,33,4500
4,Grace Lee,Marketing,Male,30,3150


RangeIndex(start=0, stop=10, step=1)

In [145]:
df3 = df.reset_index()
display(df3.head())
display(df3.index)

Unnamed: 0,pname,dept,gender,age,salary
0,James Kim,Education,Female,36,4700
1,Rose Hwang,Marketing,Male,35,4320
2,Sam Park,Education,Female,29,5600
3,Chris Jang,Education,Female,33,4500
4,Grace Lee,Marketing,Male,30,3150


RangeIndex(start=0, stop=10, step=1)

In [147]:
df = df.set_index(['dept', 'gender'], append=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,salary
pname,dept,gender,Unnamed: 3_level_1,Unnamed: 4_level_1
James Kim,Education,Female,36,4700
Rose Hwang,Marketing,Male,35,4320
Sam Park,Education,Female,29,5600
Chris Jang,Education,Female,33,4500
Grace Lee,Marketing,Male,30,3150


In [148]:
df4 = df.reset_index(['pname', 'dept'])
df4.head()

Unnamed: 0_level_0,pname,dept,age,salary
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,James Kim,Education,36,4700
Male,Rose Hwang,Marketing,35,4320
Female,Sam Park,Education,29,5600
Female,Chris Jang,Education,33,4500
Male,Grace Lee,Marketing,30,3150


In [149]:
df5 = df.reset_index([1,2], drop=True)
df5.head()

Unnamed: 0_level_0,age,salary
pname,Unnamed: 1_level_1,Unnamed: 2_level_1
James Kim,36,4700
Rose Hwang,35,4320
Sam Park,29,5600
Chris Jang,33,4500
Grace Lee,30,3150


## sort_index

- `df.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', ...)`
- 축에 따른 index labels를 기준으로 대상을 정렬하는 메소드이다
- axis : 정렬 축 지정, DataFrame은 0, 1 / Series 은 0 을 사용할 수 없음
- level : int, level-name, list of int or level-names
    + MultiIndex 에서 정렬 기준으로 사용할 level 을 지정 함
- ascending : True - 오름차순 정렬, False - 내림차순 정렬, list of boolean
- inplace : True 인 경우 객체를 직접 수정하고 None 반환
- kind : 정렬 알고리즘 지정 {'quicksort', 'mergesort', 'heapsort'}
- na_position : NA value 의 위치 {'first', 'last'}

In [151]:
import pandas as pd
import shelve

df = shelve.open('./data/mysample')['sample3']
df.head()

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese,salary,overtime
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18030201,James Kim,1990-01-23,Education,1,1,0,3456,0 days 23:10:10
18030202,Rose Hwang,1992-10-11,Marketing,0,2,0,4320,0 days 10:15:17
19030401,Sam Park,1995-07-02,Education,1,0,0,5600,0 days 16:21:10
19070101,Chris Jang,1990-11-23,Education,0,0,3,4500,0 days 15:00:20
19070102,Grace Lee,1993-02-01,Marketing,0,0,0,3150,0 days 21:19:50


In [152]:
df.index

Int64Index([18030201, 18030202, 19030401, 19070101, 19070102, 19070103,
            19080101, 19080102, 19090201, 19090202],
           dtype='int64', name='ID')

In [153]:
df.columns

Index(['pname', 'birth', 'dept', 'english', 'japanese', 'chinese', 'salary',
       'overtime'],
      dtype='object')

In [154]:
df1 = df.sort_index(ascending=False)
df1.head()

Unnamed: 0_level_0,pname,birth,dept,english,japanese,chinese,salary,overtime
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19090202,Anne Lee,1993-05-05,Education,0,3,1,4750,0 days 19:50:20
19090201,John Park,1992-06-16,Sales,3,1,2,6840,0 days 17:30:20
19080102,Bob Kim,1991-12-07,Sales,1,1,1,10100,0 days 08:40:40
19080101,Chirle Song,1993-04-11,Accounting,2,0,0,4800,0 days 09:50:30
19070103,Juile Yoon,1992-07-16,Education,0,0,1,4200,0 days 14:10:40


In [155]:
df2 = df.sort_index(axis=1)
df2.head()

Unnamed: 0_level_0,birth,chinese,dept,english,japanese,overtime,pname,salary
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18030201,1990-01-23,0,Education,1,1,0 days 23:10:10,James Kim,3456
18030202,1992-10-11,0,Marketing,0,2,0 days 10:15:17,Rose Hwang,4320
19030401,1995-07-02,0,Education,1,0,0 days 16:21:10,Sam Park,5600
19070101,1990-11-23,3,Education,0,0,0 days 15:00:20,Chris Jang,4500
19070102,1993-02-01,0,Marketing,0,0,0 days 21:19:50,Grace Lee,3150


In [156]:
df3 = df.set_index(['dept', 'pname'], append=True)
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth,english,japanese,chinese,salary,overtime
ID,dept,pname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18030201,Education,James Kim,1990-01-23,1,1,0,3456,0 days 23:10:10
18030202,Marketing,Rose Hwang,1992-10-11,0,2,0,4320,0 days 10:15:17
19030401,Education,Sam Park,1995-07-02,1,0,0,5600,0 days 16:21:10
19070101,Education,Chris Jang,1990-11-23,0,0,3,4500,0 days 15:00:20
19070102,Marketing,Grace Lee,1993-02-01,0,0,0,3150,0 days 21:19:50


In [157]:
df4 = df3.sort_index()
df4.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth,english,japanese,chinese,salary,overtime
ID,dept,pname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18030201,Education,James Kim,1990-01-23,1,1,0,3456,0 days 23:10:10
18030202,Marketing,Rose Hwang,1992-10-11,0,2,0,4320,0 days 10:15:17
19030401,Education,Sam Park,1995-07-02,1,0,0,5600,0 days 16:21:10
19070101,Education,Chris Jang,1990-11-23,0,0,3,4500,0 days 15:00:20
19070102,Marketing,Grace Lee,1993-02-01,0,0,0,3150,0 days 21:19:50


In [158]:
df5 = df3.sort_index(level=[1,2])
df5.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth,english,japanese,chinese,salary,overtime
ID,dept,pname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19080101,Accounting,Chirle Song,1993-04-11,2,0,0,4800,0 days 09:50:30
19090202,Education,Anne Lee,1993-05-05,0,3,1,4750,0 days 19:50:20
19070101,Education,Chris Jang,1990-11-23,0,0,3,4500,0 days 15:00:20
18030201,Education,James Kim,1990-01-23,1,1,0,3456,0 days 23:10:10
19070103,Education,Juile Yoon,1992-07-16,0,0,1,4200,0 days 14:10:40


In [159]:
df5 = df3.sort_index(level=[1,2], ascending=[True, False])
df5.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth,english,japanese,chinese,salary,overtime
ID,dept,pname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19080101,Accounting,Chirle Song,1993-04-11,2,0,0,4800,0 days 09:50:30
19030401,Education,Sam Park,1995-07-02,1,0,0,5600,0 days 16:21:10
19070103,Education,Juile Yoon,1992-07-16,0,0,1,4200,0 days 14:10:40
18030201,Education,James Kim,1990-01-23,1,1,0,3456,0 days 23:10:10
19070101,Education,Chris Jang,1990-11-23,0,0,3,4500,0 days 15:00:20


## sort_values

- `df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')`
- 축에 따른 by 목록의 values를 기준으로 대상을 정렬하여 반환한다
- by : str 또는 list of str을 사용하여 정렬 기준이 되는 이름 또는 이름 목록 지정
- axis = 0: column labels, axis = 1 : index labels 를 사용하여 이름 목록 작성
- axix = 1 사용을 위해서는 모든 columns 의 dtype이 동일해야 함 (거의 사용 안 함)

In [165]:
dfna = pd.read_csv("./data/easySample2.csv", index_col="ID")
dfna = dfna[['pname', 'english', 'chinese']].head()
dfna

Unnamed: 0_level_0,pname,english,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18030201,James Kim,1.0,
18030202,Rose Hwang,,
19030401,Sam Park,1.0,
19070101,Chris Jang,,3.0
19070102,Grace Lee,,


In [166]:
df1 = dfna.sort_values('pname')
df1

Unnamed: 0_level_0,pname,english,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19070101,Chris Jang,,3.0
19070102,Grace Lee,,
18030201,James Kim,1.0,
18030202,Rose Hwang,,
19030401,Sam Park,1.0,


In [167]:
df2 = dfna.sort_values(['english', 'pname'], ascending=[True, False])
df2

Unnamed: 0_level_0,pname,english,chinese
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18030202,Rose Hwang,,
19030401,Sam Park,1.0,
18030201,James Kim,1.0,
19070102,Grace Lee,,
19070101,Chris Jang,,3.0


In [168]:
df = pd.DataFrame({'A': [60,50,40,30],
                   'C': [np.nan,30,10,20],
                   'B': ['Kim','Park','Choi','Lee'],
                   'D': [10,20,10,20]})
df

Unnamed: 0,A,C,B,D
0,60,,Kim,10
1,50,30.0,Park,20
2,40,10.0,Choi,10
3,30,20.0,Lee,20


In [170]:
df1 = df.sort_values(by=['D', 'A'])
df1

Unnamed: 0,A,C,B,D
2,40,10.0,Choi,10
0,60,,Kim,10
3,30,20.0,Lee,20
1,50,30.0,Park,20


In [171]:
df2 = df[['A', 'C', 'D']].sort_values(1, axis=1, ascending=True)
df2

Unnamed: 0,D,C,A
0,10,,60
1,20,30.0,50
2,10,10.0,40
3,20,20.0,30


In [173]:
df3 = df.sort_values(by='C', na_position='first')
df3

Unnamed: 0,A,C,B,D
0,60,,Kim,10
2,40,10.0,Choi,10
3,30,20.0,Lee,20
1,50,30.0,Park,20


# Indexing

- index를 사용하여 원하는 내용(values)에 접근하는 방법
- 접근하여 읽기, 쓰기 등의 작업을 함
- https://pandas.pydata.org/docs/user_guide/indexing.html
- indexing 의 종류
    + Basic indexing : 1개의 indexer 사용
    + Multi-axis indexing : axis 별 indexer 사용
        - Selection by Label : label 을 사용함
        - Selection by Position : integer 를 사용함
- indexer 의 종류
    + a single label/integer : 1개의 label 또는 integer
    + a list or array of labels/integers : label 또는 integer의 리스트 또는 배열
    + a slice object with labels/integers : label 또는 integer의 slice (start:stop:step)
    + a boolean array : True/False로 이루어진 boolean type의 배열 (배열의 요소수는 index의 요소수와 동일해야 함)
    + a collable function with one argument (다루지 않음)

## Basic Indexing

- indexer를 1개만 사용하는 indexing
- Series 의 경우 labels/integers 를 모두 사용할 수 있음
- DataFrame 의 경우 종류에 따라 기준이 되는 axis와 labels/integers의 사용이 다름

In [1]:
import pandas as pd
import numpy as np
import shelve

df = shelve.open('./data/mysample')['sample3']
df = df[['pname', 'dept', 'birth', 'overtime']]
df.index = pd.Index(list("ABCDEFGHIJ"))
df

Unnamed: 0,pname,dept,birth,overtime
A,James Kim,Education,1990-01-23,0 days 23:10:10
B,Rose Hwang,Marketing,1992-10-11,0 days 10:15:17
C,Sam Park,Education,1995-07-02,0 days 16:21:10
D,Chris Jang,Education,1990-11-23,0 days 15:00:20
E,Grace Lee,Marketing,1993-02-01,0 days 21:19:50
F,Juile Yoon,Education,1992-07-16,0 days 14:10:40
G,Chirle Song,Accounting,1993-04-11,0 days 09:50:30
H,Bob Kim,Sales,1991-12-07,0 days 08:40:40
I,John Park,Sales,1992-06-16,0 days 17:30:20
J,Anne Lee,Education,1993-05-05,0 days 19:50:20


### Basic Indexing of Series

In [178]:
s = df['pname']
s.head(7)

A      James Kim
B     Rose Hwang
C       Sam Park
D     Chris Jang
E      Grace Lee
F     Juile Yoon
G    Chirle Song
Name: pname, dtype: object

In [180]:
print(s[0], s['A'])
print(s[-1], s['A'])

James Kim James Kim
Anne Lee James Kim


In [183]:
print(s[[0,2,5]], s[['A','C','F']], sep='\n\n')

A     James Kim
C      Sam Park
F    Juile Yoon
Name: pname, dtype: object

A     James Kim
C      Sam Park
F    Juile Yoon
Name: pname, dtype: object


In [186]:
print(s[:3], s['A':'C'], sep='\n\n')

A     James Kim
B    Rose Hwang
C      Sam Park
Name: pname, dtype: object

A     James Kim
B    Rose Hwang
C      Sam Park
Name: pname, dtype: object


In [198]:
print(s[[True,False,True,False,True,False,True,False,True,False]])

A      James Kim
C       Sam Park
E      Grace Lee
G    Chirle Song
I      John Park
Name: pname, dtype: object


In [199]:
print(s[s.str.startswith('J')])

A     James Kim
F    Juile Yoon
I     John Park
Name: pname, dtype: object


### Basic Indexing of DataFrame

In [201]:
df.head()

Unnamed: 0,pname,dept,birth,overtime
A,James Kim,Education,1990-01-23,0 days 23:10:10
B,Rose Hwang,Marketing,1992-10-11,0 days 10:15:17
C,Sam Park,Education,1995-07-02,0 days 16:21:10
D,Chris Jang,Education,1990-11-23,0 days 15:00:20
E,Grace Lee,Marketing,1993-02-01,0 days 21:19:50


In [213]:
s1= df['pname']
printObj(s1.head())

<class 'pandas.core.series.Series'>
A     James Kim
B    Rose Hwang
C      Sam Park
D    Chris Jang
E     Grace Lee
Name: pname, dtype: object
-------------------------


In [214]:
df1 = df[['pname']]
printObj(df1.head())

<class 'pandas.core.frame.DataFrame'>
        pname
A   James Kim
B  Rose Hwang
C    Sam Park
D  Chris Jang
E   Grace Lee
-------------------------


In [215]:
# column name 의 array 의 경우 axis=1
df2 = df[['pname', 'dept', 'birth']]
df2.head()

Unnamed: 0,pname,dept,birth
A,James Kim,Education,1990-01-23
B,Rose Hwang,Marketing,1992-10-11
C,Sam Park,Education,1995-07-02
D,Chris Jang,Education,1990-11-23
E,Grace Lee,Marketing,1993-02-01


In [220]:
# slicing 의 경우 axis=0
df3 = df[3:6]
display(df3)

df3 = df['D':'F']
display(df3)

Unnamed: 0,pname,dept,birth,overtime
D,Chris Jang,Education,1990-11-23,0 days 15:00:20
E,Grace Lee,Marketing,1993-02-01,0 days 21:19:50
F,Juile Yoon,Education,1992-07-16,0 days 14:10:40


Unnamed: 0,pname,dept,birth,overtime
D,Chris Jang,Education,1990-11-23,0 days 15:00:20
E,Grace Lee,Marketing,1993-02-01,0 days 21:19:50
F,Juile Yoon,Education,1992-07-16,0 days 14:10:40


In [224]:
# boolean array 의 경우 axis=0
display(df.dept == 'Marketing')

df4 = df[df.dept == 'Marketing']
display(df4)

df4 = df[df['dept'] == 'Marketing']
display(df4)

A    False
B     True
C    False
D    False
E     True
F    False
G    False
H    False
I    False
J    False
Name: dept, dtype: bool

Unnamed: 0,pname,dept,birth,overtime
B,Rose Hwang,Marketing,1992-10-11,0 days 10:15:17
E,Grace Lee,Marketing,1993-02-01,0 days 21:19:50


Unnamed: 0,pname,dept,birth,overtime
B,Rose Hwang,Marketing,1992-10-11,0 days 10:15:17
E,Grace Lee,Marketing,1993-02-01,0 days 21:19:50


### isin()을 이용한 Boolean Indexing

- 목록에 포함된 내용이 있을 때 True, 아니면 False
    + `df.isin(list/dict)`
    + `s.isin(list)`

In [226]:
import pandas as pd
import numpy as np

name = ['Kim','Yoon','Choi','Park']
data = {'korean': ['A', 'A', 'A', 'B'], 
        'english': ['B', 'A', 'C', 'B']}

In [228]:
df = pd.DataFrame(data, index=name)
df.index.name = 'name'
df

Unnamed: 0_level_0,korean,english
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kim,A,B
Yoon,A,A
Choi,A,C
Park,B,B


In [230]:
df1 = df.isin(['B', 'C'])
df1

Unnamed: 0_level_0,korean,english
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kim,False,True
Yoon,False,False
Choi,False,True
Park,True,True


In [231]:
w = {'korean': ['A'], 
     'english': ['A', 'B']}
df2 = df.isin(w)
df2

Unnamed: 0_level_0,korean,english
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kim,True,True
Yoon,True,True
Choi,True,False
Park,False,True


In [233]:
df3 = df[df2]
df3

Unnamed: 0_level_0,korean,english
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kim,A,B
Yoon,A,A
Choi,A,
Park,,B


### Accessor 를 사용

- https://pandas.pydata.org/docs/reference/series.html#accessors
- datetime, object(str) dtype의 Series에 있는 Accessor
    + datatime, timedelta : dt 
        - https://pandas.pydata.org/docs/reference/series.html#accessors
    + object(str) : str 
        - https://pandas.pydata.org/docs/reference/series.html#string-handling

In [18]:
import pandas as pd
import numpy as np
import shelve

with shelve.open('./data/mysample') as myshelf:
    df = myshelf['sample3']

df.index = pd.RangeIndex(len(df))
df.head()

Unnamed: 0,pname,birth,dept,english,japanese,chinese,salary,overtime
0,James Kim,1990-01-23,Education,1,1,0,3456,0 days 23:10:10
1,Rose Hwang,1992-10-11,Marketing,0,2,0,4320,0 days 10:15:17
2,Sam Park,1995-07-02,Education,1,0,0,5600,0 days 16:21:10
3,Chris Jang,1990-11-23,Education,0,0,3,4500,0 days 15:00:20
4,Grace Lee,1993-02-01,Marketing,0,0,0,3150,0 days 21:19:50


In [19]:
df = df[['pname', 'birth']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   pname   10 non-null     object        
 1   birth   10 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 288.0+ bytes


In [23]:
sb = df['birth']
df['year'] = sb.dt.year 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   pname   10 non-null     object        
 1   birth   10 non-null     datetime64[ns]
 2   year    10 non-null     int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 368.0+ bytes


In [28]:
df2 = df[df.year == 1992]
df2

Unnamed: 0,pname,birth,year
1,Rose Hwang,1992-10-11,1992
5,Juile Yoon,1992-07-16,1992
8,John Park,1992-06-16,1992


In [34]:
a = df.birth.dt
mydata = { "year": a.year, 
           "month": a.month, 
           "day": a.day,
           "weekday": a.weekday, 
           "dayofyear": a.dayofyear }

# display(df.birth.dt.year)
# df2 = pd.DataFrame(mydata, index=df['pname'])
df2 = pd.DataFrame(mydata)
display(df2)

df2.index = df['pname']
display(df2)

Unnamed: 0,year,month,day,weekday,dayofyear
0,1990,1,23,1,23
1,1992,10,11,6,285
2,1995,7,2,6,183
3,1990,11,23,4,327
4,1993,2,1,0,32
5,1992,7,16,3,198
6,1993,4,11,6,101
7,1991,12,7,5,341
8,1992,6,16,1,168
9,1993,5,5,2,125


Unnamed: 0_level_0,year,month,day,weekday,dayofyear
pname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
James Kim,1990,1,23,1,23
Rose Hwang,1992,10,11,6,285
Sam Park,1995,7,2,6,183
Chris Jang,1990,11,23,4,327
Grace Lee,1993,2,1,0,32
Juile Yoon,1992,7,16,3,198
Chirle Song,1993,4,11,6,101
Bob Kim,1991,12,7,5,341
John Park,1992,6,16,1,168
Anne Lee,1993,5,5,2,125


In [43]:
wday = dict(zip(range(7), '월 화 수 목 금 토 일'.split()))

df2['weekday'] = df2['weekday'].replace(wday)
df2

Unnamed: 0_level_0,year,month,day,weekday,dayofyear
pname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
James Kim,1990,1,23,화,23
Rose Hwang,1992,10,11,일,285
Sam Park,1995,7,2,일,183
Chris Jang,1990,11,23,금,327
Grace Lee,1993,2,1,월,32
Juile Yoon,1992,7,16,목,198
Chirle Song,1993,4,11,일,101
Bob Kim,1991,12,7,토,341
John Park,1992,6,16,화,168
Anne Lee,1993,5,5,수,125


In [49]:
import pandas as pd
import numpy as np
import shelve

with shelve.open('./data/mysample') as myshelf:
    df = myshelf['sample3']

df.index = pd.RangeIndex(len(df))
df = df[['pname', 'overtime']]
df.head()

Unnamed: 0,pname,overtime
0,James Kim,0 days 23:10:10
1,Rose Hwang,0 days 10:15:17
2,Sam Park,0 days 16:21:10
3,Chris Jang,0 days 15:00:20
4,Grace Lee,0 days 21:19:50


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype          
---  ------    --------------  -----          
 0   pname     10 non-null     object         
 1   overtime  10 non-null     timedelta64[ns]
dtypes: object(1), timedelta64[ns](1)
memory usage: 288.0+ bytes


In [54]:
seconds = df.overtime.dt.seconds

df['minute'], df['second'] = divmod(seconds, 60)
df['hour'], df['minute'] = divmod(df['minute'], 60)

df.head()

Unnamed: 0,pname,overtime,minute,second,hour
0,James Kim,0 days 23:10:10,10,10,23
1,Rose Hwang,0 days 10:15:17,15,17,10
2,Sam Park,0 days 16:21:10,21,10,16
3,Chris Jang,0 days 15:00:20,0,20,15
4,Grace Lee,0 days 21:19:50,19,50,21


In [56]:
df[df['hour'] > 18]

Unnamed: 0,pname,overtime,minute,second,hour
0,James Kim,0 days 23:10:10,10,10,23
4,Grace Lee,0 days 21:19:50,19,50,21
9,Anne Lee,0 days 19:50:20,50,20,19


## Multi-Axis Indexing

- 축(axis)별로 indexer 를 사용하는 indexing qkdqjq
- Multi-Axis Indexing 의 종류
    + Selection by Label : `df.loc[label_indexer, label_indexer]`
    + Selection by Position : `df.iloc[integer_indexer, integer_indexer]`
- indexer
    + single label/integer
    + list/array of label/integer
    + slice object of lable/integer
    + boolean array

### Selection by Label

In [57]:
import pandas as pd
import numpy as np

In [59]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                  index=list('abc'),
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [60]:
df1 = df.loc['a']
df2 = df.loc[:, 'A']
df3 = df.loc['a', 'C']

display(df1, df2, df3)

A    0
B    1
C    2
D    3
Name: a, dtype: int64

a    0
b    4
c    8
Name: A, dtype: int64

2

In [63]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=list('aba'),
                 columns=list('CBCD'))
df

Unnamed: 0,C,B,C.1,D
a,0,1,2,3
b,4,5,6,7
a,8,9,10,11


In [64]:
df1 = df.loc['a']
df2 = df.loc['a', 'C']

display(df, df1, df2)

Unnamed: 0,C,B,C.1,D
a,0,1,2,3
b,4,5,6,7
a,8,9,10,11


Unnamed: 0,C,B,C.1,D
a,0,1,2,3
a,8,9,10,11


Unnamed: 0,C,C.1
a,0,2
a,8,10


In [65]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=list('abc'),
                 columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [66]:
df1 = df.loc[['a', 'b']]
df2 = df.loc[:, ['A', 'B']]
df3 = df.loc[['a', 'b'],['A','B','D']]

display(df, df1, df2, df3)

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7


Unnamed: 0,A,B
a,0,1
b,4,5
c,8,9


Unnamed: 0,A,B,D
a,0,1,3
b,4,5,7


In [67]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=list('bab'),
                 columns=list('ABCD'))
df.sort_index(axis='index', inplace=True)
df

Unnamed: 0,A,B,C,D
a,4,5,6,7
b,0,1,2,3
b,8,9,10,11


In [71]:
df1 = df.loc[:'a']
df2 = df.loc[:'b', 'A':'B']
df3 = df.loc[:, 'B':'E']

display(df, df1, df2, df3)

Unnamed: 0,A,B,C,D
a,4,5,6,7
b,0,1,2,3
b,8,9,10,11


Unnamed: 0,A,B,C,D
a,4,5,6,7


Unnamed: 0,A,B
a,4,5
b,0,1
b,8,9


Unnamed: 0,B,C,D
a,5,6,7
b,1,2,3
b,9,10,11


### category, datetime, timedelta 타입에 대한 indexing

- categorical 인 경우 범위 내의 값만 slicing 시 지정하여야 함 -> key error 발생
    + 또한 정렬 처리할 것을 권장
- datetime : 구분 기호 없이 문자열로 취급
- timedelta : ':'로 구분하여 문자열로 표기

In [72]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=pd.Categorical(list('abc')),
                 columns=list('CBCD'))
df

Unnamed: 0,C,B,C.1,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 3 entries, a to c
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   C       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
 3   D       3 non-null      int64
dtypes: int64(4)
memory usage: 231.0 bytes


In [77]:
df1 = df.loc[['b'], ['C', 'D']]
df1

Unnamed: 0,C,C.1,D
b,4,6,7


In [80]:
# df2 = df.loc['b':'d']
# df.loc[:, 'C':'F']

In [82]:
df = pd.DataFrame(np.arange(15).reshape(5,3), 
                 index=pd.to_datetime([4,3,1,2,5], unit='D', origin=pd.Timestamp('2023-01-03')),
                 columns=list('ABC'))
df = df.sort_index(axis=0)
df

Unnamed: 0,A,B,C
2023-01-04,6,7,8
2023-01-05,9,10,11
2023-01-06,3,4,5
2023-01-07,0,1,2
2023-01-08,12,13,14


In [84]:
df1 = df.loc['20230104':'20230106', 'B':'C']
df2 = df.loc['20230105':'20230110']

display(df1, df2)

Unnamed: 0,B,C
2023-01-04,7,8
2023-01-05,10,11
2023-01-06,4,5


Unnamed: 0,A,B,C
2023-01-05,9,10,11
2023-01-06,3,4,5
2023-01-07,0,1,2
2023-01-08,12,13,14


In [86]:
np.random.seed(100)
time = np.arange(1, 7) * 1000
df = pd.DataFrame(np.arange(18).reshape(6,3),
                 index=pd.to_timedelta(time, unit='s'),
                 columns=list('ABC'))
df

Unnamed: 0,A,B,C
0 days 00:16:40,0,1,2
0 days 00:33:20,3,4,5
0 days 00:50:00,6,7,8
0 days 01:06:40,9,10,11
0 days 01:23:20,12,13,14
0 days 01:40:00,15,16,17


In [88]:
df1 = df.loc['01:00:00':'02:00:00']
df2 = df.loc['00:30:00':'01:23:20']
display(df, df1, df2)

Unnamed: 0,A,B,C
0 days 00:16:40,0,1,2
0 days 00:33:20,3,4,5
0 days 00:50:00,6,7,8
0 days 01:06:40,9,10,11
0 days 01:23:20,12,13,14
0 days 01:40:00,15,16,17


Unnamed: 0,A,B,C
0 days 01:06:40,9,10,11
0 days 01:23:20,12,13,14
0 days 01:40:00,15,16,17


Unnamed: 0,A,B,C
0 days 00:33:20,3,4,5
0 days 00:50:00,6,7,8
0 days 01:06:40,9,10,11
0 days 01:23:20,12,13,14


### Selection by Position

In [91]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=list('abc'),
                 columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [98]:
s1 = df.iloc[2]; s1

A     8
B     9
C    10
D    11
Name: c, dtype: int64

In [99]:
s2 = df.iloc[:, 1]; s2

a    1
b    5
c    9
Name: B, dtype: int64

In [100]:
v = df.iloc[1, 2]; v

6

In [101]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=list('abc'),
                 columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [102]:
df.iloc[[1,2,2], :]

Unnamed: 0,A,B,C,D
b,4,5,6,7
c,8,9,10,11
c,8,9,10,11


In [103]:
df.iloc[[1,2], [0,2,3]]

Unnamed: 0,A,C,D
b,4,6,7
c,8,10,11


In [104]:
df = pd.DataFrame(np.arange(12).reshape(3,4),
                 index=list('abc'),
                 columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [105]:
df.iloc[:2]

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7


In [106]:
df.iloc[::-1]

Unnamed: 0,A,B,C,D
c,8,9,10,11
b,4,5,6,7
a,0,1,2,3


In [108]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
a,1,2
b,5,6
c,9,10


In [109]:
df.iloc[:5, 2:]

Unnamed: 0,C,D
a,2,3
b,6,7
c,10,11


In [110]:
df.iloc[:, :-1]

Unnamed: 0,A,B,C
a,0,1,2
b,4,5,6
c,8,9,10


### Boolean Indexing

In [111]:
mydata = {'A': list("abcde"), 
          'B': list("cbdfd"), 
          'C':list("zsyjc")}

df = pd.DataFrame(mydata, index=['one','two','three','four','five'])
df

Unnamed: 0,A,B,C
one,a,c,z
two,b,b,s
three,c,d,y
four,d,f,j
five,e,d,c


In [113]:
df[(df.A == 'a') | (df.A == 'b')]

Unnamed: 0,A,B,C
one,a,c,z
two,b,b,s


In [115]:
df.loc[(df.A == 'a') | (df.A == 'b')]

Unnamed: 0,A,B,C
one,a,c,z
two,b,b,s


In [119]:
# iloc 의 경우 Series 를 넣을 수 없음
# df.iloc[(df.A == 'a') | (df.A == 'b')]
df.iloc[((df.A == 'a') | (df.A == 'b')).to_list()]

Unnamed: 0,A,B,C
one,a,c,z
two,b,b,s


In [123]:
df = pd.DataFrame(np.random.randint(1, 100, (5, 3)),
                 index=list('ABCDE'),
                 columns=list('abc'))
df

Unnamed: 0,a,b,c
A,5,92,60
B,68,8,50
C,48,66,62
D,15,56,72
E,81,3,95


In [126]:
df1 = df[df.a > 50]
df2 = df.loc[df.a > 50]
df3 = df.iloc[(df.a > 50).to_list()]

display(df1, df2, df3)

Unnamed: 0,a,b,c
B,68,8,50
E,81,3,95


Unnamed: 0,a,b,c
B,68,8,50
E,81,3,95


Unnamed: 0,a,b,c
B,68,8,50
E,81,3,95


In [139]:
ridx = (df.a > 30) | (df.b > 50)
cidx = df.loc['A'] < df.loc['C']

df3 = df.loc[ridx, cidx]
df4 = df.iloc[ridx.to_list(), cidx.to_list()]

display(df3, df4)

Unnamed: 0,a,c
A,5,60
B,68,50
C,48,62
D,15,72
E,81,95


Unnamed: 0,a,c
A,5,60
B,68,50
C,48,62
D,15,72
E,81,95


## Indexing for Multilevel Index

- 여러 개의 column 또는 row 로 이루어진 index
- 여러 개의 column/row 를 level로 취급함
- `pd.Multiindex.from_product(iterables, names)`
    + iterables : Multiindex 각 column을 항으로 하는 1D array
    + 각 항의 갯수를 곱한 것 만큼의 항이 생성됨
- `pd.Multiindex.from_tuples(tuplies, names)`
    + tuples : Multiindex 각 row 를 항으로 하는 1D array
    + names : index 의 각 column 이름, 1D array

- basic indexing : column 기준

In [140]:
import pandas as pd
import numpy as np
import shelve

np.random.seed(100)

In [145]:
midx_c = pd.MultiIndex.from_product([['A','B'],['a','b']])
midx_r = pd.MultiIndex.from_tuples([['a','x'],['a','y'],['b','x'],['b','y']])

df = pd.DataFrame(np.random.randint(50, 90, (4,4)),
                 index=midx_r,
                 columns=midx_c)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, ('a', 'x') to ('b', 'y')
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   (A, a)  4 non-null      int64
 1   (A, b)  4 non-null      int64
 2   (B, a)  4 non-null      int64
 3   (B, b)  4 non-null      int64
dtypes: int64(4)
memory usage: 416.0+ bytes


In [146]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
a,x,66,59,79,72
a,y,52,77,54,81
b,x,51,63,69,86
b,y,54,77,53,57


In [151]:
s1 = df['A']['a']
s2 = df.loc[:, ('A','a')]

display(s1, s2)

a  x    66
   y    52
b  x    51
   y    54
Name: a, dtype: int64

a  x    66
   y    52
b  x    51
   y    54
Name: (A, a), dtype: int64

In [152]:
df3 = df.copy()

df3[('A','a')] = [100] * 4
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
a,x,100,59,79,72
a,y,100,77,54,81
b,x,100,63,69,86
b,y,100,77,53,57


In [153]:
df3 = df.copy()

df3['A']['a'] = [100] * 4
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['A']['a'] = [100] * 4


Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
a,x,66,59,79,72
a,y,52,77,54,81
b,x,51,63,69,86
b,y,54,77,53,57


In [155]:
df3.loc[('a','y'),('B','b')] = 1234
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,a,b
a,x,66,59,79,72
a,y,52,77,54,1234
b,x,51,63,69,86
b,y,54,77,53,57
