# Pandas
- R의 data.frame, matrix(2차원 배열)와 유사한 자료구조를 제공하는 라이브러리
- 데이터 분석에서 데이터의 핸들링을 할 때 필수 라이브러리
- 자료형 : Series와 DataFrame
- 표(table) 형태 : 행(관측치, 레코드), 열(속성, 피처)
- 시리즈(Series) = 인덱스(index) + 값(value)
- 1차원 : Series
- 2차원 : DataFrame
- 3차원 : Panel (pandas 0.25에서 제거됨 — 현재는 MultiIndex를 가진 DataFrame 또는 xarray 사용)

## Series
- 1차원 배열과 비슷
- 구성 : index와 value
- 생성 : pd.Series(data, index)

In [1]:
# 모듈로딩
import numpy as np
import pandas as pd


In [2]:
# Build a Series from a plain list; no index is passed here.
price_values = [4000, 3000, 5000, 2000]
price = pd.Series(data=price_values)

In [3]:
# Display the Series: index labels on the left, values on the right.
price

0    4000
1    3000
2    5000
3    2000
dtype: int64

In [4]:
# No index was supplied at construction, so the Series carries a default
# RangeIndex (start=0, stop=4, step=1).
price.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# The data of the Series, without the index, as a NumPy array.
price.values

array([4000, 3000, 5000, 2000])

In [9]:
# Series with explicit string labels as its index instead of the default 0..n-1.
fruit_labels = ['aaa', 'bbb', 'ccc', 'ddd']
fruit = pd.Series(data=[4000, 3000, 5000, 2000], index=fruit_labels)
fruit

aaa    4000
bbb    3000
ccc    5000
ddd    2000
dtype: int64

In [10]:
# Look up a single value by its index label.
fruit['aaa']

4000

In [13]:
# Slicing by label includes the end label: 'ccc' is part of the result.
fruit['aaa':'ccc']

aaa    4000
bbb    3000
ccc    5000
dtype: int64

In [15]:
# Creating a Series from a dict: first build the dict (city name -> integer).
city_dict = dict(Seoul=82, Busan=90, Incheon=84, Daejeon=42)
city_dict

{'Seoul': 82, 'Busan': 90, 'Incheon': 84, 'Daejeon': 42}

In [18]:
# Passing a dict to pd.Series makes the dict keys the index labels
# and the dict values the data.
city = pd.Series(data=city_dict)
city

Seoul      82
Busan      90
Incheon    84
Daejeon    42
dtype: int64

In [19]:
# The index holds the dict keys, in the order they were written in the dict.
city.index

Index(['Seoul', 'Busan', 'Incheon', 'Daejeon'], dtype='object')

In [20]:
# The dict values, as a NumPy array.
city.values

array([82, 90, 84, 42])

In [22]:
# Label lookup on the Series reads like a dict access by key.
city['Incheon']

84

In [23]:
# Series of mixed-sign integers with the default positional index.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [26]:
# Same values, but labelled with letters that are deliberately not in sorted order.
obj2_labels = ['d', 'b', 'a', 'c']
obj2 = pd.Series(data=[4, 7, -5, 3], index=obj2_labels)
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [27]:
# Element-wise comparison: returns a boolean Series with the same index labels.
obj2 > 0

d     True
b     True
a    False
c     True
dtype: bool

In [29]:
# Boolean-mask indexing: keeps only the entries where the condition is True.
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [30]:
# `in` tests membership among the index labels ('c' is a label here, not a value).
'c' in obj2

True

## DataFrame
- 행과 열로 구성
- 행(관측치), 열(속성, 피처)
- pd.DataFrame(data, index, columns)
- 데이터분석에서 가장 기본이 되는 자료구조
- 같은 인덱스를 공유하는 여러 개의 Series(열)로 구성

In [32]:
# Column-oriented dict: each key is a column name, each list holds that column's values.
# NOTE(review): this rebinds `city_dict`, which earlier in the notebook held the
# city -> integer mapping; a distinct name would avoid stale-state confusion.
city_dict = dict(name=['aaa', 'bbb', 'ccc'], age=[13, 24, 19])
city_dict

{'name': ['aaa', 'bbb', 'ccc'], 'age': [13, 24, 19]}

In [35]:
# A dict of equal-length lists becomes a DataFrame: keys -> columns, list items -> rows.
city = pd.DataFrame(data=city_dict)
city

Unnamed: 0,name,age
0,aaa,13
1,bbb,24
2,ccc,19


In [36]:
# Row labels: no index was passed when `city` was built, so this is a default RangeIndex.
city.index

RangeIndex(start=0, stop=3, step=1)

In [38]:
# Column labels: taken from the keys of the dict the frame was built from.
city.columns

Index(['name', 'age'], dtype='object')

In [77]:
# Transpose: the row labels become columns and the column labels become rows.
city.T

Unnamed: 0,0,1,2
name,aaa,bbb,ccc
age,13,24,19


In [50]:
# Build a labelled table for four cities.
# The yearly figures are presumably population counts — TODO confirm the source.
index = ["서울", "부산", "인천", "대구"]

# `columns` fixes the column order of the frame, independent of the key order in `data`.
columns = ["지역", "2015", "2010", "2005", "2000", "2010-2015 증가율"]

data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2431774],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ["수도권", "경상권", "수도권", "경상권"],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141],
}

df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [51]:
# The frame's data as a 2-D NumPy array. The columns mix strings, integers and
# floats, so the resulting array has dtype=object.
df.values

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2431774, 2456016, 2473990, 0.0141]], dtype=object)

In [52]:
# Column labels, in the order given by the `columns` argument at construction.
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [55]:
# Row labels: the city names passed as `index` at construction.
df.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

In [56]:
# Transpose: cities become the columns and the original columns become the rows.
df.T

Unnamed: 0,서울,부산,인천,대구
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


In [58]:
# Add a column: percentage change from 2005 to 2010, rounded to 2 decimals.
# NOTE(review): this value is scaled by 100 (a percentage), whereas the existing
# '2010-2015 증가율' column holds fractions (e.g. 0.0283) — the two rate columns
# use different units.
base_2005 = df['2005']
change_2005_2010 = df['2010'] - base_2005
df['2005-2010 증가율'] = (change_2005_2010 / base_2005 * 100).round(2)
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283,-1.34
부산,경상권,3448737,3393191,3512547,3655437,0.0163,-3.4
인천,수도권,2890451,2632035,2517680,2466338,0.0982,4.54
대구,경상권,2466052,2431774,2456016,2473990,0.0141,-0.99


In [62]:
# Delete a column.
# The deletion was commented out, yet the recorded output of this cell (and of
# every later cell) no longer contains '2010-2015 증가율'. The statement must
# actually run for Restart & Run All to reproduce those outputs.
# drop(..., errors='ignore') keeps the cell safe to re-run; a second
# `del df[...]` would raise KeyError.
df = df.drop(columns=['2010-2015 증가율'], errors='ignore')
df

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


In [64]:
# Column indexing: a single column label returns that column as a Series.
df['지역']

서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

In [66]:
# A list of column labels returns a DataFrame with those columns, in the listed order.
df[['2010','2015']]

Unnamed: 0,2010,2015
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2431774,2466052


In [69]:
# Indexing with a list of labels yields a DataFrame, even when the list has one element.
type(df[['지역']])

pandas.core.frame.DataFrame

In [71]:
# Indexing with a bare column label (no list) yields a Series.
type(df['지역'])

pandas.core.series.Series

In [75]:
# 3x4 frame holding 0..11; neither index nor columns is given, so both get integer labels.
grid = np.arange(12).reshape(3, 4)
df2 = pd.DataFrame(data=grid)
df2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [79]:
# The column labels of df2 are integers, so [[1, 2]] selects the columns
# labelled 1 and 2 (not rows).
df2[[1,2]]

Unnamed: 0,1,2
0,1,2
1,5,6
2,9,10


In [81]:
# Row indexing with plain []: a bare label inside [] is looked up as a column,
# so selecting rows by position or label this way requires a slice ([start:stop]).
# NOTE(review): .loc / .iloc are the explicit row selectors and do not need a slice.
df


Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


In [85]:
# Positional slice: rows from position 0 up to (not including) 1, i.e. the first row only.
df[0:1]

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34


In [87]:
# Positional slice: only the row at position 1 (the stop position 2 is excluded).
df[1:2]

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
부산,경상권,3448737,3393191,3512547,3655437,-3.4


In [89]:
# Positional slice: rows at positions 1 and 2 (the stop position 3 is excluded).
df[1:3]

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54


In [95]:
df["서울":"서울"] # With plain [], a row label must be written as a slice; unlike positional slices, a label slice includes its end label.

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34


In [97]:
# A bare string inside [] is looked up as a column label; the column comes back
# as a Series indexed by the row labels.
df['2015']

서울    9904312
부산    3448737
인천    2890451
대구    2466052
Name: 2015, dtype: int64

In [99]:
# A column is a Series, and a Series can be indexed directly by a single label.
# NOTE(review): for a single cell, df.loc["서울", "2015"] does the same in one step;
# chained [][] indexing should not be used for assignment.
col_2015 = df['2015']
col_2015["서울"]

9904312