## <strong> 6. Pandas 객체 생성 및 조작 </strong>

```Pandas``` 라이브러리 설치

In [1]:
!pip install pandas



In [1]:
import pandas as pd
import numpy as np

In [3]:
# 헬프 문서 출력
pd?

[1;31mType:[0m        module
[1;31mString form:[0m <module 'pandas' from 'c:\\Anaconda3\\Lib\\site-packages\\pandas\\__init__.py'>
[1;31mFile:[0m        c:\anaconda3\lib\site-packages\pandas\__init__.py
[1;31mDocstring:[0m  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.
  - Size mutability:

In [4]:
# 라이브러리 버전 확인
pd.__version__

'2.2.2'

### Pandas 객체: <strong> Series </strong>

In [5]:
# [+] List 객체에서 Series 객체를 생성
ser = pd.Series([0.25, 0.5, 0.75, 1.0])
ser

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# Pandas 객체 속성: .values
vals = ser.values
vals

array([0.25, 0.5 , 0.75, 1.  ])

In [7]:
# Pandas 객체 속성: .index
ind = ser.index
print(ind)
print(list(ind))

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [10]:
# [+] 레이블 기반 인덱싱
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(ser)
print(ser['a'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25


In [14]:
print(ser.index)
print(ser.values)
list(ser.index)

Index(['a', 'b', 'c', 'd'], dtype='object')
[0.25 0.5  0.75 1.  ]


['a', 'b', 'c', 'd']

### <strong> Dictionary와 Series 객체 </strong>

In [2]:
# Dictionary 객체에서 Series 객체를 생성
population_dict = {
    'California': 38332521,   # 미국 연방주 별 인구
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135
}

population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [7]:
# [+] 레이블 기반 인덱싱: 'California'에서 'New York'까지
print(population[:'New York']) #기존 슬라이싱은 해당 인덱스 전까지 선택 되었지만 레이블 기반 인덱싱은 해당 key까지 슬라이싱 해준다
population[:3]

California    38332521
Texas         26448193
New York      19651127
dtype: int64


California    38332521
Texas         26448193
New York      19651127
dtype: int64

In [None]:
population_dict['Texas':'Illinois'] #딕션어리는 슬라이싱을 지원 안한다.

KeyError: slice('Texas', 'Illinois', None)

### Pandas 객체:<strong> DataFrame </strong>

In [16]:
# 미국 연방주 별 면적
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
    'test' : 0
}

area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
test               0
dtype: int64

In [17]:
# [+] 'population'과 'area' Series 객체를 포함하는 DataFrame 객체 생성
states = pd.DataFrame({'population' : population, 'area': area}) # 합쳐질때 값이 없으면 NAN으로 표시되는거 같음
states

Unnamed: 0,population,area
California,38332521.0,423967
Florida,19552860.0,170312
Illinois,12882135.0,149995
New York,19651127.0,141297
Texas,26448193.0,695662
test,,0


In [14]:
# DataFrame 객체의 인덱스와 컬럼
print(states.index)
print(states.columns)

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas', 'a'], dtype='object')
Index(['population', 'area'], dtype='object')


In [19]:
print(states.columns)
states.index

Index(['population', 'area'], dtype='object')


Index(['California', 'Florida', 'Illinois', 'New York', 'Texas', 'test'], dtype='object')

In [20]:
# [+] DataFrame으로부터 Series 객체 접근하기
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
test               0
Name: area, dtype: int64

#### **NumPy 배열로부터 DataFrame 객체 생성**
+ 예제, 오늘의 운세: 금전운(```money_fortune```), 연애운(```love_fortune```)

In [23]:
# [+] 실수 난수로 이루어진 12X2 크기의 NumPy 배열 생성
arr = np.random.rand(12, 2)
arr

array([[0.17168291, 0.01179336],
       [0.40859942, 0.1626081 ],
       [0.81071344, 0.3838086 ],
       [0.28707893, 0.60984712],
       [0.44069693, 0.72219648],
       [0.25524388, 0.21176472],
       [0.92974107, 0.65812967],
       [0.00575401, 0.32133636],
       [0.71089332, 0.45713984],
       [0.56539061, 0.96207655],
       [0.59061333, 0.5873778 ],
       [0.64403396, 0.88161627]])

In [None]:
# NumPy 배열로부터 DataFrame 객체 생성
df = pd.DataFrame(
    arr, 
    columns=['money_fortune', 'love_fortune'], 
    index=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
) #직접 지정할 수 있음 coulms, index

df

Unnamed: 0,money_fortune,love_fortune
Jan,0.171683,0.011793
Feb,0.408599,0.162608
Mar,0.810713,0.383809
Apr,0.287079,0.609847
May,0.440697,0.722196
Jun,0.255244,0.211765
Jul,0.929741,0.65813
Aug,0.005754,0.321336
Sep,0.710893,0.45714
Oct,0.565391,0.962077


### <strong> Series 객체 조작 </strong>

#### Dictionary 스타일 조작

In [32]:
# Series 객체 생성
ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(ser)    
print("=============================")
print("a" in ser)   # [+] Key + in 키워드
print(0.25 in ser)  # [+] Value + in 키워드
print(ser.index)    # [+] 인덱스
print(ser.keys)   # [+] 키 집합
ser['e'] = 1.25     # [+] 값 추가
ser['a'] = 10.25    # [+] 값 수정
print(ser)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
<bound method Series.keys of a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64>
a    10.25
b     0.50
c     0.75
d     1.00
e     1.25
dtype: float64


#### 배열 스타일 조작

In [33]:
print(ser['a':'c'])   # 슬라이싱
print(ser[(ser > 0.3) & (ser < 0.8)])   # 논리 연산
# 팬시 인덱싱
ind = ['a', 'e'] 
print(ser[ind])

a    10.25
b     0.50
c     0.75
dtype: float64
b    0.50
c    0.75
dtype: float64
a    10.25
e     1.25
dtype: float64


#### Pandas 객체 인덱싱
+ 정수 기반 인덱싱(암묵적, implicit)
+ 레이블 기반 인덱싱(명시적, explicit)

In [34]:
# 슬라이싱을 이용한 'a', 'b', 'c' 선택
print(ser[0:3])   # 정수 기반 인덱싱
print(ser['a':'c'])     # 레이블 기반 인덱싱

a    10.25
b     0.50
c     0.75
dtype: float64
a    10.25
b     0.50
c     0.75
dtype: float64


### <strong> DataFrame 객체 조작 </strong>

In [38]:
# 특정 Series 객체 접근
states['area']
print(states.area)
print("============================")
states.population

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
test               0
Name: area, dtype: int64


California    38332521.0
Florida       19552860.0
Illinois      12882135.0
New York      19651127.0
Texas         26448193.0
test                 NaN
Name: population, dtype: float64

In [45]:
# [+] Series 객체 추가 (밀도 = 인구 / 면적)
states['density'] = states['population']/states['area']
states

Unnamed: 0,population,area,a,density
California,38332521.0,423967,90.413926,90.413926
Florida,19552860.0,170312,114.806121,114.806121
Illinois,12882135.0,149995,85.883763,85.883763
New York,19651127.0,141297,139.076746,139.076746
Texas,26448193.0,695662,38.01874,38.01874
test,,0,,


#### 인덱서: ```loc```, ```iloc```

In [46]:
# Series 객체 생성
ser = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
ser

1    a
3    b
5    c
dtype: object

In [None]:
# loc 인덱서 => 레이블 기반반
print(ser.loc[1])
print(ser.loc[1:3])

a
1    a
3    b
dtype: object


In [None]:
# iloc 인덱서 => index 기반반
print(ser.iloc[1])
print(ser.iloc[1:3])

b
3    b
5    c
dtype: object


In [None]:
# DataFrame 객체 = 2차원 배열
print(states.values, '\n')        # 값들을 얻어오기
print(states.T, '\n')             # 전치행렬 => 행과열울 바꿀때
print(states.iloc[:3, :2], '\n')  # 정수 기반 슬라이싱
print(states.loc[: 'Illinois', : 'population'])  # 레이블 기반 슬라이싱

[[3.83325210e+07 4.23967000e+05 9.04139261e+01 9.04139261e+01]
 [1.95528600e+07 1.70312000e+05 1.14806121e+02 1.14806121e+02]
 [1.28821350e+07 1.49995000e+05 8.58837628e+01 8.58837628e+01]
 [1.96511270e+07 1.41297000e+05 1.39076746e+02 1.39076746e+02]
 [2.64481930e+07 6.95662000e+05 3.80187404e+01 3.80187404e+01]
 [           nan 0.00000000e+00            nan            nan]] 

              California       Florida      Illinois      New York  \
population  3.833252e+07  1.955286e+07  1.288214e+07  1.965113e+07   
area        4.239670e+05  1.703120e+05  1.499950e+05  1.412970e+05   
a           9.041393e+01  1.148061e+02  8.588376e+01  1.390767e+02   
density     9.041393e+01  1.148061e+02  8.588376e+01  1.390767e+02   

                   Texas  test  
population  2.644819e+07   NaN  
area        6.956620e+05   0.0  
a           3.801874e+01   NaN  
density     3.801874e+01   NaN   

            population    area
California  38332521.0  423967
Florida     19552860.0  170312
Illinois

In [50]:
# 마스킹 + 팬시 인덱싱
states.loc[states.density > 100, ['population', 'density']]

Unnamed: 0,population,density
Florida,19552860.0,114.806121
New York,19651127.0,139.076746


In [52]:
# 값 수정
states.iloc[0, 2] = 90
states.iloc[5, 0] = 100
states

Unnamed: 0,population,area,a,density
California,38332521.0,423967,90.0,90.413926
Florida,19552860.0,170312,114.806121,114.806121
Illinois,12882135.0,149995,85.883763,85.883763
New York,19651127.0,141297,139.076746,139.076746
Texas,26448193.0,695662,38.01874,38.01874
test,100.0,0,,
