<a href="https://colab.research.google.com/github/seohyeon1578/TensorFlow/blob/main/pandasEX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas

In [34]:
import pandas as pd
from pandas import Series, DataFrame

## Series 객체
* 일차원 배열 같은 자료구조 객체

In [35]:
# Build a Series from a plain list; pandas supplies a default 0..n-1 integer index.
obj = Series(data=[3, 22, 34, 11])
obj

0     3
1    22
2    34
3    11
dtype: int64

In [36]:
# Show the underlying value array and the index, separated by a divider line.
print(obj.values, '----------------------', obj.index, sep='\n')

[ 3 22 34 11]
----------------------
RangeIndex(start=0, stop=4, step=1)


In [37]:
# Plain Python list printed for reference.
print(list(range(1, 5)))

[1, 2, 3, 4]


* 출력에서 보이듯 index가 있으므로, 원하는 값으로 직접 지정 가능

In [38]:
# Supply explicit labels through `index` instead of the default integer positions.
obj2 = Series(
    data=[4, 5, 6, 2],
    index=['c', 'd', 'e', 'f'],
)
obj2

c    4
d    5
e    6
f    2
dtype: int64

* indexing

In [39]:
# Look up a single value by its label.
obj2.loc['c']

4

* 각 요소를 한꺼번에 지정

In [40]:
# A list of labels selects several entries at once and returns a Series.
obj2.loc[['c', 'd', 'f']]

c    4
d    5
f    2
dtype: int64

* 각 요소별 연산

In [41]:
# Arithmetic is applied to every element; the labels are kept.
obj2.mul(2)

c     8
d    10
e    12
f     4
dtype: int64

* Series는 딕셔너리와 구조가 거의 유사하므로, 딕셔너리로부터 바로 생성 가능

In [42]:
# Label -> value mapping; the keys become the Series index.
data = dict(kim=3400, hong=2000, kang=1000, lee=2400)

obj3 = Series(data)
obj3

kim     3400
hong    2000
kang    1000
lee     2400
dtype: int64

In [43]:
# 'woo' has no entry in `data`, so its value comes out as NaN
# (and the dtype is float64 rather than int64).
name = ['woo', 'hong', 'kang', 'lee']

obj4 = Series(data=data, index=name)
obj4

woo        NaN
hong    2000.0
kang    1000.0
lee     2400.0
dtype: float64

* 누락 데이터 찾는 함수 : isnull, notnull

In [44]:
# Boolean masks: True where a value is missing (isnull) / present (notnull).
print(obj4.isnull())
print(obj4.notnull())

woo      True
hong    False
kang    False
lee     False
dtype: bool
woo     False
hong     True
kang     True
lee      True
dtype: bool


* Series 객체와 Series의 색인(index) 객체 모두 name 속성을 가지고 있음

In [45]:
# Display the current contents of obj3.
obj3

kim     3400
hong    2000
kang    1000
lee     2400
dtype: int64

In [46]:
# Give the Series itself a name; it then appears as "Name: ..." in the repr.
obj3.name = '최고득점'
obj3

kim     3400
hong    2000
kang    1000
lee     2400
Name: 최고득점, dtype: int64

In [47]:
# The index object carries its own name, shown as a header line above the labels.
obj3.index.name = '이름'
obj3

이름
kim     3400
hong    2000
kang    1000
lee     2400
Name: 최고득점, dtype: int64

## DataFrame 자료구조 객체
* 2차원 리스트(배열)와 같은 자료구조 객체

In [48]:
# Nested lists are read row by row; row and column labels default to 0..n-1.
x = DataFrame(data=[
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

x

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


* 딕셔너리로부터 DataFrame 생성 가능

In [49]:
# Column-oriented data: each key is a column name, each list holds that column's values.
data = dict(
    city=['서울', '부산', '광주', '대구'],
    year=[2000, 2001, 2002, 2002],
    pop=[4000, 2000, 1000, 1000],
)

In [50]:
# Each dict key becomes a column; rows get the default integer index.
df = DataFrame(data=data)
df

Unnamed: 0,city,year,pop
0,서울,2000,4000
1,부산,2001,2000
2,광주,2002,1000
3,대구,2002,1000


* 컬럼 순서 변경

In [51]:
# `columns` fixes the column order explicitly.
df = DataFrame(
    data=data,
    columns=['year', 'city', 'pop'],
)
df

Unnamed: 0,year,city,pop
0,2000,서울,4000
1,2001,부산,2000
2,2002,광주,1000
3,2002,대구,1000


* 인덱스 지정

In [52]:
# 'debt' is not a key of `data`, so that column is created with missing values;
# `index` replaces the default integer row labels.
df2 = DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four'],
    columns=['year', 'city', 'pop', 'debt'],
)
df2

Unnamed: 0,year,city,pop,debt
one,2000,서울,4000,
two,2001,부산,2000,
three,2002,광주,1000,
four,2002,대구,1000,


In [53]:
# Indexing: pick a single column, returned as a Series.
df2.loc[:, 'city']

one      서울
two      부산
three    광주
four     대구
Name: city, dtype: object

In [54]:
# Column labels followed by row labels, printed on one line.
print(df2.columns, df2.index, sep=' ')

Index(['year', 'city', 'pop', 'debt'], dtype='object') Index(['one', 'two', 'three', 'four'], dtype='object')


In [55]:
# Extract one row by its index label.
df2.loc['three', :]

year    2002
city      광주
pop     1000
debt     NaN
Name: three, dtype: object

In [56]:
# Display the current state of df2.
df2

Unnamed: 0,year,city,pop,debt
one,2000,서울,4000,
two,2001,부산,2000,
three,2002,광주,1000,
four,2002,대구,1000,


* 값 삽입

In [58]:
# Assigning a scalar fills every row of the column with that value.
df2['debt'] = 1000       # alternative spelling: df2.debt = 1000
df2

Unnamed: 0,year,city,pop,debt
one,2000,서울,4000,1000
two,2001,부산,2000,1000
three,2002,광주,1000,1000
four,2002,대구,1000,1000


* 연속된 값 넣기

In [60]:
import numpy as np

# An array with one element per row is assigned element by element, in row order.
df2['debt'] = np.arange(0, 4)
df2

Unnamed: 0,year,city,pop,debt
one,2000,서울,4000,0
two,2001,부산,2000,1
three,2002,광주,1000,2
four,2002,대구,1000,3


In [62]:
# Insert values from a Series (note: the Series index must match the DataFrame's row labels).
val = Series({'one': 1000, 'two': 2000, 'three': 3000, 'four': 4000})

df2['debt'] = val
df2

Unnamed: 0,year,city,pop,debt
one,2000,서울,4000,1000
two,2001,부산,2000,2000
three,2002,광주,1000,3000
four,2002,대구,1000,4000


In [63]:
# Delete a column (df2 is modified in place).
df2.drop(columns='debt', inplace=True)
df2

Unnamed: 0,year,city,pop
one,2000,서울,4000
two,2001,부산,2000
three,2002,광주,1000
four,2002,대구,1000


In [64]:
# Insert values: store the True/False result of a comparison as a new column.
df2['cap'] = df2['city'].eq('서울')
df2

Unnamed: 0,year,city,pop,cap
one,2000,서울,4000,True
two,2001,부산,2000,False
three,2002,광주,1000,False
four,2002,대구,1000,False


In [67]:
# Dict of dicts: outer keys become columns, inner keys become row labels;
# (row, column) pairs that are absent are filled with NaN.
data2 = dict(
    seoul={2019: 20, 2020: 30},
    busan={2018: 10, 2019: 200, 2020: 300},
)

df3 = DataFrame(data2)
df3

Unnamed: 0,seoul,busan
2019,20.0,200
2020,30.0,300
2018,,10


In [68]:
# Transpose: swap rows and columns.
df3.transpose()

Unnamed: 0,2019,2020,2018
seoul,20.0,30.0,
busan,200.0,300.0,10.0


In [69]:
# Extract only the data, as a 2-D NumPy array without row/column labels.
df3.to_numpy()

array([[ 20., 200.],
       [ 30., 300.],
       [ nan,  10.]])