In [1]:
# Load the core data-analysis libraries (NumPy and pandas are third-party packages, not the standard library)
import numpy as np
import pandas as pd
# from pandas import Series, DataFrame

In [2]:
# Series: a one-dimensional labelled array made of an index plus values.
# No index is passed here, so the default integer labels are used.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
obj2['a']

-5

In [7]:
obj2['d'] = 6
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [8]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [9]:
obj2[obj2>0]

d    6
b    7
c    3
dtype: int64

In [10]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [11]:
# in, not in
'b' in obj2

True

In [12]:
'e' in obj2

False

In [13]:
# Build a Series from a Python dict:
# the dictionary keys become the Series index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 50000,
}
obj3 = pd.Series(data=sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah      50000
dtype: int64

In [14]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
# 'California' is not a key of sdata, so its value is NaN; 'Utah' is in sdata
# but not in states, so it is left out of the result.
obj4 = pd.Series(sdata, index = states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [15]:
# Functions that detect missing values: isnull(), notnull()
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [16]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [17]:
# name: both the Series itself and its index can be given a name,
# and both names appear in the displayed output.
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [18]:
# Integer indexes are best avoided (original author's note), so replace the
# existing labels in place by assigning a list of names to .index.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame
- 표 형태의 자료구조
- 여러 개의 컬럼이 존재하고, 각 컬럼은 서로 다른 자료형(숫자, 문자열, 불리언 등)의 값을 담을 수 있다.
- 2차원 배열

In [19]:
# Column-oriented input: every dict key becomes a column and the equal-length
# lists supply that column's values.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.7],
}
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.7


In [20]:
# head() / tail(): show the first / last rows of a DataFrame (5 rows by default)
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [21]:
frame.head(3)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [23]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.7


In [25]:
# 'debt' is requested as a column but is not a key of data, so it is created
# with missing values; index= supplies the row labels.
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                     index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.7,


In [27]:
frame2.columns
# dtype 'object' is the generic dtype that can hold any Python value
# (int, float, str, ...); here the column labels are strings.

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [28]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [29]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [31]:
# Use .loc to select a row by its index label
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [32]:
# Assign to a whole column at once
frame2['debt'] = 16.5  # a scalar is broadcast to every row
print(frame2)
frame2['debt'] = np.arange(6.)  # an array supplies one value per row
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.7  16.5
       year   state  pop  debt
one    2000    Ohio  1.5   0.0
two    2001    Ohio  1.7   1.0
three  2002    Ohio  3.6   2.0
four   2001  Nevada  2.4   3.0
five   2002  Nevada  2.9   4.0
six    2003  Nevada  3.7   5.0


In [33]:
# Assigning a Series aligns on index labels: rows 'two', 'four' and 'five'
# receive values, every other row becomes NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.7,


In [34]:
# Add a boolean column that is True where state is 'Ohio'; it is removed
# again with the del keyword in a later cell.
# NOTE(review): 'estern' looks like a typo for 'eastern'; it is left as is
# because the later `del frame2['estern']` uses the same spelling.
frame2['estern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,estern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.7,,False


In [36]:
# The del keyword removes the column from frame2 in place
del frame2['estern']

In [37]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.7,


In [38]:
# Build a DataFrame from a nested dict: outer keys become columns,
# inner keys become the row index.
# NOTE(review): 'Navada' is presumably a misspelling of 'Nevada'; later cells
# index frame3['Navada'], so any rename must be applied there as well.
pop = {'Navada':{2001:2.4, 2002:2.9},
      'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Navada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [39]:
frame3.T

Unnamed: 0,2000,2001,2002
Navada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [40]:
pd.DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Navada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [42]:
# A dict of Series also works as DataFrame input; the pieces are aligned on
# their index labels. Both slices below are positional.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],     # every row except the last
    'Navada': frame3['Navada'].iloc[:2],  # the first two rows
}
pd.DataFrame(data=pdata)

Unnamed: 0,Ohio,Navada
2000,1.5,
2001,1.7,2.4


In [43]:
# name attributes: label the row index and the column index
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Navada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [44]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [45]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.7, nan]], dtype=object)

### DataFrame의 인덱스

In [46]:
obj = pd.Series(range(3), index = ['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [47]:
obj.index

Index(['a', 'b', 'c'], dtype='object')

In [48]:
index = obj.index
index # Index objects are immutable, which keeps the labels safe to share

Index(['a', 'b', 'c'], dtype='object')

In [49]:
index[1:]

Index(['b', 'c'], dtype='object')

In [50]:
# reindex: start from a Series whose labels are in the order d, b, a, c
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [52]:
# reindex returns a new Series arranged according to the given labels
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2  # 'e' has no value in obj, so it is filled with NaN

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [53]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [54]:
# method='ffill' fills each newly introduced label with the previous value
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [58]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), 
                     index = ['a', 'c', 'd'],
                    columns = ['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [59]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [60]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [61]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [62]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [63]:
obj.drop(['d', 'c'])  # inplace=False (the default): a new Series is returned, obj is unchanged

a    0.0
b    1.0
e    4.0
dtype: float64

In [64]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [65]:
# NOTE: not idempotent - running this cell a second time raises KeyError
# because 'd' and 'c' have already been removed from obj.
obj.drop(['d', 'c'], inplace=True)  # inplace=True: obj itself is modified

In [66]:
obj

a    0.0
b    1.0
e    4.0
dtype: float64

In [67]:
# 4x4 frame holding the integers 0-15, with state names as row labels and
# number words as column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)

In [68]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
NewYork,12,13,14,15


In [69]:
# drop() removes row labels by default, so pass axis to drop a column
# (drop(columns='two') is an equivalent spelling).
data.drop('two', axis=1)  # same as axis='columns'

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
NewYork,12,14,15


In [70]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
NewYork,12,14


In [71]:
obj = pd.Series(np.arange(4.), index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [72]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [73]:
# Select by position explicitly with .iloc. Plain obj[[1, 3]] relies on
# integer keys falling back to positions when the index holds string labels,
# a fallback that pandas has deprecated for Series.__getitem__.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [74]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [75]:
obj['b':'c']  # label slices include the end label (integer position slices exclude the end)

b    1.0
c    2.0
dtype: float64

In [76]:
obj['b':'c'] = 5

In [77]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [78]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [79]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
NewYork,14,12


In [80]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [81]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
NewYork,False,False,False,False


In [82]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


##### loc, iloc
loc : 축 라벨(이름)로 선택할 때

iloc : 정수 위치(0부터 시작하는 순번)로 선택할 때

In [83]:
# Indexers take [rows, columns]
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [84]:
data.iloc[1, [1, 2]]

two      5
three    6
Name: Colorado, dtype: int32

In [85]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [86]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [87]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [88]:
# Rows where 'three' exceeds 5, restricted to the first three columns,
# selected in a single .loc call.
data.loc[data.three > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
NewYork,12,13,14


In [89]:
ser = pd.Series(np.arange(3.))
ser
# ser[-1] raises an error (original author's note) - presumably because the
# index holds integers, so -1 is looked up as a label rather than a position.

0    0.0
1    1.0
2    2.0
dtype: float64

In [90]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# Take the last element by position with .iloc. Plain ser2[-1] only works
# through the deprecated fallback from integer keys to positions on a
# non-integer index.
ser2.iloc[-1]

2.0

In [91]:
ser[:1]

0    0.0
dtype: float64

In [92]:
# .loc slices by label, so the end label 1 is included
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [93]:
# .iloc slices by integer position, so the end position 1 is excluded
ser.iloc[:1]

0    0.0
dtype: float64

### 산술연산

In [95]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a','c','d','e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a','c','e','f','g'])

In [96]:
print(s1)
print(s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


In [97]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [98]:
# Two frames whose row and column labels only partly overlap, used to show
# how arithmetic aligns on labels.
df1 = pd.DataFrame(
    data=np.arange(9, dtype=float).reshape(3, 3),
    index=['Colorado', 'Ohio', 'Texas'],
    columns=list('bcd'),
)

df2 = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [99]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,


In [100]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

In [101]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


In [102]:
df1.add(df2, fill_value = 0)

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,4.0


In [103]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [104]:
arr[0]

array([0., 1., 2., 3.])

In [105]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [107]:
# A 4x3 frame of the floats 0.0-11.0 plus its first row as a Series, used to
# show arithmetic between a DataFrame and a Series.
frame = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
series = frame.iloc[0]

In [108]:
print(frame)
print(series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


In [109]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [110]:
series2 = pd.Series(range(3), index=['b','e','f'])
series2

b    0
e    1
f    2
dtype: int64

In [111]:
frame+series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [113]:
# Seed NumPy's global generator first so the random frame, and every result
# derived from it in the following cells, is identical on each
# Restart & Run All. (Outputs saved from an earlier unseeded run will differ.)
np.random.seed(42)
frame = pd.DataFrame(np.random.randn(4, 3),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [114]:
frame

Unnamed: 0,b,d,e
Utah,0.387794,-0.527699,1.254075
Ohio,0.774983,1.660625,-1.892933
Texas,-0.359979,-0.329304,-0.600215
Oregon,0.264105,0.076816,-1.582772


In [115]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.387794,0.527699,1.254075
Ohio,0.774983,1.660625,1.892933
Texas,0.359979,0.329304,0.600215
Oregon,0.264105,0.076816,1.582772


In [116]:
# Goal: the spread (max minus min) along one axis of frame.
# apply() calls the given function once per column by default (one result
# for each column label); pass axis='columns' to get one result per row.
# lambda <input>: <output>
f = lambda x : x.max() - x.min()
frame.apply(f)

b    1.134962
d    2.188324
e    3.147008
dtype: float64

In [117]:
frame.apply(f, axis='columns')

Utah      1.781774
Ohio      3.553558
Texas     0.270910
Oregon    1.846877
dtype: float64

In [118]:
def f(x):
    """Return the smallest and largest value of ``x`` as a two-item Series.

    The result is labelled 'min' and 'max'.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-0.359979,-0.527699,-1.892933
max,0.774983,1.660625,1.254075


In [120]:
# Format every element as a string with two decimal places. fmt works on a
# single scalar, so it has to be applied element by element.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated, so go through apply + Series.map instead, which gives
# the same element-wise result on every pandas version.
fmt = lambda x : '%.2f'%x
frame.apply(lambda col: col.map(fmt))

Unnamed: 0,b,d,e
Utah,0.39,-0.53,1.25
Ohio,0.77,1.66,-1.89
Texas,-0.36,-0.33,-0.6
Oregon,0.26,0.08,-1.58


In [None]:
# Series.map applies the fmt formatter element-wise to a single column.
frame['e'].map(fmt)

### GroupBy 객체

In [2]:
# Twelve rows: the keys a, b, c repeated four times next to the values 0.0-11.0.
df = pd.DataFrame(
    data={
        'key': list('abc') * 4,
        'value': np.arange(12, dtype=float),
    }
)
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [4]:
# Compute the mean of 'value' for each 'key'.
tmp = df.groupby('key').value  # the grouped 'value' column, reused by the cells below
tmp.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [5]:
# lambda <input>: <output>
# transform() returns a result aligned with the original rows: each row
# receives the mean of its own group.
tmp.transform(lambda x : x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [6]:
tmp.transform('mean')  # built-in group functions can be passed by name as a string

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [9]:
# Normalization
def normalize(x):
    """Center ``x`` on its mean and scale it by its standard deviation.

    Only requires that ``x`` provides ``mean()`` and ``std()`` and supports
    arithmetic with their results (for example a pandas Series).
    """
    centered = x - x.mean()
    return centered / x.std()

In [10]:
tmp.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [11]:
tmp.apply(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64