In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## Series

In [2]:
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
type(obj.values)

numpy.ndarray

In [6]:
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [7]:
obj2 = Series([4,7,-5,3], index = ['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
obj2[0] # 인덱스가 부여되어도 0 기반 인덱스가 기본적으로 계속 적용된다.

4

In [9]:
obj2[['c','a','d']]

c    3
a   -5
d    4
dtype: int64

In [11]:
# 값이 0보다 큰 항목만 선택 (boolean filter)
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [12]:
obj2 * 2
# [1,2,3] * 2 = [1,2,3,1,2,3,] 이다

d     8
b    14
a   -10
c     6
dtype: int64

In [14]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [15]:
'b' in obj2 # index 에서 찾기

True

In [18]:
4 in obj2.values # values 에서 찾기

True

### dict로 부터 Series

In [19]:
sdata = {'Ohio' : 35000, 'Texas' : 71000, 'Oregon' : 16000, 'Utah' : 5000}

In [20]:
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

### Series -> dict

In [21]:
dict(obj3)

{'Ohio': 35000, 'Oregon': 16000, 'Texas': 71000, 'Utah': 5000}

In [22]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states) # index와 dict의 조합을 통해 비록 없는 값이라도 index를 유지시켜준다
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [23]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
pd.isnull(obj4)
obj4[pd.notnull(obj4)] ## 값이 있는 것들만 추출해 내는 패턴

Ohio      35000
Oregon    16000
Texas     71000
dtype: float64

In [26]:
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [27]:
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

#### 인덱스가 어긋나도 계산됨

In [28]:
obj3 + obj4

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [29]:
obj5 = obj3 + obj4
obj5[obj5.notnull()]

Ohio       70000
Oregon     32000
Texas     142000
dtype: float64

In [30]:
obj4.name = 'population' ## 열의 이름, DataFrame에 포함되는 순간 쓸모가 있어짐
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64

In [33]:
s1 = Series({u'국어' : 90, u'수학' : 95, u'영어' : 85})
s2 = Series({u'영어' : 75, u'수학' : 84, u'과학' : 50})

common_sub = s1.index & s2.index
print(common_sub)
print(s1[common_sub])
print(s2[common_sub])



Index([u'수학', u'영어'], dtype='object')
수학    95
영어    85
dtype: int64
수학    84
영어    75
dtype: int64


In [52]:
s1 = Series({u'국어' : 90, u'수학' : 95, u'영어' : 85})
s2 = Series({u'국어' : 90, u'수학' : 84, u'과학' : 50})

print(s1[s1 == s2])
print(s2[s2 == s1])

Series([], dtype: int64)
Series([], dtype: int64)


In [51]:
s11 = Series({'a':1, 'b':2})
s12 = Series({'a':1, 'b':3})
s21 = Series({'b':2, 'a':1})

s11 == s21

a    True
b    True
dtype: bool

In [53]:
pd.read_csv?

In [58]:
data = {u'state' : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year' : [2000, 2001, 2002, 2001, 2002], 'pop' : [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [62]:
# 열을 특정한 방식으로 구성하고 싶으면
frame2 = DataFrame(data, columns=['year', 'state', 'pop'],index=['하나','둘','셋','넷','다섯'])
frame2

Unnamed: 0,year,state,pop
하나,2000,Ohio,1.5
둘,2001,Ohio,1.7
셋,2002,Ohio,3.6
넷,2001,Nevada,2.4
다섯,2002,Nevada,2.9


In [60]:
# 행 선택
frame2.ix['셋']

year     2002
state    Ohio
pop       3.6
Name: 셋, dtype: object

In [63]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop','debt'],index=['하나','둘','셋','넷','다섯'])
# 브로드캐스팅 전체 column의 값이 입력된다
frame2.debt = 16.5
frame2

Unnamed: 0,year,state,pop,debt
하나,2000,Ohio,1.5,16.5
둘,2001,Ohio,1.7,16.5
셋,2002,Ohio,3.6,16.5
넷,2001,Nevada,2.4,16.5
다섯,2002,Nevada,2.9,16.5


In [64]:
frame2.debt = range(5)
# range(4) 값이 count가 다르면 에러가 난다
frame2

Unnamed: 0,year,state,pop,debt
하나,2000,Ohio,1.5,0
둘,2001,Ohio,1.7,1
셋,2002,Ohio,3.6,2
넷,2001,Nevada,2.4,3
다섯,2002,Nevada,2.9,4


In [65]:
val = Series([-1.2, -1.5, -1.7, -1.8], index=['둘','넷','다섯','여섯'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
하나,2000,Ohio,1.5,
둘,2001,Ohio,1.7,-1.2
셋,2002,Ohio,3.6,
넷,2001,Nevada,2.4,-1.5
다섯,2002,Nevada,2.9,-1.7


In [66]:
# 칼럼 추가는 값 할당으로 가능
frame2[u'동부'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,동부
하나,2000,Ohio,1.5,,True
둘,2001,Ohio,1.7,-1.2,True
셋,2002,Ohio,3.6,,True
넷,2001,Nevada,2.4,-1.5,False
다섯,2002,Nevada,2.9,-1.7,False


In [67]:
del frame2[u'동부'] ## memory 해제는 gc가 나중에 알아서
frame2

Unnamed: 0,year,state,pop,debt
하나,2000,Ohio,1.5,
둘,2001,Ohio,1.7,-1.2
셋,2002,Ohio,3.6,
넷,2001,Nevada,2.4,-1.5
다섯,2002,Nevada,2.9,-1.7


In [68]:
# 다른 형태의 dict를 dataframe으로 바꾸는 방법
pop = {'Nevada' : {2001:2.4, 2002:2.9}, 'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [69]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [100]:
p_data = {'James' : {u'국어' : 90, u'수학' : 80, u'과학' : 85},
         'Allan' : {u'국어' : 78, u'수학' : 92, u'과학' : 95},
         'Mindy' : {u'국어' : 66, u'수학' : 79, u'과학' : 99}}
frame4 = DataFrame(p_data, index=[u'국어', u'수학', u'과학'])
frame4 = frame4.T
frame4

Unnamed: 0,국어,수학,과학
Allan,78,92,95
James,90,80,85
Mindy,66,79,99


In [101]:
frame4[u'평균'] = 0

In [102]:
frame4[u'성별'] = [u'남자', u'남자', u'여자']

In [110]:
frame4[u'여성'] = frame4[u'성별'] == u'남자'

In [112]:
del frame4[u'성별']

In [131]:
frame4[u'평균'] = (frame4[u'국어'] + frame4[u'수학'] + frame4[u'과학']) / 3

In [132]:
frame4

Unnamed: 0,국어,수학,과학,평균,여성
Allan,78,92,95,88.333333,True
James,90,80,85,85.0,True
Mindy,66,79,99,81.333333,False


## 색인 객체

In [137]:
obj = Series(range(3), index=['a','b','c'])
idx = obj.index

In [138]:
obj2 = Series([1.5, -2.5, 0], index=idx)
obj2

a    1.5
b   -2.5
c    0.0
dtype: float64

In [139]:
obj.index is obj2.index

True

In [140]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [142]:
obj2 = obj.reindex(['a','b','c','d','e'], fill_value=0) # 초기화 값
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [147]:
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [152]:
obj3.reindex(range(6), method='ffill') # forward fill (없는 값은 그 전 값으로 채움)

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

## DataFrame 재색인

In [153]:
data = np.arange(9).reshape((3,3)) # 3 * 3 행렬로 만드는 구문
frame = DataFrame(data, index=['a','c','d'], columns=['Ohio','Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [156]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [157]:
frame3 = frame.reindex([1,2,3,4,5])
frame3

Unnamed: 0,Ohio,Texas,California
1,,,
2,,,
3,,,
4,,,
5,,,


In [158]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [162]:
oo = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
oo

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [165]:
oo2 = Series(oo.values, index=['z','x','y','w'])
oo2

z    4.5
x    7.2
y   -5.3
w    3.6
dtype: float64

In [202]:
s_data = {'sj' : np.random.randint(100,200, size=10), 'macro' : np.random.randint(100,200, size=10)}
s_df = DataFrame(s_data)
s_df['melon'] = np.random.randint(100,200, size=10)
s_df = s_df.reindex(columns=['macro','melon', 'sj','codebasic'])
s_df['codebasic'][5:10] = np.random.randint(100,200, size=5)
s_df.ix[s_df['codebasic'].notnull()]

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,macro,melon,sj,codebasic
5,108,168,111,177
6,109,160,147,105
7,177,180,159,191
8,101,175,187,164
9,159,137,122,136


In [203]:
obj=Series(np.arange(5.), index=['a','b','c','d','e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: float64

In [206]:
print(obj.drop('c'))
print(obj.drop(['c','d']))
print(obj)

a    0
b    1
d    3
e    4
dtype: float64
a    0
b    1
e    4
dtype: float64
a    0
b    1
c    2
d    3
e    4
dtype: float64


In [208]:
data = np.arange(16).reshape((4,4))
frame = DataFrame(data, index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one','two','trois','quatre'])
frame

Unnamed: 0,one,two,trois,quatre
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [210]:
frame.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,trois,quatre
Utah,8,9,10,11
New York,12,13,14,15


In [212]:
frame.drop(['one','two'], axis=1) ## axis 0은 행방향, axis 1은 열방향

Unnamed: 0,trois,quatre
Ohio,2,3
Colorado,6,7
Utah,10,11
New York,14,15


## 색인, 선택, 필터
obj[<선택조건>]

In [213]:
obj = Series(np.arange(4.0), index=list('abcd'))
obj

a    0
b    1
c    2
d    3
dtype: float64

In [215]:
list('abcd')

['a', 'b', 'c', 'd']

In [218]:
obj['b':'c'] ## 알파벳 순서가 아니라 해당 index의 번호를 찾고 계산함, 단 문자기반 index는 마지막 문자도 포함

b    1
c    2
dtype: float64

In [225]:
obj = Series(range(5), index=['d','e','b','a','c'])
obj['b':'c']

b    2
a    3
c    4
dtype: int64

In [226]:
data = DataFrame(np.arange(16).reshape((4,4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one','two','three','four'])

In [227]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


열 방향으로 선택되어야 하는거 아닌가?

In [249]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [237]:
data.ix['Colorado', ['two', 'three']]  ## Serie로 return

two      5
three    6
Name: Colorado, dtype: int32

In [238]:
data.ix[['Colorado','Ohio'], ['two', 'three']] ## DataFrame으로 return

Unnamed: 0,two,three
Colorado,5,6
Ohio,1,2


In [229]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [270]:
p_frame = DataFrame(p_data, index=[u'국어', u'수학', u'과학'])
p_frame.T[p_frame.ix[u'수학'] > 80]

Unnamed: 0,국어,수학,과학
Allan,78,92,95


In [272]:
p_frame.T[p_frame.ix[u'수학'] > 80]
p_frame.ix[:, p_frame.ix[u'수학'] > 80]

Unnamed: 0,Allan
국어,78
수학,92
과학,95


In [275]:
print(p_frame.ix[u'수학'] > 80)
type(p_frame.ix[u'수학'] > 80)

Allan     True
James    False
Mindy    False
Name: 수학, dtype: bool


pandas.core.series.Series

In [269]:
p_frame.ix[[u'수학', u'과학'],['Allan', 'James']]

Unnamed: 0,Allan,James
수학,92,80
과학,95,85


In [276]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a','c','d','e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a','c','e','f', 'g'])

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [277]:
s1 + s2 # 하나라도 없으면 NaN

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [278]:
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)), columns=list('abcde'))

In [279]:
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [280]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [281]:
df1 + df2 # 역시 없는것은 NaN으로 처리

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [290]:
df1.add(df2, fill_value=0) # NaN 인 경우 0으로 처리

Unnamed: 0,a,b,c,d,e
0,0,2,4,6,4
1,9,11,13,15,9
2,18,20,22,24,14
3,15,16,17,18,19


In [288]:
df3 = df1 + df2
df3[list('abcd')]

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,6.0
1,9.0,11.0,13.0,15.0
2,18.0,20.0,22.0,24.0
3,,,,


In [289]:
frame = DataFrame(np.random.randn(4,3), columns=list('bde'), index=['Utah','Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,1.717749,1.181449,0.735933
Ohio,-0.323541,-0.040612,1.39578
Texas,-0.205837,0.278862,-0.18162
Oregon,1.511122,-1.568461,-1.941382


In [291]:
series = frame.ix[0]
series

b    1.717749
d    1.181449
e    0.735933
Name: Utah, dtype: float64

In [296]:
series - frame

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,2.04129,1.222062,-0.659847
Texas,1.923585,0.902587,0.917553
Oregon,0.206627,2.749911,2.677316


In [297]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,-2.04129,-1.222062,0.659847
Texas,-1.923585,-0.902587,-0.917553
Oregon,-0.206627,-2.749911,-2.677316


In [300]:
series2 = Series(range(3), index=['b','e','f']) # Series의 Name이 없는 경우 모든 행에 더해진다
frame + series2

Unnamed: 0,b,d,e,f
Utah,1.717749,,1.735933,
Ohio,-0.323541,,2.39578,
Texas,-0.205837,,0.81838,
Oregon,1.511122,,-0.941382,


In [301]:
series3 = Series(range(1,4), index=['b','e','f'])
frame + series3

Unnamed: 0,b,d,e,f
Utah,2.717749,,2.735933,
Ohio,0.676459,,3.39578,
Texas,0.794163,,1.81838,
Oregon,2.511122,,0.058618,


In [303]:
series4 = frame['d']
series4

Utah      1.181449
Ohio     -0.040612
Texas     0.278862
Oregon   -1.568461
Name: d, dtype: float64

Series의 인덱스를 frame의 인덱스를 간주하고 연산 수행

In [304]:
frame - series4

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [332]:
scores = np.random.randint(0, 100, size=(10, 5)) # 10 * 5
frame_scores = DataFrame(scores, index=range(1, 11), columns=range(101, 106))
frame_scores

Unnamed: 0,101,102,103,104,105
1,48,86,7,48,62
2,13,69,50,18,15
3,96,20,8,79,20
4,41,78,5,9,41
5,75,93,24,14,80
6,32,82,97,25,4
7,41,82,68,32,39
8,5,44,28,55,44
9,43,89,31,27,66
10,75,91,48,11,38


In [333]:
# ss = Series(np.random.randint(-10,10, size=10), index=range(1,11))
frame_scores[102] = frame_scores[102] + Series(np.random.randint(-10,10, size=10), index=frame_scores.index)

Unnamed: 0,101,102,103,104,105
1,49,87,8,49,63
2,8,64,45,13,10
3,93,17,5,76,17
4,40,77,4,8,40
5,68,86,17,7,73
6,37,87,102,30,9
7,40,81,67,31,38
8,12,51,35,62,51
9,36,82,24,20,59
10,79,95,52,15,42


In [326]:
Series(np.random.randint(-10,10, size=10), index=frame_scores.index, name=102)

1     3
2     8
3    -1
4    -4
5     4
6    -6
7    -6
8     6
9     9
10   -4
Name: 102, dtype: int32