In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Object Creation(객체 생성)

값의 리스트를 전달하여 Series를 만들면, pandas가 기본값으로 정수 인덱스를 자동 생성합니다.

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])

In [3]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

numpy 배열을 전달하고, datetime 인덱스와 레이블이 붙은 열을 지정하여 데이터프레임을 만듭니다.

In [4]:
dates = pd.date_range('20130101', periods=6)

In [5]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-05,-1.186304,-0.724697,0.208976,1.01763
2013-01-06,-1.121361,0.226982,-0.301775,0.932423


In [13]:
# Build a DataFrame from per-column specs of mixed types; the scalar entries
# (A, B, F) are repeated to match the length of the array-like columns (C, D, E).
df2 = pd.DataFrame(
    dict(
        A=1.,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['test', 'train', 'test', 'train']),
        F='foo',
    )
)

In [14]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [15]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# 2. Viewing Data(데이터 확인하기)

데이터프레임의 가장 윗줄과 마지막 줄을 확인하고 싶을 때에 사용하는 방법은 다음과 같습니다.

In [17]:
df.tail(3) #끝에서 마지막 3줄을 불러옴

Unnamed: 0,A,B,C,D
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-05,-1.186304,-0.724697,0.208976,1.01763
2013-01-06,-1.121361,0.226982,-0.301775,0.932423


In [18]:
df.tail() #끝에서 마지막 5줄을 불러옴 default값이 5이다.

Unnamed: 0,A,B,C,D
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-05,-1.186304,-0.724697,0.208976,1.01763
2013-01-06,-1.121361,0.226982,-0.301775,0.932423


In [19]:
df.head() #위에서 5줄을 불러옴. default값이 5이다.

Unnamed: 0,A,B,C,D
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-05,-1.186304,-0.724697,0.208976,1.01763


In [20]:
#인덱스(index), 열(columns) 그리고 numpy 데이터에 대한 세부 정보를 봅니다.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
df.values

array([[-1.52660831, -2.80593821, -0.58882168,  0.37155264],
       [-2.10154286, -1.23383772,  0.34529985,  1.46288769],
       [-0.99404346, -1.88724239, -0.89651561,  0.57792516],
       [-0.05006497, -0.47519749,  0.45111308,  0.06738849],
       [-1.18630368, -0.72469748,  0.20897591,  1.01763042],
       [-1.12136056,  0.22698201, -0.30177467,  0.93242341]])

In [23]:
#describe()는 데이터의 대략적인 정보 요약을 보여줍니다.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-1.163321,-1.149989,-0.130287,0.738301
std,0.675651,1.079352,0.548831,0.500216
min,-2.101543,-2.805938,-0.896516,0.067388
25%,-1.441532,-1.723891,-0.51706,0.423146
50%,-1.153832,-0.979268,-0.046399,0.755174
75%,-1.025873,-0.537572,0.311219,0.996329
max,-0.050065,0.226982,0.451113,1.462888


In [24]:
#데이터를 전치합니다.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.526608,-2.101543,-0.994043,-0.050065,-1.186304,-1.121361
B,-2.805938,-1.233838,-1.887242,-0.475197,-0.724697,0.226982
C,-0.588822,0.3453,-0.896516,0.451113,0.208976,-0.301775
D,0.371553,1.462888,0.577925,0.067388,1.01763,0.932423


In [28]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-05,-1.186304,-0.724697,0.208976,1.01763
2013-01-06,-1.121361,0.226982,-0.301775,0.932423


In [27]:
#축 별로 정렬합니다.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.371553,-0.588822,-2.805938,-1.526608
2013-01-02,1.462888,0.3453,-1.233838,-2.101543
2013-01-03,0.577925,-0.896516,-1.887242,-0.994043
2013-01-04,0.067388,0.451113,-0.475197,-0.050065
2013-01-05,1.01763,0.208976,-0.724697,-1.186304
2013-01-06,0.932423,-0.301775,0.226982,-1.121361


In [30]:
#값 별로 정렬합니다.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-05,-1.186304,-0.724697,0.208976,1.01763
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-06,-1.121361,0.226982,-0.301775,0.932423


# 3. Selection(선택)

실제 운영 코드에서는 최적화된 pandas 데이터 접근 메소드인 .at, .iat, .loc 및 .iloc를 사용하는 것을 추천합니다.

## Getting(데이터 얻기)


In [32]:
#df.A와 동일한 Series를 생성하는 단일 열을 선택합니다.
df['A']

2013-01-01   -1.526608
2013-01-02   -2.101543
2013-01-03   -0.994043
2013-01-04   -0.050065
2013-01-05   -1.186304
2013-01-06   -1.121361
Freq: D, Name: A, dtype: float64

In [33]:
#행을 분할하는 []를 통해 선택합니다.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925


In [34]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-04,-0.050065,-0.475197,0.451113,0.067388


## Selection by Label(Label을 통한 선택)

In [36]:
#라벨을 사용하여 횡단면을 얻습니다.
df.loc[dates[0]]

A   -1.526608
B   -2.805938
C   -0.588822
D    0.371553
Name: 2013-01-01 00:00:00, dtype: float64

In [38]:
#라벨을 사용하여 여러 축(의 데이터)을 얻습니다
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.526608,-2.805938
2013-01-02,-2.101543,-1.233838
2013-01-03,-0.994043,-1.887242
2013-01-04,-0.050065,-0.475197
2013-01-05,-1.186304,-0.724697
2013-01-06,-1.121361,0.226982


In [39]:
#양쪽 종단점을 포함한 라벨 슬라이싱을 봅니다.
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-2.101543,-1.233838
2013-01-03,-0.994043,-1.887242
2013-01-04,-0.050065,-0.475197


In [40]:
#반환되는 객체의 차원을 줄입니다.
df.loc['20130102',['A','B']]

A   -2.101543
B   -1.233838
Name: 2013-01-02 00:00:00, dtype: float64

In [41]:
#스칼라 값을 얻습니다.
df.loc[dates[0],'A']

-1.5266083117770506

In [42]:
#스칼라 값을 더 빠르게 구하는 방법입니다(앞선 메소드와 동일합니다.)
df.at[dates[0],'A']

-1.5266083117770506

## Selection by Position(위치로 선택하기)

In [43]:
#넘겨받은 정수의 위치를 기준으로 선택합니다.
df.iloc[3]

A   -0.050065
B   -0.475197
C    0.451113
D    0.067388
Name: 2013-01-04 00:00:00, dtype: float64

In [45]:
#정수로 표기된 슬라이스들을 통해, numpy/python과 유사하게 동작합니다.
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.050065,-0.475197
2013-01-05,-1.186304,-0.724697


In [46]:
# Select by lists of integer positions, in the same style as numpy/python.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-2.101543,0.3453
2013-01-03,-0.994043,-0.896516
2013-01-05,-1.186304,0.208976


In [47]:
#명시적으로 행을 나누고자 하는 경우입니다.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925


In [48]:
#명시적으로 열을 나누고자 하는 경우입니다.
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-2.805938,-0.588822
2013-01-02,-1.233838,0.3453
2013-01-03,-1.887242,-0.896516
2013-01-04,-0.475197,0.451113
2013-01-05,-0.724697,0.208976
2013-01-06,0.226982,-0.301775


In [50]:
#명시적으로 (특정한)값을 얻고자 하는 경우입니다.
df.iloc[1,1]

-1.2338377212880878

In [51]:
#스칼라 값을 빠르게 얻는 방법입니다.(위의 방식과 동일합니다.)
df.iat[1,1]

-1.2338377212880878

## Boolean Indexing

In [54]:
#데이터를 선택하기 위해 단일 열의 값을 사용합니다.
df[df.A < 0]

Unnamed: 0,A,B,C,D
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553
2013-01-02,-2.101543,-1.233838,0.3453,1.462888
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925
2013-01-04,-0.050065,-0.475197,0.451113,0.067388
2013-01-05,-1.186304,-0.724697,0.208976,1.01763
2013-01-06,-1.121361,0.226982,-0.301775,0.932423


In [55]:
#Boolean 조건을 충족하는 데이터프레임에서 값을 선택합니다.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.371553
2013-01-02,,,0.3453,1.462888
2013-01-03,,,,0.577925
2013-01-04,,,0.451113,0.067388
2013-01-05,,,0.208976,1.01763
2013-01-06,,0.226982,,0.932423


In [58]:
# Prepare data for filtering with isin(): derive a new frame from df that
# carries an extra label column E, then display it.
df2 = df.assign(E=['one','ono','two','three','four','three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553,one
2013-01-02,-2.101543,-1.233838,0.3453,1.462888,ono
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925,two
2013-01-04,-0.050065,-0.475197,0.451113,0.067388,three
2013-01-05,-1.186304,-0.724697,0.208976,1.01763,four
2013-01-06,-1.121361,0.226982,-0.301775,0.932423,three


In [60]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925,two
2013-01-05,-1.186304,-0.724697,0.208976,1.01763,four


## Setting(설정)

In [61]:
# Setting a new column from a Series automatically aligns the data by index.
s1 = pd.Series(range(1, 7), index=pd.date_range(start='20130102', periods=6))

In [62]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [63]:
df['F'] = s1

In [64]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.526608,-2.805938,-0.588822,0.371553,
2013-01-02,-2.101543,-1.233838,0.3453,1.462888,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,0.577925,2.0
2013-01-04,-0.050065,-0.475197,0.451113,0.067388,3.0
2013-01-05,-1.186304,-0.724697,0.208976,1.01763,4.0
2013-01-06,-1.121361,0.226982,-0.301775,0.932423,5.0


In [65]:
#라벨에 의해 값을 설정합니다.
df.at[dates[0],'A'] = 0

In [67]:
#위치에 의해 값을 설정합니다.
df.iat[0,1] = 0

In [68]:
#Numpy 배열에 사용한 할당에 의해 값을 설정합니다.
df.loc[:,'D'] = np.array([5] * len(df))

In [69]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.588822,5,
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,5,2.0
2013-01-04,-0.050065,-0.475197,0.451113,5,3.0
2013-01-05,-1.186304,-0.724697,0.208976,5,4.0
2013-01-06,-1.121361,0.226982,-0.301775,5,5.0


In [70]:
#where 연산을 설정합니다.
df2 = df.copy()

In [71]:
df2[df2 > 0] = -df2

In [72]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.588822,-5,
2013-01-02,-2.101543,-1.233838,-0.3453,-5,-1.0
2013-01-03,-0.994043,-1.887242,-0.896516,-5,-2.0
2013-01-04,-0.050065,-0.475197,-0.451113,-5,-3.0
2013-01-05,-1.186304,-0.724697,-0.208976,-5,-4.0
2013-01-06,-1.121361,-0.226982,-0.301775,-5,-5.0


# 4. Missing Data(결측치)
Pandas는 결측치를 표현하기 위해 주로 np.nan 값을 사용합니다.
결측치는 기본적으로 계산에 포함되지 않습니다.

Reindexing으로 지정된 축 상의 인덱스를 변경/추가/삭제할 수 있습니다.
Reindexing은 데이터의 복사본을 반환합니다.

In [73]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [74]:
df1.loc[dates[0]:dates[1], 'E'] = 1

In [75]:
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.588822,5,,1.0
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,5,2.0,
2013-01-04,-0.050065,-0.475197,0.451113,5,3.0,


In [76]:
del df1['F']

In [77]:
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.588822,5,1.0
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,5,
2013-01-04,-0.050065,-0.475197,0.451113,5,


In [78]:
#결측치를 가지고 있는 행들을 지웁니다.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.588822,5,1.0
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0


In [79]:
#결측치를 채워 넣습니다.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.588822,5,1.0
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,5,5.0
2013-01-04,-0.050065,-0.475197,0.451113,5,5.0


In [80]:
#nan인 값에 boolean을 통한 표식을 얻습니다.
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# 5. Operation(연산)

## Stats(통계)
일반적으로 결측치를 제외한 후 연산됩니다.
기술통계를 수행합니다.

In [82]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.588822,5,
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,5,2.0
2013-01-04,-0.050065,-0.475197,0.451113,5,3.0
2013-01-05,-1.186304,-0.724697,0.208976,5,4.0
2013-01-06,-1.121361,0.226982,-0.301775,5,5.0


In [81]:
df.mean()

A   -0.908886
B   -0.682332
C   -0.130287
D    5.000000
F    3.000000
dtype: float64

In [83]:
# Perform the same reduction along the other axis: one mean per row.
df.mean(axis=1)

2013-01-01    1.102795
2013-01-02    0.601984
2013-01-03    0.644440
2013-01-04    1.585170
2013-01-05    1.459595
2013-01-06    1.760769
Freq: D, dtype: float64

In [84]:
# Operate on objects of different dimensionality that need index alignment;
# pandas automatically broadcasts along the specified dimension.
# "Broadcast" is a term that comes from numpy: it names the rules that determine
# the result when operating on n-dimensional arrays or scalar values.

s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

In [85]:
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [86]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-1.994043,-2.887242,-1.896516,4.0,1.0
2013-01-04,-3.050065,-3.475197,-2.548887,2.0,0.0
2013-01-05,-6.186304,-5.724697,-4.791024,0.0,-1.0
2013-01-06,,,,,


In [87]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.588822,5,
2013-01-02,-2.101543,-1.233838,0.3453,5,1.0
2013-01-03,-0.994043,-1.887242,-0.896516,5,2.0
2013-01-04,-0.050065,-0.475197,0.451113,5,3.0
2013-01-05,-1.186304,-0.724697,0.208976,5,4.0
2013-01-06,-1.121361,0.226982,-0.301775,5,5.0


## Apply(적용)

In [88]:
#데이터에 함수를 적용합니다.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.588822,5,
2013-01-02,-2.101543,-1.233838,-0.243522,10,1.0
2013-01-03,-3.095586,-3.12108,-1.140037,15,3.0
2013-01-04,-3.145651,-3.596278,-0.688924,20,6.0
2013-01-05,-4.331955,-4.320975,-0.479948,25,10.0
2013-01-06,-5.453316,-4.093993,-0.781723,30,15.0


In [90]:
df.apply(lambda x : x.max() - x.min())

A    2.101543
B    2.114224
C    1.347629
D    0.000000
F    4.000000
dtype: float64

## Histogramming(히스토그래밍)

In [91]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [92]:
s

0    0
1    0
2    4
3    3
4    2
5    1
6    6
7    4
8    2
9    5
dtype: int32

In [93]:
s.value_counts()

4    2
2    2
0    2
6    1
5    1
3    1
1    1
dtype: int64

## String Methods(문자열 메소드)

In [95]:
#Series는 다음의 코드와 같이 문자열 처리 메소드 모음(set)을 가지고 있습니다.
#이 모음은 배열의 각 요소를 쉽게 조작할 수 있도록 만들어주는 문자열의 속성에 포함되어 있습니다.

#문자열의 패턴 일치 확인은 기본적으로 정규 표현식을 사용하며, 
#몇몇 경우에는 항상 정규 표현식을 사용함에 유의하십시오.

s = pd.Series(['A','B','C','Aaba','Baca',np.nan, 'CABA','dog','cat'])

In [96]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# 6. Merge(병합)

## Concat(연결)

pandas는 결합(join) / 병합(merge) 형태의 연산을 위해, 인덱스와 관계 대수 기능에 대한
다양한 집합 논리를 갖추고 Series, DataFrame, Panel 객체를 손쉽게 결합할 수 있는
여러 기능을 제공합니다. (Panel은 pandas 0.25에서 제거되었습니다.)

In [97]:
#concat()으로 pandas 객체를 연결합니다.
df = pd.DataFrame(np.random.randn(10,4))

In [98]:
df

Unnamed: 0,0,1,2,3
0,0.305789,-0.75574,-0.288473,0.233654
1,-0.964395,0.641588,0.929862,-1.029411
2,-0.674402,-0.805081,0.758311,-0.132261
3,-0.241276,-0.855213,-0.163748,0.080961
4,-2.533563,1.215599,0.591322,0.726512
5,0.160328,0.759939,1.529507,-0.126829
6,-0.790351,-0.506519,1.766214,-0.146569
7,-0.236095,-0.91213,-0.120553,0.784565
8,0.007812,0.733754,-0.750837,0.898038
9,1.978522,-0.595515,0.063926,-0.434014


In [99]:
#break it into pieces
pieces = [df[:3], df[3:7], df[7:]]

In [100]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.305789,-0.75574,-0.288473,0.233654
1,-0.964395,0.641588,0.929862,-1.029411
2,-0.674402,-0.805081,0.758311,-0.132261
3,-0.241276,-0.855213,-0.163748,0.080961
4,-2.533563,1.215599,0.591322,0.726512
5,0.160328,0.759939,1.529507,-0.126829
6,-0.790351,-0.506519,1.766214,-0.146569
7,-0.236095,-0.91213,-0.120553,0.784565
8,0.007812,0.733754,-0.750837,0.898038
9,1.978522,-0.595515,0.063926,-0.434014


In [101]:
pieces

[          0         1         2         3
 0  0.305789 -0.755740 -0.288473  0.233654
 1 -0.964395  0.641588  0.929862 -1.029411
 2 -0.674402 -0.805081  0.758311 -0.132261,
           0         1         2         3
 3 -0.241276 -0.855213 -0.163748  0.080961
 4 -2.533563  1.215599  0.591322  0.726512
 5  0.160328  0.759939  1.529507 -0.126829
 6 -0.790351 -0.506519  1.766214 -0.146569,
           0         1         2         3
 7 -0.236095 -0.912130 -0.120553  0.784565
 8  0.007812  0.733754 -0.750837  0.898038
 9  1.978522 -0.595515  0.063926 -0.434014]

## Join(결합)

SQL 방식으로 병합합니다.

In [102]:
left = pd.DataFrame({'key': ['foo','foo'], 'lval': [1,2]})

In [103]:
right = pd.DataFrame({'key': ['foo','foo'], 'rval' : [4, 5]})

In [104]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [105]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [106]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [107]:
#다른 예시
left = pd.DataFrame({'key' : ['foo','bar'], 'lval' : [1, 2]})

In [108]:
right = pd.DataFrame({'key' : ['foo', 'bar'], 'rval' : [4, 5]})

In [109]:
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [110]:
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [111]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Append(추가)

데이터프레임에 행을 추가합니다.

In [112]:
df = pd.DataFrame(np.random.randn(8, 4), columns = ['A', 'B', 'C', 'D'])

In [113]:
df

Unnamed: 0,A,B,C,D
0,0.864595,1.544583,-0.568863,0.450888
1,-0.826111,1.384692,-1.84349,-0.68314
2,1.105228,-2.245769,-1.033898,0.081351
3,-2.23064,0.551372,-0.615315,0.082453
4,-0.025385,-1.378332,0.861023,-0.940017
5,-0.645201,0.26687,1.443958,-1.748502
6,0.401533,-0.772384,-0.836505,1.743516
7,1.740851,-0.461395,1.090703,-1.539021


In [114]:
s = df.iloc[3]

In [115]:
s

A   -2.230640
B    0.551372
C   -0.615315
D    0.082453
Name: 3, dtype: float64

In [116]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so build a
# one-row frame from the Series and concatenate it to add the row instead.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.864595,1.544583,-0.568863,0.450888
1,-0.826111,1.384692,-1.84349,-0.68314
2,1.105228,-2.245769,-1.033898,0.081351
3,-2.23064,0.551372,-0.615315,0.082453
4,-0.025385,-1.378332,0.861023,-0.940017
5,-0.645201,0.26687,1.443958,-1.748502
6,0.401533,-0.772384,-0.836505,1.743516
7,1.740851,-0.461395,1.090703,-1.539021
8,-2.23064,0.551372,-0.615315,0.082453
