## EDA를 위한 다양한 pandas function 연습  
---
참고링크 : https://pandas.pydata.org/docs/user_guide/10min.html  
한글 : https://dandyrilla.github.io/2017-08-12/pandas-10min/

API ref : https://pandas.pydata.org/docs/reference/index.html

In [1]:
import numpy as np
import pandas as pd

### series and dataFrame 

차이점을 기억해 두자.  
Series : 1차원 배열 (index + values)  
DataFrame : 2차원 배열 (index + columns + values)  

실제 data 특성에 맞는 샘플 데이터를 만들수 있어야 한다.   
해당 샘플 데이터로 각 연산 결과를 미리 확인 할 수 있어야 한다.   
print(type()) 을 이용하여 object 의 타입을 정확히 파악해야 한다.  
( series[index_label] vs dataframe[column_label] )  
answer.ipynb 와 test.ipynb, sample.ipynb 으로 운영한다.  

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of standard-normal sample values indexed by `dates`.
# A seeded Generator is used so the sample values are the same on every run
# and the results of later cells can be compared between runs.
sample_rng = np.random.default_rng(0)
df = pd.DataFrame(sample_rng.standard_normal((6, 4)),
                   index=dates,
                   columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.147133,-0.817674,-0.344967,-0.955441
2013-01-02,0.10473,-1.154512,0.49938,1.010028
2013-01-03,-0.730373,-0.164331,0.847347,0.18018
2013-01-04,-0.700296,-0.5874,0.216813,-0.823385
2013-01-05,-0.426451,1.33792,-1.047702,1.11401
2013-01-06,-0.543087,-0.399717,0.234636,-0.395774


dataFrame :   
DataFrame은 단순한 2차원 배열 데이터라기보다, 공통 인덱스를 가진 column Series들을 딕셔너리처럼 묶어 놓은 것으로 이해하는 편이 좋다.

In [5]:
# Build a frame from a dict: every key becomes a column. The scalar values
# ("A", "B", "F") are broadcast to the four rows implied by the Series,
# array and Categorical values, and each column keeps its own dtype.
column_data = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.full(4, 3, dtype="int32"),
    "E": pd.Categorical(["test", "train"] * 2),
    "F": "foo",
}
df2 = pd.DataFrame(column_data)

df2
# Other ways to display the whole frame:
# df2[:]
# df2[:][:]
# df2.loc[:,:]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 2. Viewing Data

In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
print(df.head(2),'\n')

print(df.tail(2), '\n')

print(df.index, '\n')

print(df.columns, '\n')

df.describe()


                   A         B         C         D
2013-01-01  0.147133 -0.817674 -0.344967 -0.955441
2013-01-02  0.104730 -1.154512  0.499380  1.010028 

                   A         B         C         D
2013-01-05 -0.426451  1.337920 -1.047702  1.114010
2013-01-06 -0.543087 -0.399717  0.234636 -0.395774 

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D') 

Index(['A', 'B', 'C', 'D'], dtype='object') 



Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.358057,-0.297619,0.067584,0.021603
std,0.390909,0.870847,0.672118,0.8987
min,-0.730373,-1.154512,-1.047702,-0.955441
25%,-0.660994,-0.760105,-0.204522,-0.716482
50%,-0.484769,-0.493558,0.225724,-0.107797
75%,-0.028065,-0.223177,0.433194,0.802566
max,0.147133,1.33792,0.847347,1.11401


In [8]:
# 행, 열을 바꾼다. 

df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.147133,0.10473,-0.730373,-0.700296,-0.426451,-0.543087
B,-0.817674,-1.154512,-0.164331,-0.5874,1.33792,-0.399717
C,-0.344967,0.49938,0.847347,0.216813,-1.047702,0.234636
D,-0.955441,1.010028,0.18018,-0.823385,1.11401,-0.395774


In [9]:
df.to_numpy()

array([[ 0.14713273, -0.81767368, -0.34496697, -0.95544058],
       [ 0.1047299 , -1.15451249,  0.49937985,  1.0100282 ],
       [-0.73037276, -0.16433068,  0.84734669,  0.1801797 ],
       [-0.70029631, -0.58739966,  0.21681258, -0.82338522],
       [-0.42645097,  1.33791978, -1.04770226,  1.11400961],
       [-0.54308684, -0.3997169 ,  0.2346362 , -0.39577431]])

In [10]:
# Keep only the int64 columns. df was built from np.random.randn, so every
# column is float64 and the result keeps the index but has no columns.
df.select_dtypes(include='int64')

2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06


value_counts()

df.value_counts()   : 값이 2 이상(1 초과)이면 동일한 row 가 존재한다. (전부 1이면 모든 row 가 서로 다르며, 실제 row 개수만큼 결과가 나온다.)  
series.value_counts() : 값이 2 이상(1 초과)이면 동일한 값이 존재한다.

### 3. Selection 

.             : ```df.col```  
[]            : ```df[column]```         , * df[row_label: ] 은 사용하지 말자.  
loc (or at)   : df.loc[index_label,] 은 조건일 때만 사용     , ```df.loc[ : , col_label]```    
iloc (or iat) : ```df.iloc[몇번째 row]``` , df.iloc[ : , 몇번째 col]     


In [11]:
print(df['A'],'\n')
print(df[:]['A'],'\n')              # same result as df.loc[:,'A']
# df['2013-01-01':]['A']            # row slice followed by a column lookup (chained indexing)
# df.A

## A single column label selects that column and returns a Series.


print(df[['A']],'\n')               # a one-element list returns a DataFrame
df[['A','C']]                       # a list of labels returns a DataFrame

## A list of labels (two or more, or a single one written as [['col']]) returns a DataFrame.


2013-01-01    0.147133
2013-01-02    0.104730
2013-01-03   -0.730373
2013-01-04   -0.700296
2013-01-05   -0.426451
2013-01-06   -0.543087
Freq: D, Name: A, dtype: float64 

2013-01-01    0.147133
2013-01-02    0.104730
2013-01-03   -0.730373
2013-01-04   -0.700296
2013-01-05   -0.426451
2013-01-06   -0.543087
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01  0.147133
2013-01-02  0.104730
2013-01-03 -0.730373
2013-01-04 -0.700296
2013-01-05 -0.426451
2013-01-06 -0.543087 



Unnamed: 0,A,C
2013-01-01,0.147133,-0.344967
2013-01-02,0.10473,0.49938
2013-01-03,-0.730373,0.847347
2013-01-04,-0.700296,0.216813
2013-01-05,-0.426451,-1.047702
2013-01-06,-0.543087,0.234636


In [39]:
# Forms that raise an error, and the slice forms that do work

# print(df[0],'\n')                # error: [] with a single key looks for a column labelled 0
# print(df['2013-01-01'],'\n')     # error: the string is looked up among the column labels
print(df['2013-01-01':'2013-01-02'],'\n')  # works: a slice inside [] is applied to the rows
print(df[:'2013-01-02'],'\n') 

                   A         B         C         D
2013-01-01  0.147133 -0.817674 -0.344967 -0.955441
2013-01-02  0.104730 -1.154512  0.499380  1.010028 

                   A         B         C         D
2013-01-01  0.147133 -0.817674 -0.344967 -0.955441
2013-01-02  0.104730 -1.154512  0.499380  1.010028 



In [13]:
# df[0:3]
# df['2013-01-01':'2013-01-03']
print(df['2013-01-04':], '\n')
print(df['2013-01-04':'2013-01-04'], '\n')     # only the 2013-01-04 row: a label slice includes its end label
print(df[3:4], '\n')                           # only the row at position 3: a positional slice excludes its stop
df[3:]

# A slice inside [] selects rows, and the result is returned as a DataFrame.

                   A         B         C         D
2013-01-04 -0.700296 -0.587400  0.216813 -0.823385
2013-01-05 -0.426451  1.337920 -1.047702  1.114010
2013-01-06 -0.543087 -0.399717  0.234636 -0.395774 

                   A       B         C         D
2013-01-04 -0.700296 -0.5874  0.216813 -0.823385 

                   A       B         C         D
2013-01-04 -0.700296 -0.5874  0.216813 -0.823385 



Unnamed: 0,A,B,C,D
2013-01-04,-0.700296,-0.5874,0.216813,-0.823385
2013-01-05,-0.426451,1.33792,-1.047702,1.11401
2013-01-06,-0.543087,-0.399717,0.234636,-0.395774


In [21]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [43]:
print(df2[:3])  # Avoid: first three rows by position, but easy to misread as a column selection.
# At a glance it is unclear whether this means df2.iloc[:, :3] (first three columns) or df2.iloc[:3, ] (first three rows).

print(df2.loc[:3,])  # Avoid with integer labels: slices up to and including label 3, wherever that label sits.
# With non-integer labels, e.g. df.loc[:'Seoul', ], a label slice reads unambiguously.

df2.iloc[:3,]   # Preferred: explicitly the first three rows by position.

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [49]:
# Chained indexing: each [] is applied to the result of the previous one.
print(df2[2:4]['A'],'\n')     # rows at positions 2-3, then column 'A' -> Series
print(df2[2:4][0:4], '\n')     # row slice of a row slice; only two rows exist, so both are returned
df2[2:4][0:1]                 # from the two-row frame (labels 2, 3), take its first row by position

2    1.0
3    1.0
Name: A, dtype: float64 

     A          B    C  D      E    F
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo 



Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo


In [50]:
df2.iloc[0]     # 암묵적 인덱스 0 번째 선택

A                    1.0
B    2013-01-02 00:00:00
C                    1.0
D                      3
E                   test
F                    foo
Name: 0, dtype: object

In [51]:
df2.loc[2]        # 명시적 인덱스 2 선택

A                    1.0
B    2013-01-02 00:00:00
C                    1.0
D                      3
E                   test
F                    foo
Name: 2, dtype: object

Row

df.shape[0] = row 행갯수

df.iloc[몇번째 row,]  
df.loc[row_label,]  ```---> row_label 대신 조건절이 들어가면, 조건절은 index label 기준으로 정렬되는 boolean Series 를 만들고 True 인 row 만 선택된다.```

으로만 사용


In [52]:
df2.shape, df2.shape[0], df2.shape[1]

((4, 6), 4, 6)

In [53]:
# row label (명시적, 몇번) 을 이용해서 row 을 인덱싱
# index 가 int 인 경우 df.loc[index_int] 도 가능, 
print(df.loc['20130101'],'\n')              # series
print(df.loc['20130101':'20130102'],'\n')   # dataFrame
print(df.loc[['20130101','20130102']],'\n')

# row location (암묵적 , 몇번째) index 을 이용해서 row 인덱싱
print(df.iloc[0],'\n')                      # series
print(df.iloc[0:1],'\n')                    #dataFrame
print(df.iloc[[0,1]],'\n')



A   -0.259156
B    0.750137
C    1.109735
D   -0.300223
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.259156  0.750137  1.109735 -0.300223
2013-01-02  1.915858  1.213006  0.308316 -0.323373 

                   A         B         C         D
2013-01-01 -0.259156  0.750137  1.109735 -0.300223
2013-01-02  1.915858  1.213006  0.308316 -0.323373 

A   -0.259156
B    0.750137
C    1.109735
D   -0.300223
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.259156  0.750137  1.109735 -0.300223 

                   A         B         C         D
2013-01-01 -0.259156  0.750137  1.109735 -0.300223
2013-01-02  1.915858  1.213006  0.308316 -0.323373 



Column

df.shape[1] = columns 열갯수

특정 column 이 몇번째 column 인지는 어떻게 찾지? → `df.columns.get_loc(col_label)` 로 위치(0부터 시작)를 얻을 수 있다.

In [54]:

# column lable 을 이용해서 column 인덱싱
# print(df.loc['A'],'\n')      # 'A' 을 row 인텍싱으로 생각해서 
print(df.loc[:,'A'],'\n')
print(df.loc[:, ['A']],'\n')
print(df.loc[:,'A':'B'],'\n')
print(df.loc[:,['A','B']],'\n')


2013-01-01   -0.259156
2013-01-02    1.915858
2013-01-03    1.158846
2013-01-04    1.329077
2013-01-05   -0.907403
2013-01-06    0.499070
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.259156
2013-01-02  1.915858
2013-01-03  1.158846
2013-01-04  1.329077
2013-01-05 -0.907403
2013-01-06  0.499070 

                   A         B
2013-01-01 -0.259156  0.750137
2013-01-02  1.915858  1.213006
2013-01-03  1.158846  0.584409
2013-01-04  1.329077 -0.213619
2013-01-05 -0.907403  1.293912
2013-01-06  0.499070 -1.455961 

                   A         B
2013-01-01 -0.259156  0.750137
2013-01-02  1.915858  1.213006
2013-01-03  1.158846  0.584409
2013-01-04  1.329077 -0.213619
2013-01-05 -0.907403  1.293912
2013-01-06  0.499070 -1.455961 



In [55]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.259156,0.750137,1.109735,-0.300223
2013-01-02,1.915858,1.213006,0.308316,-0.323373
2013-01-03,1.158846,0.584409,-0.131491,-1.040146
2013-01-04,1.329077,-0.213619,-0.29708,0.573898
2013-01-05,-0.907403,1.293912,1.334146,1.802053


In [56]:

# column locatin (몇번째) index 을 이용해서 column 인덱싱
print(df.iloc[0],'\n')          # 0 을 row 인텍싱으로 생각해서 0번째 location row 을 인텍싱
print(df.iloc[:,0],'\n')        # 0 번째 column series 리턴
print(df.iloc[:,[0]])           # 0 번째 column df 리턴


A   -0.259156
B    0.750137
C    1.109735
D   -0.300223
Name: 2013-01-01 00:00:00, dtype: float64 

2013-01-01   -0.259156
2013-01-02    1.915858
2013-01-03    1.158846
2013-01-04    1.329077
2013-01-05   -0.907403
2013-01-06    0.499070
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.259156
2013-01-02  1.915858
2013-01-03  1.158846
2013-01-04  1.329077
2013-01-05 -0.907403
2013-01-06  0.499070


In [57]:
print(df.iloc[:,0:2],'\n')


                   A         B
2013-01-01 -0.259156  0.750137
2013-01-02  1.915858  1.213006
2013-01-03  1.158846  0.584409
2013-01-04  1.329077 -0.213619
2013-01-05 -0.907403  1.293912
2013-01-06  0.499070 -1.455961 



In [58]:
print(df.iloc[:,[0,1,3]],'\n')

                   A         B         D
2013-01-01 -0.259156  0.750137 -0.300223
2013-01-02  1.915858  1.213006 -0.323373
2013-01-03  1.158846  0.584409 -1.040146
2013-01-04  1.329077 -0.213619  0.573898
2013-01-05 -0.907403  1.293912  1.802053
2013-01-06  0.499070 -1.455961  1.120325 



Multi-indexing (row 와 column 을 동시에 선택)

In [59]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [146]:
df2.loc[0:2,'A']

0    1.0
1    1.0
2    1.0
Name: A, dtype: float64

In [147]:
print(df2.loc[[0,1],['A','B']])    # 숫자가 lable 이다 df[[0,1]] 은 오류
print(df2.iloc[[0,1],[0,1]])

     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02
     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02


In [148]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.027218,0.134867,1.290976,-0.587499
2013-01-02,1.077611,0.406736,0.699013,0.638519
2013-01-03,0.369045,1.369056,-0.297095,-1.189735
2013-01-04,-1.109311,0.979234,-1.690049,0.369267
2013-01-05,0.283622,0.754991,2.05051,-0.092561
2013-01-06,-0.775822,0.029235,1.319602,-0.49749


In [149]:
# From the rows at positions 2-5, keep those whose A value is negative,
# then take the second of the remaining rows.
tail_rows = df[2:6]
negative_a_rows = tail_rows[tail_rows.A < 0]
negative_a_rows[1:2]

Unnamed: 0,A,B,C,D
2013-01-06,-0.775822,0.029235,1.319602,-0.49749


In [150]:
print(df[2:6])
print(df[2:6].A<0)

# The boolean mask and the frame it filters must share the same index; otherwise pandas raises:
# IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

                   A         B         C         D
2013-01-03  0.369045  1.369056 -0.297095 -1.189735
2013-01-04 -1.109311  0.979234 -1.690049  0.369267
2013-01-05  0.283622  0.754991  2.050510 -0.092561
2013-01-06 -0.775822  0.029235  1.319602 -0.497490
2013-01-03    False
2013-01-04     True
2013-01-05    False
2013-01-06     True
Freq: D, Name: A, dtype: bool


Boolean indexing

```(df[])[(df[]) 조건][column]```  
```(df[]).loc[ (df[]) 조건 , column ]```

iloc 는 boolean Series 를 조건으로 직접 사용할 수 없다. (boolean numpy 배열은 가능: `df.iloc[(df.A > 0).to_numpy()]`)  
오류 메시지: iLocation based boolean indexing on an integer type is not available

In [151]:
df[df['A']>0]['B']

2013-01-01    0.134867
2013-01-02    0.406736
2013-01-03    1.369056
2013-01-05    0.754991
Name: B, dtype: float64

In [152]:
df.loc[df['A']>0,'B']


2013-01-01    0.134867
2013-01-02    0.406736
2013-01-03    1.369056
2013-01-05    0.754991
Name: B, dtype: float64

In [50]:
print(df[1:3].A > 0)  # 이거 대신
df.iloc[1:3].A > 0   # 이걸 사용하자

2013-01-02     True
2013-01-03    False
Freq: D, Name: A, dtype: bool


2013-01-02     True
2013-01-03    False
Freq: D, Name: A, dtype: bool

In [81]:
df.iloc[1:3][df.iloc[1:3].A >0]

Unnamed: 0,A,B,C,D
2013-01-02,0.10473,-1.154512,0.49938,1.010028


In [72]:
df[1:3][df[1:3].A > 0]   # chained indexing ???

Unnamed: 0,A,B,C,D
2013-01-02,0.10473,-1.154512,0.49938,1.010028


In [54]:
# df[df[1:3].A > 0] raises an error: the mask below only carries the labels of
# df[1:3], so its index does not match the index of df.
# The comparison returns a boolean Series keyed by index label (shown below),
# not a label such as '2013-01-02'; the failure comes from that index mismatch.

df[1:3].A > 0

2013-01-02     True
2013-01-03    False
Freq: D, Name: A, dtype: bool

In [101]:
# Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

# df.loc[df[1:3].A > 0] 
df[1:3].loc[df[1:3].A >0]

A    0.104730
B   -1.154512
C    0.499380
D    1.010028
Name: 2013-01-02 00:00:00, dtype: float64

In [86]:
df[1:3][df[0:4].A  > 0]['B']

  


2013-01-02   -1.154512
Freq: D, Name: B, dtype: float64

In [87]:
# Avoids "UserWarning: Boolean Series key will be reindexed to match DataFrame
# index." by building the mask from the same frame that is being filtered, so
# mask and frame share one index. A single .loc call selects the rows and the
# column together instead of chaining two [] lookups.

X = df[1:3]
X.loc[X["A"] > 0, "B"]

2013-01-02   -1.154512
Freq: D, Name: B, dtype: float64

In [89]:
df[1:3][df[1:3].A > 0]['B']
# df1 = df[1:3]
# df1.A > 0
# df1['B']

2013-01-02   -1.154512
Freq: D, Name: B, dtype: float64

function indexing

```
pipe()  : Tablewise Function Application
apply() : Row or Column-wise Function Application (df 적용, 행/열 전달 )
agg() and transform() : Aggregation API
map() : Series 에서 값에 적용한다. (Series 적용, 값 전달)
applymap() : Applying Elementwise Functions  (df 적용, 값을 전달, pandas 2.1 부터는 DataFrame.map() 사용 권장)

cut()    : 조건에 따른 구간별 label
where()  : 조건에 따른 값표시 (아닌값은 NaN)
```

In [156]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.027218,0.134867,1.290976,-0.587499
2013-01-02,1.077611,0.406736,0.699013,0.638519
2013-01-03,0.369045,1.369056,-0.297095,-1.189735
2013-01-04,-1.109311,0.979234,-1.690049,0.369267
2013-01-05,0.283622,0.754991,2.05051,-0.092561
2013-01-06,-0.775822,0.029235,1.319602,-0.49749


In [157]:
df3 = df.copy()
df3['sum_row'] = df3.apply(np.sum,axis=1)    # df3.sum(axis=1) 과 같다.
df3

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,0.027218,0.134867,1.290976,-0.587499,0.865561
2013-01-02,1.077611,0.406736,0.699013,0.638519,2.821879
2013-01-03,0.369045,1.369056,-0.297095,-1.189735,0.251272
2013-01-04,-1.109311,0.979234,-1.690049,0.369267,-1.450859
2013-01-05,0.283622,0.754991,2.05051,-0.092561,2.996562
2013-01-06,-0.775822,0.029235,1.319602,-0.49749,0.075525


In [158]:
def _label_row(row):
    """Return "Over" when the row's sum_row is positive, otherwise "Under"."""
    if row["sum_row"] > 0:
        return "Over"
    return "Under"


# axis=1 hands each row to the function as a Series.
df3["result"] = df3.apply(_label_row, axis=1)
df3

Unnamed: 0,A,B,C,D,sum_row,result
2013-01-01,0.027218,0.134867,1.290976,-0.587499,0.865561,Over
2013-01-02,1.077611,0.406736,0.699013,0.638519,2.821879,Over
2013-01-03,0.369045,1.369056,-0.297095,-1.189735,0.251272,Over
2013-01-04,-1.109311,0.979234,-1.690049,0.369267,-1.450859,Under
2013-01-05,0.283622,0.754991,2.05051,-0.092561,2.996562,Over
2013-01-06,-0.775822,0.029235,1.319602,-0.49749,0.075525,Over


In [159]:
# Label each sum_row value by the interval it falls in. pd.cut needs one
# label fewer than bin edges (one label per interval between edges).
# Open-ended outer edges are used so that no value can fall outside the bins
# and turn into NaN; intervals are right-closed, so exactly 0 is 'under'.
pd.cut(df3.sum_row, bins=[-np.inf, 0, np.inf], labels=['under','over'] )

2013-01-01     over
2013-01-02     over
2013-01-03     over
2013-01-04    under
2013-01-05     over
2013-01-06     over
Freq: D, Name: sum_row, dtype: category
Categories (2, object): ['under' < 'over']

In [160]:
df3 = df3.drop(columns='result')
df3

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,0.027218,0.134867,1.290976,-0.587499,0.865561
2013-01-02,1.077611,0.406736,0.699013,0.638519,2.821879
2013-01-03,0.369045,1.369056,-0.297095,-1.189735,0.251272
2013-01-04,-1.109311,0.979234,-1.690049,0.369267,-1.450859
2013-01-05,0.283622,0.754991,2.05051,-0.092561,2.996562
2013-01-06,-0.775822,0.029235,1.319602,-0.49749,0.075525


In [161]:
df3.where((df3.sum_row>0) & (df3.B>0))    # boolean 으로 리턴하지 않고, 값을 리턴 하다. dropna() 을 붙일수 있다.

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,0.027218,0.134867,1.290976,-0.587499,0.865561
2013-01-02,1.077611,0.406736,0.699013,0.638519,2.821879
2013-01-03,0.369045,1.369056,-0.297095,-1.189735,0.251272
2013-01-04,,,,,
2013-01-05,0.283622,0.754991,2.05051,-0.092561,2.996562
2013-01-06,-0.775822,0.029235,1.319602,-0.49749,0.075525


In [162]:
df3[(df3.sum_row>0) & (df3.B>0)]

# dropna() 까지 한 결과

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,0.027218,0.134867,1.290976,-0.587499,0.865561
2013-01-02,1.077611,0.406736,0.699013,0.638519,2.821879
2013-01-03,0.369045,1.369056,-0.297095,-1.189735,0.251272
2013-01-05,0.283622,0.754991,2.05051,-0.092561,2.996562
2013-01-06,-0.775822,0.029235,1.319602,-0.49749,0.075525


### 4. Missing 

``` 
dropna()
fillna()
notnull()
notna()   
isnull()
isna() 
```

In [163]:
df3.where((df3.sum_row>0) & (df3.B>0)).fillna(0)


Unnamed: 0,A,B,C,D,sum_row
2013-01-01,0.027218,0.134867,1.290976,-0.587499,0.865561
2013-01-02,1.077611,0.406736,0.699013,0.638519,2.821879
2013-01-03,0.369045,1.369056,-0.297095,-1.189735,0.251272
2013-01-04,0.0,0.0,0.0,0.0,0.0
2013-01-05,0.283622,0.754991,2.05051,-0.092561,2.996562
2013-01-06,-0.775822,0.029235,1.319602,-0.49749,0.075525


### 5. Operations

```
sum, mean, std, var
sort, count
apply
str
```

In [164]:
print(df.sort_index())
print(df.sort_values('A'),'\n')
# df.sort_values(by='A')
print(df.sort_values(by= ['A','B'],ascending=False))

                   A         B         C         D
2013-01-01  0.027218  0.134867  1.290976 -0.587499
2013-01-02  1.077611  0.406736  0.699013  0.638519
2013-01-03  0.369045  1.369056 -0.297095 -1.189735
2013-01-04 -1.109311  0.979234 -1.690049  0.369267
2013-01-05  0.283622  0.754991  2.050510 -0.092561
2013-01-06 -0.775822  0.029235  1.319602 -0.497490
                   A         B         C         D
2013-01-04 -1.109311  0.979234 -1.690049  0.369267
2013-01-06 -0.775822  0.029235  1.319602 -0.497490
2013-01-01  0.027218  0.134867  1.290976 -0.587499
2013-01-05  0.283622  0.754991  2.050510 -0.092561
2013-01-03  0.369045  1.369056 -0.297095 -1.189735
2013-01-02  1.077611  0.406736  0.699013  0.638519 

                   A         B         C         D
2013-01-02  1.077611  0.406736  0.699013  0.638519
2013-01-03  0.369045  1.369056 -0.297095 -1.189735
2013-01-05  0.283622  0.754991  2.050510 -0.092561
2013-01-01  0.027218  0.134867  1.290976 -0.587499
2013-01-06 -0.775822  0.029235  1.319602 -0.497490
2013-01-04 -1.109311  0.979234 -1.690049  0.369267

In [165]:
print(df.sum())          # axis=0 (default): add down the rows, giving one total per column
print(df.sum(axis=1))    # axis=1: add across the columns, giving one total per row

A   -0.127638
B    3.674119
C    3.372957
D   -1.359498
dtype: float64
2013-01-01    0.865561
2013-01-02    2.821879
2013-01-03    0.251272
2013-01-04   -1.450859
2013-01-05    2.996562
2013-01-06    0.075525
Freq: D, dtype: float64


### 6. Merging

```
concat
join
merge
```

### 7. Grouping

```
Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure

```

In [108]:
# Sample frame for the grouping examples: two key columns (A, B) and two
# numeric columns (C, D). A seeded Generator keeps the random values identical
# on every run, so results can be compared between runs.
sample_rng = np.random.default_rng(0)
df_g = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": sample_rng.standard_normal(8),
        "D": sample_rng.standard_normal(8),
    }
)

df_g

Unnamed: 0,A,B,C,D
0,foo,one,0.211901,1.420478
1,bar,one,-0.179244,-0.374105
2,foo,two,-0.96869,-0.252017
3,bar,three,1.082912,0.954083
4,foo,two,0.559591,0.851646
5,bar,two,-1.144417,0.868136
6,foo,one,0.090689,1.935267
7,foo,three,-1.586886,0.209951


df.groupby().size()

In [110]:
df_g.groupby(['A','B']).size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

value_counts()

In [175]:
df_g.groupby(['A','B'])['C'].value_counts()

# Chaining .reset_index() directly on this result failed here: the counts
# Series is itself named 'C', which collides with the 'C' index level.
# Renaming the Series first avoids the collision:
# new_df_g = df_g.groupby(['A','B'])['C'].value_counts()
# new_df_g.name = 'C_count'
# new_df_g.reset_index()


A    B      C        
bar  one     0.332875    1
     three   0.269852    1
     two     0.881340    1
foo  one    -0.823885    1
             0.162408    1
     three   0.049036    1
     two    -1.470511    1
             1.209252    1
Name: C, dtype: int64

In [109]:
df_g.groupby(['A','B'])[['C']].sum().reset_index()

Unnamed: 0,A,B,C
0,bar,one,1.429825
1,bar,three,0.137487
2,bar,two,-0.089975
3,foo,one,-0.245829
4,foo,three,-1.093053
5,foo,two,2.171548


### 8. Reshaping

```
reshape(-1)
stack / unstack 
pivot table
```

In [48]:
df_g.set_index(['A','B'])[['C']]#.unstack()

# Index contains duplicate entries, cannot reshape : 'foo one' 이 중복된다. 
# df.drop_duplicates(subset=['index','column']) 으로 해결

Unnamed: 0_level_0,Unnamed: 1_level_0,C
A,B,Unnamed: 2_level_1
foo,one,0.091632
bar,one,-0.821188
foo,two,-0.142519
bar,three,-2.197673
foo,two,1.76174
bar,two,0.456469
foo,one,-0.480759
foo,three,1.980207


drop_duplicates

subset : 중복을 검사할 column 
keep: 중복값중 남길 행

df.drop ()
axis = 0/1 (행/열)     조건이 있는 경우 slicing 을 이용하자 

In [49]:
# Alternative: df_g2 = df_g.drop([6,2]).reset_index(drop=True)
# That drops the rows labelled 2 and 6, which also removes the repeated (A, B) pairs.
# NOTE: keep='first' below keeps the first row of each repeated pair, so with the
# df_g defined earlier it is rows 4 and 6 that are dropped, not 2 and 6.

df_g2 = df_g.drop_duplicates(subset=['A','B'],keep='first', ignore_index=True).copy()  # inplace=True would modify df_g itself instead

df_g2

Unnamed: 0,A,B,C,D
0,foo,one,0.091632,0.29466
1,bar,one,-0.821188,0.711229
2,foo,two,-0.142519,0.714619
3,bar,three,-2.197673,2.046995
4,bar,two,0.456469,1.471471
5,foo,three,1.980207,0.255492


In [52]:
df_g2.set_index(['A','B'])[['C']].unstack()

Unnamed: 0_level_0,C,C,C
B,one,three,two
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,-0.821188,-2.197673,0.456469
foo,0.091632,1.980207,-0.142519


In [344]:
df_g2.pivot(index='A',columns='B',values='C')

B,one,three,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.955413,-0.767378,1.233851
foo,0.020773,-1.128363,-0.070888


### 9. Time Series

In [102]:
# 100 timestamps spaced one second apart. The lowercase "s" alias is used
# because the uppercase "S" spelling is deprecated in recent pandas releases.
rng = pd.date_range("1/1/2012", periods=100, freq="s")

# One random integer in [0, 500) per timestamp.
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

# Downsample into 5-minute bins; all 100 seconds fall into a single bin.
ts.resample("5Min").sum()

2012-01-01    23746
Freq: 5T, dtype: int64

In [105]:
ts

2012-01-01 00:00:00    125
2012-01-01 00:00:01    437
2012-01-01 00:00:02    101
2012-01-01 00:00:03    389
2012-01-01 00:00:04    115
                      ... 
2012-01-01 00:01:35    310
2012-01-01 00:01:36    434
2012-01-01 00:01:37    298
2012-01-01 00:01:38    108
2012-01-01 00:01:39    153
Freq: S, Length: 100, dtype: int64

### 10. Categoricals

### 11. Plotting