## EDA을 위한 다양한 pandas function 연습  
---
참고링크 : https://pandas.pydata.org/docs/user_guide/10min.html  
한글 : https://dandyrilla.github.io/2017-08-12/pandas-10min/

API ref : https://pandas.pydata.org/docs/reference/index.html

In [2]:
import numpy as np
import pandas as pd

### series and dataFrame 

차이점을 기억해 두자  
series 1차 배열  
dataFrame 2차 배열  

실제 data 특성에 맞는 샘플 데이터를 만들수 있어야 한다.   
해당 샘플 데이터로 각 연산 결과를 미리 확인 할 수 있어야 한다.   
print(type()) 을 이용하여 object 의 타입을 정확히 파악해야 한다.  
( series[index_label] vs dataframe[column_label] )  
answer.ipynb 와 test.ipynb, sample.ipynb 으로 운영한다.  

In [3]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.685896,-0.360552,-0.499072,0.878319
2013-01-02,-0.398987,1.285879,-0.099484,-0.588932
2013-01-03,0.076085,-1.526245,-0.33018,-1.299276
2013-01-04,1.463383,-0.564231,-0.308063,-0.024509
2013-01-05,-0.401422,-0.341176,1.233796,0.599668
2013-01-06,-2.052727,-1.029365,0.085973,-1.724867


dataFrame :   
DataFrame은 2차원 배열 데이터라고 이해하는 것보다 공통 인덱스를 가진 column series를 딕셔너리로 묶어놓은 것

In [6]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

df2 
# df2[:]
# df2[:][:]
# df2.loc[:,:]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 2. Viewing Data

In [7]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
print(df.head(2),'\n')

print(df.tail(2), '\n')

print(df.index, '\n')

print(df.columns, '\n')

df.describe()


                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 

                   A         B         C         D
2013-01-05 -0.401422 -0.341176  1.233796  0.599668
2013-01-06 -2.052727 -1.029365  0.085973 -1.724867 

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D') 

Index(['A', 'B', 'C', 'D'], dtype='object') 



Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.333261,-0.422615,0.013828,-0.359933
std,1.138892,0.952432,0.631092,1.035805
min,-2.052727,-1.526245,-0.499072,-1.724867
25%,-0.614778,-0.913082,-0.32465,-1.12169
50%,-0.400204,-0.462392,-0.203773,-0.306721
75%,-0.042683,-0.34602,0.039609,0.443624
max,1.463383,1.285879,1.233796,0.878319


In [9]:
# 행, 열을 바꾼다. 

df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.685896,-0.398987,0.076085,1.463383,-0.401422,-2.052727
B,-0.360552,1.285879,-1.526245,-0.564231,-0.341176,-1.029365
C,-0.499072,-0.099484,-0.33018,-0.308063,1.233796,0.085973
D,0.878319,-0.588932,-1.299276,-0.024509,0.599668,-1.724867


In [10]:
df.to_numpy()

array([[-0.68589597, -0.36055244, -0.49907206,  0.87831914],
       [-0.39898654,  1.28587854, -0.09948391, -0.58893242],
       [ 0.07608466, -1.52624528, -0.33017973, -1.29927583],
       [ 1.46338318, -0.56423122, -0.30806275, -0.02450867],
       [-0.40142235, -0.34117579,  1.23379601,  0.59966796],
       [-2.05272712, -1.02936525,  0.08597291, -1.72486708]])

### 3. Selection 
```
.             : df.col
[]            : df[column]         , df[row_int: ]  
loc (or at)   : df.loc[index]      , df.loc[ : ,col]  
iloc (or iat) : df.iloc[index_int] , df.iloc[ : , col_int]  
```

In [11]:
print(df['A'],'\n')
print(df[:]['A'],'\n')              #df.loc[:,'A']
# df['2013-01-01':]['A']            # chained indexing ?
# df.A

## column 을 선택하게 되며 series 로 리턴된다.


print(df[['A']],'\n')               # df 으로 리턴 
df[['A','C']]                       # dataFrame 리턴

## 2개 이사, 이거나 [[ 'col' ]] 으로 1개 col 을 df 로 리터


2013-01-01   -0.685896
2013-01-02   -0.398987
2013-01-03    0.076085
2013-01-04    1.463383
2013-01-05   -0.401422
2013-01-06   -2.052727
Freq: D, Name: A, dtype: float64 

2013-01-01   -0.685896
2013-01-02   -0.398987
2013-01-03    0.076085
2013-01-04    1.463383
2013-01-05   -0.401422
2013-01-06   -2.052727
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.685896
2013-01-02 -0.398987
2013-01-03  0.076085
2013-01-04  1.463383
2013-01-05 -0.401422
2013-01-06 -2.052727 



Unnamed: 0,A,C
2013-01-01,-0.685896,-0.499072
2013-01-02,-0.398987,-0.099484
2013-01-03,0.076085,-0.33018
2013-01-04,1.463383,-0.308063
2013-01-05,-0.401422,1.233796
2013-01-06,-2.052727,0.085973


In [None]:
# 오류 

print(df[0],'\n')
print(df['2013-01-01'],'\n')

In [13]:
# df[0:3]
# df['2013-01-01':'2013-01-03']
print(df['2013-01-04':], '\n')
print(df['2013-01-04':'2013-01-04'], '\n')     # 3, 4 열, label 은 끝 포함
print(df[3:4], '\n')                           # 3열만, index 은 -1 까지
df[3:]

# row 을 선택하게 되며, dataFrame 으로 리턴된다

                   A         B         C         D
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509
2013-01-05 -0.401422 -0.341176  1.233796  0.599668
2013-01-06 -2.052727 -1.029365  0.085973 -1.724867 

                   A         B         C         D
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509 

                   A         B         C         D
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509 



Unnamed: 0,A,B,C,D
2013-01-04,1.463383,-0.564231,-0.308063,-0.024509
2013-01-05,-0.401422,-0.341176,1.233796,0.599668
2013-01-06,-2.052727,-1.029365,0.085973,-1.724867


In [14]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [52]:
#chained indexing?
print(df2[2:4]['A'],'\n')     # row / col
print(df2[2:4][0:4], '\n')     # row seriese 의 row serise
df2[2:4][0:1]                 # 2,3 row 로 만들어진 것에서 다시 0,1 row 첫번째 

2    1.0
3    1.0
Name: A, dtype: float64 

     A          B    C  D      E    F
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo 



Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo


Row

In [16]:
# row label 을 이용해서 row 을 인덱싱
print(df.loc['20130101'],'\n')              # series
print(df.loc['20130101':'20130102'],'\n')   # dataFrame
print(df.loc[['20130101','20130102']],'\n')

# row index 을 이용해서 row 인덱싱
print(df.iloc[0],'\n')                      # series
print(df.iloc[0:1],'\n')                    #dataFrame
print(df.iloc[[0,1]],'\n')



A   -0.685896
B   -0.360552
C   -0.499072
D    0.878319
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 

A   -0.685896
B   -0.360552
C   -0.499072
D    0.878319
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 



Column

In [17]:

# column lable 을 이용해서 column 인덱싱
# print(df.loc['A'],'\n')      # 'A' 을 row 인텍싱으로 생각해서 
print(df.loc[:,'A'],'\n')
print(df.loc[:, ['A']],'\n')
print(df.loc[:,'A':'B'],'\n')
print(df.loc[:,['A','B']],'\n')


2013-01-01   -0.685896
2013-01-02   -0.398987
2013-01-03    0.076085
2013-01-04    1.463383
2013-01-05   -0.401422
2013-01-06   -2.052727
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.685896
2013-01-02 -0.398987
2013-01-03  0.076085
2013-01-04  1.463383
2013-01-05 -0.401422
2013-01-06 -2.052727 

                   A         B
2013-01-01 -0.685896 -0.360552
2013-01-02 -0.398987  1.285879
2013-01-03  0.076085 -1.526245
2013-01-04  1.463383 -0.564231
2013-01-05 -0.401422 -0.341176
2013-01-06 -2.052727 -1.029365 

                   A         B
2013-01-01 -0.685896 -0.360552
2013-01-02 -0.398987  1.285879
2013-01-03  0.076085 -1.526245
2013-01-04  1.463383 -0.564231
2013-01-05 -0.401422 -0.341176
2013-01-06 -2.052727 -1.029365 



In [18]:

# column  index 을 이용해서 column 인덱싱
# print(df.iloc[0],'\n')        # 0 을 row 인텍싱으로 생각해서 row 인텍싱
print(df.iloc[:,0],'\n')        # series 리턴
print(df.iloc[:,[0]])           # df 리턴


2013-01-01   -0.685896
2013-01-02   -0.398987
2013-01-03    0.076085
2013-01-04    1.463383
2013-01-05   -0.401422
2013-01-06   -2.052727
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.685896
2013-01-02 -0.398987
2013-01-03  0.076085
2013-01-04  1.463383
2013-01-05 -0.401422
2013-01-06 -2.052727


In [19]:
print(df.iloc[:,0:2],'\n')


                   A         B
2013-01-01 -0.685896 -0.360552
2013-01-02 -0.398987  1.285879
2013-01-03  0.076085 -1.526245
2013-01-04  1.463383 -0.564231
2013-01-05 -0.401422 -0.341176
2013-01-06 -2.052727 -1.029365 



In [20]:
print(df.iloc[:,[0,1,3]],'\n')

                   A         B         D
2013-01-01 -0.685896 -0.360552  0.878319
2013-01-02 -0.398987  1.285879 -0.588932
2013-01-03  0.076085 -1.526245 -1.299276
2013-01-04  1.463383 -0.564231 -0.024509
2013-01-05 -0.401422 -0.341176  0.599668
2013-01-06 -2.052727 -1.029365 -1.724867 



Multi-indxing

In [21]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [22]:
print(df2.loc[[0,1],['A','B']])    # 숫자가 lable 이다 df[[0,1]] 은 오류
print(df2.iloc[[0,1],[0,1]])

     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02
     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02


In [67]:
df[1:3][0:4]

Unnamed: 0,A,B,C,D
2013-01-02,-0.398987,1.285879,-0.099484,-0.588932
2013-01-03,0.076085,-1.526245,-0.33018,-1.299276


condition

In [23]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.878319
2013-01-02,,1.285879,,
2013-01-03,0.076085,,,
2013-01-04,1.463383,,,
2013-01-05,,,1.233796,0.599668
2013-01-06,,,0.085973,


In [35]:
df[1:3].A > 0

2013-01-02    False
2013-01-03     True
Freq: D, Name: A, dtype: bool

In [61]:
df[1:3][df[1:3].A > 0]

# df[df[1:3].A > 0]  은 오류가 난다.  df 의 index 와 df[1:3] 의 index 가 서로 맞지않다.

Unnamed: 0,A,B,C,D
2013-01-03,0.076085,-1.526245,-0.33018,-1.299276


### 4. Missing Data

### 5. Operations

### 6. Merging

sort & sum

In [24]:
print(df.sort_index())
print(df.sort_values('A'),'\n')
# df.sort_values(by='A')
print(df.sort_values(by= ['A','B'],ascending=False))


                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932
2013-01-03  0.076085 -1.526245 -0.330180 -1.299276
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509
2013-01-05 -0.401422 -0.341176  1.233796  0.599668
2013-01-06 -2.052727 -1.029365  0.085973 -1.724867
                   A         B         C         D
2013-01-06 -2.052727 -1.029365  0.085973 -1.724867
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-05 -0.401422 -0.341176  1.233796  0.599668
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932
2013-01-03  0.076085 -1.526245 -0.330180 -1.299276
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509 

                   A         B         C         D
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509
2013-01-03  0.076085 -1.526245 -0.330180 -1.299276
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932
2013-01-05 -0.401422 -0.341176  1.233796  0.599668
2013-01-01 -0.685896 -0.36055

In [25]:
print(df.sum())          # row 들의 합 (col : 합) axis=0 디폴트
print(df.sum(axis=1))    # column 들의 함 (row : 합) axis=1

A   -1.999564
B   -2.535691
C    0.082970
D   -2.159597
dtype: float64
2013-01-01   -0.667201
2013-01-02    0.198476
2013-01-03   -3.079616
2013-01-04    0.566581
2013-01-05    1.090866
2013-01-06   -4.720987
Freq: D, dtype: float64
