### EDA를 위한 다양한 pandas 함수 연습  
---
참고링크 : https://pandas.pydata.org/docs/user_guide/10min.html  
한글 : https://dandyrilla.github.io/2017-08-12/pandas-10min/

API ref : https://pandas.pydata.org/docs/reference/index.html

In [28]:
import numpy as np
import pandas as pd

Series 와 DataFrame 을 만들어 보고, 차이점을 기억해 두자.  
- Series : 1차원 배열  
- DataFrame : 2차원 배열

In [3]:
# Build a one-dimensional labelled array; the NaN entry marks a missing value.
raw_values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(raw_values)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Six consecutive daily timestamps starting at 2013-01-01.
dates = pd.date_range(start="2013-01-01", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Build a 6x4 frame of standard-normal samples, indexed by `dates`, with
# columns A-D.
# A seeded Generator is used so that every re-run of the notebook produces the
# same numbers; with an unseeded RNG the values printed by different cells
# stop matching each other as soon as this cell is re-executed.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.471062,1.023475,-0.870432,-2.020685
2013-01-02,-1.070431,-1.308385,1.994713,0.492073
2013-01-03,-0.280799,1.307437,1.334779,-0.338241
2013-01-04,-0.102879,-0.316482,-0.966446,1.186705
2013-01-05,-0.793463,0.297576,-0.562996,0.14607
2013-01-06,-0.419314,-0.979869,-0.480027,1.16388


DataFrame : DataFrame은 단순한 2차원 배열 데이터라고 이해하는 것보다, 공통 인덱스를 가진 column Series들을 딕셔너리로 묶어놓은 것이라고 이해하는 편이 좋다.

In [6]:
# Each dict entry becomes one column. Scalar entries ("A", "B", "F") are
# repeated for every row; "C", "D" and "E" each supply four values.
row_labels = list(range(4))
df2_columns = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=row_labels, dtype="float32"),
    "D": np.full(4, 3, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}
df2 = pd.DataFrame(df2_columns)

df2
# Alternative spellings that also show the whole frame:
# df2[:]
# df2[:][:]
# df2.loc[:,:]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# Show the dtype of every column; each column of a DataFrame keeps its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
# Quick look at the frame: first two rows, last two rows, the row index and
# the column labels, each followed by a blank line.
for summary in (df.head(2), df.tail(2), df.index, df.columns):
    print(summary, '\n')

# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()


                   A         B         C         D
2013-01-01  0.471062  1.023475 -0.870432 -2.020685
2013-01-02 -1.070431 -1.308385  1.994713  0.492073 

                   A         B         C        D
2013-01-05 -0.793463  0.297576 -0.562996  0.14607
2013-01-06 -0.419314 -0.979869 -0.480027  1.16388 

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D') 

Index(['A', 'B', 'C', 'D'], dtype='object') 



Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.365971,0.003959,0.074932,0.104967
std,0.540177,1.059955,1.262211,1.19648
min,-1.070431,-1.308385,-0.966446,-2.020685
25%,-0.699926,-0.814022,-0.793573,-0.217163
50%,-0.350057,-0.009453,-0.521511,0.319071
75%,-0.147359,0.842,0.881077,0.995928
max,0.471062,1.307437,1.994713,1.186705


In [9]:
# Swap rows and columns (transpose).

df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.471062,-1.070431,-0.280799,-0.102879,-0.793463,-0.419314
B,1.023475,-1.308385,1.307437,-0.316482,0.297576,-0.979869
C,-0.870432,1.994713,1.334779,-0.966446,-0.562996,-0.480027
D,-2.020685,0.492073,-0.338241,1.186705,0.14607,1.16388


In [10]:
# Convert the frame's values to a plain NumPy array (index and column labels are not included).
df.to_numpy()

array([[ 0.47106202,  1.02347461, -0.87043236, -2.02068497],
       [-1.07043124, -1.30838482,  1.99471345,  0.49207283],
       [-0.28079917,  1.30743681,  1.33477857, -0.33824067],
       [-0.10287882, -0.3164821 , -0.96644579,  1.18670545],
       [-0.79346272,  0.29757638, -0.56299563,  0.1460699 ],
       [-0.41931447, -0.97986866, -0.48002685,  1.16387964]])

### Selection
---
[]  
loc (or at)  
iloc (or iat)  

In [11]:
print(df['A'],'\n')
print(df[:]['A'],'\n')              # same result as df.loc[:, 'A']
# df['2013-01-01':]['A']            # chained indexing: row slice first, then column
# df.A                              # attribute-style access to column A

## A single column label inside [] selects that column and returns a Series.


print(df[['A']],'\n')               # one-element list of labels -> DataFrame
df[['A','C']]                       # list of labels -> DataFrame

## A list of labels (two or more, or a one-element list [['col']]) returns a DataFrame.


2013-01-01    0.471062
2013-01-02   -1.070431
2013-01-03   -0.280799
2013-01-04   -0.102879
2013-01-05   -0.793463
2013-01-06   -0.419314
Freq: D, Name: A, dtype: float64 

2013-01-01    0.471062
2013-01-02   -1.070431
2013-01-03   -0.280799
2013-01-04   -0.102879
2013-01-05   -0.793463
2013-01-06   -0.419314
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01  0.471062
2013-01-02 -1.070431
2013-01-03 -0.280799
2013-01-04 -0.102879
2013-01-05 -0.793463
2013-01-06 -0.419314 



Unnamed: 0,A,C
2013-01-01,0.471062,-0.870432
2013-01-02,-1.070431,1.994713
2013-01-03,-0.280799,1.334779
2013-01-04,-0.102879,-0.966446
2013-01-05,-0.793463,-0.562996
2013-01-06,-0.419314,-0.480027


In [12]:
# Intentional errors: plain [] with a scalar key looks up a COLUMN label, so an
# integer position or a row label is not found among the columns A-D.
# Each lookup is guarded separately so that the second one is still
# demonstrated after the first one fails, and the notebook keeps running.
# NOTE(review): older pandas versions treated a date string in [] as a row
# lookup; confirm the behaviour against the installed version.
for bad_key in (0, '2013-01-01'):
    try:
        print(df[bad_key], '\n')
    except KeyError as err:
        print("KeyError:", err, '\n')

KeyError: 0

In [None]:
# Equivalent row slices, by position and by label:
# df[0:3]
# df['2013-01-01':'2013-01-03']
print(df['2013-01-04':], '\n')
print(df['2013-01-04':'2013-01-04'], '\n')     # label slice: the end label is included, so this is the single row 2013-01-04
print(df[3:4], '\n')                           # positional slice: the end is excluded, so only the row at position 3
df[3:]

# A slice inside [] selects rows, and the result is a DataFrame.

                   A         B         C         D
2013-01-04  1.172455 -0.848995 -0.408783  1.283429
2013-01-05 -1.323479 -0.351281 -0.094185  1.492026
2013-01-06  0.146790  0.712378 -1.464708  0.742792 

                   A         B         C         D
2013-01-04  1.172455 -0.848995 -0.408783  1.283429 

                   A         B         C         D
2013-01-04  1.172455 -0.848995 -0.408783  1.283429 



Unnamed: 0,A,B,C,D
2013-01-04,1.172455,-0.848995,-0.408783,1.283429
2013-01-05,-1.323479,-0.351281,-0.094185,1.492026
2013-01-06,0.14679,0.712378,-1.464708,0.742792


In [None]:
# Chained indexing: the first [] slices rows, the second [] is applied to that result.
print(df2[2:4]['A'],'\n')      # rows at positions 2-3, then column A -> Series
print(df[2:4][0:4], '\n')      # rows at positions 2-3, then a second row slice of that result
df2[2:4][0:1]                  # rows at positions 2-3, then the first row of that result

2    1.0
3    1.0
Name: A, dtype: float64 

                   A         B         C         D
2013-01-03 -1.491181  1.049826 -0.386138 -0.212055
2013-01-04  1.172455 -0.848995 -0.408783  1.283429 



Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo


---

Row

In [None]:
# Select rows by row label with .loc
print(df.loc['20130101'],'\n')              # single label -> Series
print(df.loc['20130101':'20130102'],'\n')   # label slice -> DataFrame
print(df.loc[['20130101','20130102']],'\n') # list of labels -> DataFrame

# Select rows by integer position with .iloc
print(df.iloc[0],'\n')                      # single position -> Series
print(df.iloc[0:1],'\n')                    # position slice -> DataFrame
print(df.iloc[[0,1]],'\n')                  # list of positions -> DataFrame



A   -0.724787
B   -1.571578
C   -0.790210
D   -1.593020
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.724787 -1.571578 -0.790210 -1.593020
2013-01-02  0.628278 -0.008546  2.157422  1.173839 

                   A         B         C         D
2013-01-01 -0.724787 -1.571578 -0.790210 -1.593020
2013-01-02  0.628278 -0.008546  2.157422  1.173839 

A   -0.724787
B   -1.571578
C   -0.790210
D   -1.593020
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B        C        D
2013-01-01 -0.724787 -1.571578 -0.79021 -1.59302 

                   A         B         C         D
2013-01-01 -0.724787 -1.571578 -0.790210 -1.593020
2013-01-02  0.628278 -0.008546  2.157422  1.173839 



Column

In [None]:

# Select columns by column label with .loc (rows are given as ':' = all rows)
# print(df.loc['A'],'\n')      # not used: a single key is interpreted as a ROW label
print(df.loc[:,'A'],'\n')           # single label -> Series
print(df.loc[:, ['A']],'\n')        # one-element list -> DataFrame
print(df.loc[:,'A':'B'],'\n')       # label slice -> DataFrame
print(df.loc[:,['A','B']],'\n')     # list of labels -> DataFrame


2013-01-01   -0.724787
2013-01-02    0.628278
2013-01-03   -2.634815
2013-01-04    1.375928
2013-01-05    2.213788
2013-01-06   -0.713271
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.724787
2013-01-02  0.628278
2013-01-03 -2.634815
2013-01-04  1.375928
2013-01-05  2.213788
2013-01-06 -0.713271 

                   A         B
2013-01-01 -0.724787 -1.571578
2013-01-02  0.628278 -0.008546
2013-01-03 -2.634815  0.319209
2013-01-04  1.375928 -0.022579
2013-01-05  2.213788 -1.187252
2013-01-06 -0.713271 -0.177195 

                   A         B
2013-01-01 -0.724787 -1.571578
2013-01-02  0.628278 -0.008546
2013-01-03 -2.634815  0.319209
2013-01-04  1.375928 -0.022579
2013-01-05  2.213788 -1.187252
2013-01-06 -0.713271 -0.177195 



In [None]:

# Select columns by integer position with .iloc (rows are given as ':' = all rows)
# print(df.iloc[0],'\n')        # not used: a single integer is interpreted as a ROW position
print(df.iloc[:,0],'\n')        # single position -> Series
print(df.iloc[:,[0]])           # one-element list -> DataFrame


2013-01-01   -0.724787
2013-01-02    0.628278
2013-01-03   -2.634815
2013-01-04    1.375928
2013-01-05    2.213788
2013-01-06   -0.713271
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.724787
2013-01-02  0.628278
2013-01-03 -2.634815
2013-01-04  1.375928
2013-01-05  2.213788
2013-01-06 -0.713271


In [None]:
# Column position slice: the end is excluded, so this selects positions 0 and 1.
print(df.iloc[:,0:2],'\n')


                   A         B
2013-01-01 -0.724787 -1.571578
2013-01-02  0.628278 -0.008546
2013-01-03 -2.634815  0.319209
2013-01-04  1.375928 -0.022579
2013-01-05  2.213788 -1.187252
2013-01-06 -0.713271 -0.177195 



In [None]:
# List of column positions: picks positions 0, 1 and 3, skipping position 2.
print(df.iloc[:,[0,1,3]],'\n')

                   A         B         D
2013-01-01 -0.724787 -1.571578 -1.593020
2013-01-02  0.628278 -0.008546  1.173839
2013-01-03 -2.634815  0.319209  2.025052
2013-01-04  1.375928 -0.022579  0.800235
2013-01-05  2.213788 -1.187252 -0.007087
2013-01-06 -0.713271 -0.177195  1.057297 



Multi-indexing (행과 열을 동시에 선택)

In [None]:
# Display df2 again for reference.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [None]:
print(df2.loc[[0,1],['A','B']])    # .loc: the integers here are row LABELS, not positions
print(df2.iloc[[0,1],[0,1]])       # .iloc: rows and columns are both given by integer position

     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02
     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02


sort & sum

In [None]:
# Three orderings of the same frame:
#   by row index label, by column A, and by columns A then B in descending order.
by_index = df.sort_index()
by_a = df.sort_values(by='A')
by_a_then_b_desc = df.sort_values(by=['A', 'B'], ascending=False)

print(by_index)
print(by_a, '\n')
print(by_a_then_b_desc)


                   A         B         C         D
2013-01-01 -0.724787 -1.571578 -0.790210 -1.593020
2013-01-02  0.628278 -0.008546  2.157422  1.173839
2013-01-03 -2.634815  0.319209  0.449520  2.025052
2013-01-04  1.375928 -0.022579 -0.274962  0.800235
2013-01-05  2.213788 -1.187252 -0.664868 -0.007087
2013-01-06 -0.713271 -0.177195 -0.844971  1.057297
                   A         B         C         D
2013-01-03 -2.634815  0.319209  0.449520  2.025052
2013-01-01 -0.724787 -1.571578 -0.790210 -1.593020
2013-01-06 -0.713271 -0.177195 -0.844971  1.057297
2013-01-02  0.628278 -0.008546  2.157422  1.173839
2013-01-04  1.375928 -0.022579 -0.274962  0.800235
2013-01-05  2.213788 -1.187252 -0.664868 -0.007087 

                   A         B         C         D
2013-01-05  2.213788 -1.187252 -0.664868 -0.007087
2013-01-04  1.375928 -0.022579 -0.274962  0.800235
2013-01-02  0.628278 -0.008546  2.157422  1.173839
2013-01-06 -0.713271 -0.177195 -0.844971  1.057297
2013-01-01 -0.724787 -1.57157

In [None]:
# axis=0 (the default) adds down the rows, giving one total per column;
# axis=1 adds across the columns, giving one total per row.
column_totals = df.sum(axis=0)
row_totals = df.sum(axis=1)
print(column_totals)
print(row_totals)

A    0.145122
B   -2.647942
C    0.031932
D    3.456315
dtype: float64
2013-01-01   -4.679595
2013-01-02    3.950993
2013-01-03    0.158966
2013-01-04    1.878622
2013-01-05    0.354582
2013-01-06   -0.678140
Freq: D, dtype: float64
