### EDA를 위한 다양한 pandas function 연습  
---
참고링크 : https://pandas.pydata.org/docs/user_guide/10min.html  
한글 : https://dandyrilla.github.io/2017-08-12/pandas-10min/

API ref : https://pandas.pydata.org/docs/reference/index.html

In [1]:
import numpy as np
import pandas as pd

Series 와 DataFrame 을 만들어 보고, 차이점을 기억해 두자  
Series : 1차원 배열  
DataFrame : 2차원 배열

In [2]:
# Build a one-dimensional Series from a list; np.nan marks a missing entry.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive daily timestamps starting on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 DataFrame of random values, with `dates` as the row index and A-D as
# column labels. No seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.217713,0.81065,-0.43828,1.008279
2013-01-02,0.304833,0.101501,1.050351,0.999388
2013-01-03,-1.59816,1.083067,1.71849,-0.560146
2013-01-04,0.397187,0.491712,0.301143,1.564522
2013-01-05,1.440711,2.338933,0.338465,-1.508069
2013-01-06,-1.71078,0.001895,0.54342,-0.3684


DataFrame : 2차원 배열 데이터로 이해하기보다는, 공통 인덱스를 가진 column Series 들을 딕셔너리로 묶어 놓은 것으로 이해하는 편이 좋다.

In [5]:
# Build a DataFrame from a mapping of column name -> column data.
# Scalar values (A, B, F) are repeated for every row; the array-like
# columns (C, D, E) each hold 4 entries, so the frame has 4 rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

df2
# df2[:]
# df2[:][:]
# df2.loc[:,:]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# Show the dtype of every column; each column keeps its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
# First two rows.
print(df.head(2),'\n')

# Last two rows.
print(df.tail(2), '\n')

# Row labels (the DatetimeIndex).
print(df.index, '\n')

# Column labels.
print(df.columns, '\n')

# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()


                   A         B         C         D
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-02  0.304833  0.101501  1.050351  0.999388 

                   A         B         C         D
2013-01-05  1.440711  2.338933  0.338465 -1.508069
2013-01-06 -1.710780  0.001895  0.543420 -0.368400 

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D') 

Index(['A', 'B', 'C', 'D'], dtype='object') 



Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.230654,0.804626,0.585598,0.189262
std,1.227632,0.856544,0.733694,1.180844
min,-1.71078,0.001895,-0.43828,-1.508069
25%,-1.253049,0.199054,0.310473,-0.51221
50%,0.04356,0.651181,0.440942,0.315494
75%,0.374099,1.014963,0.923619,1.006056
max,1.440711,2.338933,1.71849,1.564522


In [8]:
# Transpose: swap rows and columns.

df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.217713,0.304833,-1.59816,0.397187,1.440711,-1.71078
B,0.81065,0.101501,1.083067,0.491712,2.338933,0.001895
C,-0.43828,1.050351,1.71849,0.301143,0.338465,0.54342
D,1.008279,0.999388,-0.560146,1.564522,-1.508069,-0.3684


In [9]:
# Convert the values to a plain NumPy array (index and column labels are not part of the result).
df.to_numpy()

array([[-2.17713166e-01,  8.10650282e-01, -4.38280011e-01,
         1.00827874e+00],
       [ 3.04833323e-01,  1.01501194e-01,  1.05035147e+00,
         9.99388499e-01],
       [-1.59816028e+00,  1.08306703e+00,  1.71849028e+00,
        -5.60146198e-01],
       [ 3.97187321e-01,  4.91712157e-01,  3.01142573e-01,
         1.56452229e+00],
       [ 1.44071116e+00,  2.33893334e+00,  3.38464991e-01,
        -1.50806861e+00],
       [-1.71077994e+00,  1.89458805e-03,  5.43419779e-01,
        -3.68399887e-01]])

### Selection
---
[column] , [ row : ]  
loc (or at)  loc[row] , loc [:,col]  
iloc (or iat)  iloc[row_int], iloc[:, col_int]  

In [10]:
print(df['A'],'\n')
print(df[:]['A'],'\n')              # same result as df.loc[:,'A']
# df['2013-01-01':]['A']            # chained indexing?
# df.A

## A single column label selects that column and returns a Series.


print(df[['A']],'\n')               # returns a DataFrame
df[['A','C']]                       # returns a DataFrame

## A list of labels (two or more, or a single one written as [['col']]) returns a DataFrame.


2013-01-01   -0.217713
2013-01-02    0.304833
2013-01-03   -1.598160
2013-01-04    0.397187
2013-01-05    1.440711
2013-01-06   -1.710780
Freq: D, Name: A, dtype: float64 

2013-01-01   -0.217713
2013-01-02    0.304833
2013-01-03   -1.598160
2013-01-04    0.397187
2013-01-05    1.440711
2013-01-06   -1.710780
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.217713
2013-01-02  0.304833
2013-01-03 -1.598160
2013-01-04  0.397187
2013-01-05  1.440711
2013-01-06 -1.710780 



Unnamed: 0,A,C
2013-01-01,-0.217713,-0.43828
2013-01-02,0.304833,1.050351
2013-01-03,-1.59816,1.71849
2013-01-04,0.397187,0.301143
2013-01-05,1.440711,0.338465
2013-01-06,-1.71078,0.54342


In [None]:
# Error demo: with a single scalar key, df[...] looks the key up among the
# COLUMN labels (A-D), so neither an integer position nor a row label works.
# Each lookup has its own try/except so that the second one is actually
# reached and the notebook can still be run from top to bottom.

for bad_key in (0, '2013-01-01'):
    try:
        print(df[bad_key],'\n')
    except KeyError as err:
        print('KeyError:', err, '\n')

In [12]:
# df[0:3]
# df['2013-01-01':'2013-01-03']
print(df['2013-01-04':], '\n')
print(df['2013-01-04':'2013-01-04'], '\n')     # label slice includes its end label -> the single row 2013-01-04
print(df[3:4], '\n')                           # positional slice excludes its end -> only the row at position 3
df[3:]

# A slice inside [] selects rows and returns a DataFrame.

                   A         B         C         D
2013-01-04  0.397187  0.491712  0.301143  1.564522
2013-01-05  1.440711  2.338933  0.338465 -1.508069
2013-01-06 -1.710780  0.001895  0.543420 -0.368400 

                   A         B         C         D
2013-01-04  0.397187  0.491712  0.301143  1.564522 

                   A         B         C         D
2013-01-04  0.397187  0.491712  0.301143  1.564522 



Unnamed: 0,A,B,C,D
2013-01-04,0.397187,0.491712,0.301143,1.564522
2013-01-05,1.440711,2.338933,0.338465,-1.508069
2013-01-06,-1.71078,0.001895,0.54342,-0.3684


In [13]:
# Display df2 again for reference.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [14]:
# Chained indexing: each [] is applied to the result of the previous one.
print(df2[2:4]['A'],'\n')     # row slice, then column 'A' -> Series
print(df[2:4][0:4], '\n')     # row slice of a row slice; only 2 rows remain, so [0:4] keeps both
df2[2:4][0:1]                 # from the rows at positions 2-3, keep the first one -> the row labelled 2

2    1.0
3    1.0
Name: A, dtype: float64 

                   A         B         C         D
2013-01-03 -1.598160  1.083067  1.718490 -0.560146
2013-01-04  0.397187  0.491712  0.301143  1.564522 



Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo


---

Row

In [15]:
# Index rows by row label (.loc)
print(df.loc['20130101'],'\n')              # single label -> Series
print(df.loc['20130101':'20130102'],'\n')   # label slice (end included) -> DataFrame
print(df.loc[['20130101','20130102']],'\n') # list of labels -> DataFrame

# Index rows by integer position (.iloc)
print(df.iloc[0],'\n')                      # single position -> Series
print(df.iloc[0:1],'\n')                    # position slice (end excluded) -> DataFrame
print(df.iloc[[0,1]],'\n')                  # list of positions -> DataFrame



A   -0.217713
B    0.810650
C   -0.438280
D    1.008279
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-02  0.304833  0.101501  1.050351  0.999388 

                   A         B         C         D
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-02  0.304833  0.101501  1.050351  0.999388 

A   -0.217713
B    0.810650
C   -0.438280
D    1.008279
Name: 2013-01-01 00:00:00, dtype: float64 

                   A        B        C         D
2013-01-01 -0.217713  0.81065 -0.43828  1.008279 

                   A         B         C         D
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-02  0.304833  0.101501  1.050351  0.999388 



Column

In [16]:

# Index columns by column label (.loc)
# print(df.loc['A'],'\n')      # a lone 'A' would be interpreted as a row label
print(df.loc[:,'A'],'\n')          # single label -> Series
print(df.loc[:, ['A']],'\n')       # one-element list -> DataFrame
print(df.loc[:,'A':'B'],'\n')      # label slice (end included) -> DataFrame
print(df.loc[:,['A','B']],'\n')    # list of labels -> DataFrame


2013-01-01   -0.217713
2013-01-02    0.304833
2013-01-03   -1.598160
2013-01-04    0.397187
2013-01-05    1.440711
2013-01-06   -1.710780
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.217713
2013-01-02  0.304833
2013-01-03 -1.598160
2013-01-04  0.397187
2013-01-05  1.440711
2013-01-06 -1.710780 

                   A         B
2013-01-01 -0.217713  0.810650
2013-01-02  0.304833  0.101501
2013-01-03 -1.598160  1.083067
2013-01-04  0.397187  0.491712
2013-01-05  1.440711  2.338933
2013-01-06 -1.710780  0.001895 

                   A         B
2013-01-01 -0.217713  0.810650
2013-01-02  0.304833  0.101501
2013-01-03 -1.598160  1.083067
2013-01-04  0.397187  0.491712
2013-01-05  1.440711  2.338933
2013-01-06 -1.710780  0.001895 



In [18]:

# Index columns by integer position (.iloc)
# print(df.iloc[0],'\n')        # a lone 0 is interpreted as a row position, so this would select a row
print(df.iloc[:,0],'\n')        # returns a Series
print(df.iloc[:,[0]])           # returns a DataFrame


2013-01-01   -0.217713
2013-01-02    0.304833
2013-01-03   -1.598160
2013-01-04    0.397187
2013-01-05    1.440711
2013-01-06   -1.710780
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.217713
2013-01-02  0.304833
2013-01-03 -1.598160
2013-01-04  0.397187
2013-01-05  1.440711
2013-01-06 -1.710780


In [19]:
# Position slice over columns: the end is excluded, so this selects columns 0 and 1 (A, B).
print(df.iloc[:,0:2],'\n')


                   A         B
2013-01-01 -0.217713  0.810650
2013-01-02  0.304833  0.101501
2013-01-03 -1.598160  1.083067
2013-01-04  0.397187  0.491712
2013-01-05  1.440711  2.338933
2013-01-06 -1.710780  0.001895 



In [20]:
# Pick arbitrary columns with a list of integer positions (here A, B, D).
print(df.iloc[:,[0,1,3]],'\n')

                   A         B         D
2013-01-01 -0.217713  0.810650  1.008279
2013-01-02  0.304833  0.101501  0.999388
2013-01-03 -1.598160  1.083067 -0.560146
2013-01-04  0.397187  0.491712  1.564522
2013-01-05  1.440711  2.338933 -1.508069
2013-01-06 -1.710780  0.001895 -0.368400 



Multi-indexing

In [21]:
# Display df2 again for reference.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [22]:
# Select rows and columns in one call.
print(df2.loc[[0,1],['A','B']])    # .loc: the integers here are row labels
print(df2.iloc[[0,1],[0,1]])       # .iloc: integer positions for both rows and columns

     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02
     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02


sort & sum

In [23]:
print(df.sort_index())                                  # sort rows by index label
print(df.sort_values('A'),'\n')                         # sort rows by column A, ascending
# df.sort_values(by='A')
print(df.sort_values(by= ['A','B'],ascending=False))    # sort by A, then B, both descending


                   A         B         C         D
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-02  0.304833  0.101501  1.050351  0.999388
2013-01-03 -1.598160  1.083067  1.718490 -0.560146
2013-01-04  0.397187  0.491712  0.301143  1.564522
2013-01-05  1.440711  2.338933  0.338465 -1.508069
2013-01-06 -1.710780  0.001895  0.543420 -0.368400
                   A         B         C         D
2013-01-06 -1.710780  0.001895  0.543420 -0.368400
2013-01-03 -1.598160  1.083067  1.718490 -0.560146
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-02  0.304833  0.101501  1.050351  0.999388
2013-01-04  0.397187  0.491712  0.301143  1.564522
2013-01-05  1.440711  2.338933  0.338465 -1.508069 

                   A         B         C         D
2013-01-05  1.440711  2.338933  0.338465 -1.508069
2013-01-04  0.397187  0.491712  0.301143  1.564522
2013-01-02  0.304833  0.101501  1.050351  0.999388
2013-01-01 -0.217713  0.810650 -0.438280  1.008279
2013-01-03 -1.598160  1.083067  1.718490 -0.560146
2013-01-06 -1.710780  0.001895  0.543420 -0.368400

In [24]:
print(df.sum())          # sum down the rows: one total per column (axis=0 is the default)
print(df.sum(axis=1))    # sum across the columns: one total per row (axis=1)

A   -1.383922
B    4.827759
C    3.513589
D    1.135575
dtype: float64
2013-01-01    1.162936
2013-01-02    2.456074
2013-01-03    0.643251
2013-01-04    2.754564
2013-01-05    2.610041
2013-01-06   -1.533865
Freq: D, dtype: float64
