## EDA를 위한 다양한 pandas function 연습  
---
참고링크 : https://pandas.pydata.org/docs/user_guide/10min.html  
한글 : https://dandyrilla.github.io/2017-08-12/pandas-10min/

API ref : https://pandas.pydata.org/docs/reference/index.html

In [4]:
import numpy as np
import pandas as pd

### 1. Series and DataFrame 

차이점을 기억해 두자  
Series : 1차원 배열  
DataFrame : 2차원 배열  

실제 data 특성에 맞는 샘플 데이터를 만들 수 있어야 한다.   
해당 샘플 데이터로 각 연산 결과를 미리 확인할 수 있어야 한다.   
print(type()) 을 이용하여 object 의 타입을 정확히 파악해야 한다.  
( series[index_label] vs dataframe[column_label] )  
answer.ipynb 와 test.ipynb, sample.ipynb 로 운영한다.  

In [5]:
# A Series is one-dimensional; the np.nan entry makes pandas store it as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# Six consecutive calendar days starting 2013-01-01 (daily frequency).
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6x4 frame of standard-normal draws: rows labelled by `dates`, columns A-D.
# The draw is unseeded, so the values differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.481825,-0.177666,-0.040761,-0.122394
2013-01-02,1.179803,-1.004881,-1.06893,1.292987
2013-01-03,-0.269763,-0.097249,0.941999,0.447319
2013-01-04,0.324949,0.07631,-1.278213,-0.448845
2013-01-05,-0.945565,0.021478,-0.974567,-0.702727
2013-01-06,0.461092,1.248673,-0.545249,1.214377


DataFrame :   
DataFrame 은 2차원 배열 데이터라고 이해하기보다, 공통 인덱스를 가진 column Series 들을 딕셔너리처럼 묶어 놓은 것으로 이해하는 편이 좋다.

In [8]:
# Build a frame from a mapping of column name -> values.
# Scalars (A, B, F) are repeated for every row; C, D and E each supply four
# values, so the frame has four rows. Every column keeps its own dtype.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

df2
# Equivalent ways to view the whole frame:
# df2[:]
# df2[:][:]
# df2.loc[:,:]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 2. Viewing Data

In [9]:
# Per-column dtypes of df2 (each column keeps its own type).
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [10]:
# Quick look at df: first two rows, last two rows, row labels, column labels.
# Each item is printed followed by a blank separator line.
for view in (df.head(2), df.tail(2), df.index, df.columns):
    print(view, '\n')

# Summary statistics per column (last expression, so the notebook renders it).
df.describe()


                   A         B         C         D
2013-01-01  0.481825 -0.177666 -0.040761 -0.122394
2013-01-02  1.179803 -1.004881 -1.068930  1.292987 

                   A         B         C         D
2013-01-05 -0.945565  0.021478 -0.974567 -0.702727
2013-01-06  0.461092  1.248673 -0.545249  1.214377 

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D') 

Index(['A', 'B', 'C', 'D'], dtype='object') 



Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.20539,0.011111,-0.494287,0.280119
std,0.72884,0.723261,0.830256,0.846887
min,-0.945565,-1.004881,-1.278213,-0.702727
25%,-0.121085,-0.157562,-1.045339,-0.367233
50%,0.39302,-0.037886,-0.759908,0.162462
75%,0.476642,0.062602,-0.166883,1.022612
max,1.179803,1.248673,0.941999,1.292987


In [11]:
# Transpose: swap rows and columns.

df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.481825,1.179803,-0.269763,0.324949,-0.945565,0.461092
B,-0.177666,-1.004881,-0.097249,0.07631,0.021478,1.248673
C,-0.040761,-1.06893,0.941999,-1.278213,-0.974567,-0.545249
D,-0.122394,1.292987,0.447319,-0.448845,-0.702727,1.214377


In [12]:
# Underlying values as a NumPy array (index and column labels are not included).
df.to_numpy()

array([[ 0.48182528, -0.17766561, -0.04076098, -0.12239447],
       [ 1.17980252, -1.00488069, -1.06892996,  1.29298729],
       [-0.26976253, -0.09724923,  0.94199918,  0.44731903],
       [ 0.32494868,  0.07630981, -1.27821347, -0.44884538],
       [-0.94556504,  0.02147769, -0.97456685, -0.70272695],
       [ 0.46109215,  1.24867267, -0.54524905,  1.21437685]])

In [13]:
# Keep only int64 columns. df was built from np.random.randn floats, so no
# column matches and the result has the index but no columns.
df.select_dtypes(include='int64')

2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06


### 3. Selection 
```
.             : df.col
[]            : df[column]         , df[row_int: ]  
loc (or at)   : df.loc[index]      , df.loc[ : ,col]  
iloc (or iat) : df.iloc[index_int] , df.iloc[ : , col_int]  
```

In [14]:
print(df['A'],'\n')
print(df[:]['A'],'\n')              # same as df.loc[:,'A']
# df['2013-01-01':]['A']            # chained indexing ?
# df.A

## Selecting a single column label returns a Series.


print(df[['A']],'\n')               # returns a DataFrame
df[['A','C']]                       # returns a DataFrame

## A list of labels (two or more, or a single one written as [['col']]) returns a DataFrame.


2013-01-01    0.481825
2013-01-02    1.179803
2013-01-03   -0.269763
2013-01-04    0.324949
2013-01-05   -0.945565
2013-01-06    0.461092
Freq: D, Name: A, dtype: float64 

2013-01-01    0.481825
2013-01-02    1.179803
2013-01-03   -0.269763
2013-01-04    0.324949
2013-01-05   -0.945565
2013-01-06    0.461092
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01  0.481825
2013-01-02  1.179803
2013-01-03 -0.269763
2013-01-04  0.324949
2013-01-05 -0.945565
2013-01-06  0.461092 



Unnamed: 0,A,C
2013-01-01,0.481825,-0.040761
2013-01-02,1.179803,-1.06893
2013-01-03,-0.269763,0.941999
2013-01-04,0.324949,-1.278213
2013-01-05,-0.945565,-0.974567
2013-01-06,0.461092,-0.545249


In [None]:
# Errors

# print(df[0],'\n')                # error: df[...] looks for the explicit label 0
# print(df['2013-01-01'],'\n')     # error: a single label in df[...] is looked up among the columns

In [16]:
# df[0:3]
# df['2013-01-01':'2013-01-03']
print(df['2013-01-04':], '\n')
print(df['2013-01-04':'2013-01-04'], '\n')     # label slice: the end label is included, so this is the single row 2013-01-04
print(df[3:4], '\n')                           # positional slice: the end is excluded, so only the row at position 3
df[3:]

# Slicing inside df[...] selects rows and returns a DataFrame.

                   A         B         C         D
2013-01-04  0.324949  0.076310 -1.278213 -0.448845
2013-01-05 -0.945565  0.021478 -0.974567 -0.702727
2013-01-06  0.461092  1.248673 -0.545249  1.214377 

                   A        B         C         D
2013-01-04  0.324949  0.07631 -1.278213 -0.448845 

                   A        B         C         D
2013-01-04  0.324949  0.07631 -1.278213 -0.448845 



Unnamed: 0,A,B,C,D
2013-01-04,0.324949,0.07631,-1.278213,-0.448845
2013-01-05,-0.945565,0.021478,-0.974567,-0.702727
2013-01-06,0.461092,1.248673,-0.545249,1.214377


In [17]:
# Show df2 again for reference.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [18]:
# Chained indexing?
print(df2[2:4]['A'],'\n')     # row slice, then column 'A' -> Series
print(df2[2:4][0:4], '\n')     # row slice of a row slice (only two rows remain, so both are returned)
df2[2:4][0:1]                 # from the frame made of rows 2 and 3, take the first row (position 0)

2    1.0
3    1.0
Name: A, dtype: float64 

     A          B    C  D      E    F
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo 



Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo


In [19]:
df2.iloc[0]     # select by position (implicit index): the row at position 0

A                    1.0
B    2013-01-02 00:00:00
C                    1.0
D                      3
E                   test
F                    foo
Name: 0, dtype: object

In [226]:
df2.loc[2]        # select by label (explicit index): the row whose label is 2

A                    1.0
B    2013-01-02 00:00:00
C                    1.0
D                      3
E                   test
F                    foo
Name: 2, dtype: object

Row

In [None]:
# Row indexing by row label (explicit: "which label") with .loc
# When the index holds integers, df.loc[index_int] works as well.
print(df.loc['20130101'],'\n')              # Series
print(df.loc['20130101':'20130102'],'\n')   # DataFrame
print(df.loc[['20130101','20130102']],'\n')

# Row indexing by row position (implicit: "which ordinal") with .iloc
print(df.iloc[0],'\n')                      # Series
print(df.iloc[0:1],'\n')                    # DataFrame
print(df.iloc[[0,1]],'\n')



A   -0.685896
B   -0.360552
C   -0.499072
D    0.878319
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 

A   -0.685896
B   -0.360552
C   -0.499072
D    0.878319
Name: 2013-01-01 00:00:00, dtype: float64 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319 

                   A         B         C         D
2013-01-01 -0.685896 -0.360552 -0.499072  0.878319
2013-01-02 -0.398987  1.285879 -0.099484 -0.588932 



Column

In [None]:

# Column indexing by column label
# print(df.loc['A'],'\n')      # fails: 'A' is treated as a row label
print(df.loc[:,'A'],'\n')
print(df.loc[:, ['A']],'\n')
print(df.loc[:,'A':'B'],'\n')
print(df.loc[:,['A','B']],'\n')


2013-01-01   -0.685896
2013-01-02   -0.398987
2013-01-03    0.076085
2013-01-04    1.463383
2013-01-05   -0.401422
2013-01-06   -2.052727
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01 -0.685896
2013-01-02 -0.398987
2013-01-03  0.076085
2013-01-04  1.463383
2013-01-05 -0.401422
2013-01-06 -2.052727 

                   A         B
2013-01-01 -0.685896 -0.360552
2013-01-02 -0.398987  1.285879
2013-01-03  0.076085 -1.526245
2013-01-04  1.463383 -0.564231
2013-01-05 -0.401422 -0.341176
2013-01-06 -2.052727 -1.029365 

                   A         B
2013-01-01 -0.685896 -0.360552
2013-01-02 -0.398987  1.285879
2013-01-03  0.076085 -1.526245
2013-01-04  1.463383 -0.564231
2013-01-05 -0.401422 -0.341176
2013-01-06 -2.052727 -1.029365 



In [22]:
# First rows of df (head() with no argument shows five).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.481825,-0.177666,-0.040761,-0.122394
2013-01-02,1.179803,-1.004881,-1.06893,1.292987
2013-01-03,-0.269763,-0.097249,0.941999,0.447319
2013-01-04,0.324949,0.07631,-1.278213,-0.448845
2013-01-05,-0.945565,0.021478,-0.974567,-0.702727


In [24]:

# Column indexing by column position ("which ordinal") with .iloc
print(df.iloc[0],'\n')          # a lone 0 is taken as a row position -> the row at position 0
print(df.iloc[:,0],'\n')        # column at position 0, returned as a Series
print(df.iloc[:,[0]])           # column at position 0, returned as a DataFrame


A    0.481825
B   -0.177666
C   -0.040761
D   -0.122394
Name: 2013-01-01 00:00:00, dtype: float64 

2013-01-01    0.481825
2013-01-02    1.179803
2013-01-03   -0.269763
2013-01-04    0.324949
2013-01-05   -0.945565
2013-01-06    0.461092
Freq: D, Name: A, dtype: float64 

                   A
2013-01-01  0.481825
2013-01-02  1.179803
2013-01-03 -0.269763
2013-01-04  0.324949
2013-01-05 -0.945565
2013-01-06  0.461092


In [None]:
# Positional column slice: columns at positions 0 and 1 (the end position is excluded).
print(df.iloc[:,0:2],'\n')


                   A         B
2013-01-01 -0.685896 -0.360552
2013-01-02 -0.398987  1.285879
2013-01-03  0.076085 -1.526245
2013-01-04  1.463383 -0.564231
2013-01-05 -0.401422 -0.341176
2013-01-06 -2.052727 -1.029365 



In [None]:
# Pick columns by a list of positions: 0, 1 and 3.
print(df.iloc[:,[0,1,3]],'\n')

                   A         B         D
2013-01-01 -0.685896 -0.360552  0.878319
2013-01-02 -0.398987  1.285879 -0.588932
2013-01-03  0.076085 -1.526245 -1.299276
2013-01-04  1.463383 -0.564231 -0.024509
2013-01-05 -0.401422 -0.341176  0.599668
2013-01-06 -2.052727 -1.029365 -1.724867 



Multi-axis indexing (row, column 동시 선택)

In [None]:
# Show df2 before selecting rows and columns together.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [None]:
# Label-based row slice 0..2 (the end label is included) together with column 'A'.
df2.loc[0:2,'A']

0    1.0
1    1.0
2    1.0
Name: A, dtype: float64

In [None]:
print(df2.loc[[0,1],['A','B']])    # here the integers are labels; df[[0,1]] is an error
print(df2.iloc[[0,1],[0,1]])       # the same rows and columns chosen by position

     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02
     A          B
0  1.0 2013-01-02
1  1.0 2013-01-02


In [None]:
# Show the current contents of df.
df

Unnamed: 0,A,B,C,D,sum_row,result
2013-01-01,-1.647909,1.898488,-0.558555,0.492113,0.736547,Over
2013-01-02,1.172068,0.054952,-1.439783,1.300393,4.35052,Over
2013-01-03,-1.315929,1.807103,2.320533,-1.186418,6.501154,Over
2013-01-04,-1.335727,-0.964605,0.480328,-0.29864,-8.47458,Under
2013-01-05,0.8375,0.197,2.186717,-1.292375,7.715364,Over
2013-01-06,1.30635,1.150093,-0.284214,0.646898,11.276512,Over


In [None]:
df[2:6][df[2:6].A < 0][1:2]

# From the rows at positions 2-5 of df, keep those whose A value is below 0, then take the second of them.

Unnamed: 0,A,B,C,D
2013-01-06,-2.052727,-1.029365,0.085973,-1.724867


In [None]:
print(df[2:6])
print(df[2:6].A<0)

# The frame and the boolean mask must have the same index; otherwise:
# IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

                   A         B         C         D
2013-01-03  0.076085 -1.526245 -0.330180 -1.299276
2013-01-04  1.463383 -0.564231 -0.308063 -0.024509
2013-01-05 -0.401422 -0.341176  1.233796  0.599668
2013-01-06 -2.052727 -1.029365  0.085973 -1.724867
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06     True
Freq: D, Name: A, dtype: bool


Boolean indexing

```(df[])[(df[]) 조건][column]```  
```(df[]).loc[ (df[]) 조건 , column ]```

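iloc 은 boolean Series 를 mask 로 직접 받지 못한다 (boolean list 나 NumPy 배열은 가능하다). Series 로 만든 조건은 `.to_numpy()` 로 변환해서 넘긴다.  
정수 index 를 가진 boolean Series 를 넘겼을 때의 오류 : iLocation based boolean indexing on an integer type is not available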

In [239]:
# Chained form: filter the rows where A > 0, then take column B.
df[df['A']>0]['B']

2013-01-01    0.240168
2013-01-02   -0.062544
2013-01-03    0.437352
Freq: D, Name: B, dtype: float64

In [238]:
# The same selection in a single .loc call: row mask first, column label second.
df.loc[df['A']>0,'B']


2013-01-01    0.240168
2013-01-02   -0.062544
2013-01-03    0.437352
Freq: D, Name: B, dtype: float64

In [None]:
# Boolean mask computed on the two-row slice df[1:3].
df[1:3].A > 0

2013-01-02    False
2013-01-03     True
Freq: D, Name: A, dtype: bool

In [None]:
df[1:3][df[1:3].A > 0]

# df[df[1:3].A > 0] raises an error: the index of df and the index of df[1:3] do not match.

Unnamed: 0,A,B,C,D
2013-01-03,0.076085,-1.526245,-0.33018,-1.299276


In [None]:
# Slice rows, filter them with a mask built on the same slice, then take column B.
df[1:3][df[1:3].A > 0]['B']
# Pieces of the expression above, step by step:
# df1 = df[1:3]
# df1.A > 0
# df1['B']

2013-01-03   -1.526245
Freq: D, Name: B, dtype: float64

Function application (함수 적용)

```
pipe()  : Tablewise Function Application
apply() : Row or Column-wise Function Application
agg() and transform() : Aggregation API
applymap() : Applying Elementwise Functions (pandas 2.1 부터는 DataFrame.map() 으로 이름이 바뀌었고 applymap() 은 deprecated)

cut()    : 조건에 따른 구간별 label
where()  : 조건에 따른 값표시 (아닌값은 NaN)
```

In [243]:
# Show df before deriving new columns from it.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.934018,0.240168,-0.85657,-1.013682
2013-01-02,0.745105,-0.062544,0.662765,-1.792722
2013-01-03,0.663529,0.437352,0.421723,0.032032
2013-01-04,-0.4668,-0.369758,1.25415,-0.115403
2013-01-05,-1.565627,1.063928,-0.144286,0.572975
2013-01-06,-0.544321,0.439658,-1.073815,0.210661


In [247]:
# Work on a copy so that df itself does not gain the extra column.
df3 = df.copy()
df3['sum_row'] = df3.apply(np.sum,axis=1)    # row-wise total; same as df3.sum(axis=1)
df3

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,1.934018,0.240168,-0.85657,-1.013682,0.303933
2013-01-02,0.745105,-0.062544,0.662765,-1.792722,-0.447396
2013-01-03,0.663529,0.437352,0.421723,0.032032,1.554636
2013-01-04,-0.4668,-0.369758,1.25415,-0.115403,0.30219
2013-01-05,-1.565627,1.063928,-0.144286,0.572975,-0.073009
2013-01-06,-0.544321,0.439658,-1.073815,0.210661,-0.967817


In [253]:
# Label each row "Over" when its sum_row is greater than 0, otherwise "Under".
df3["result"] = df3.apply(lambda r: "Over" if r.sum_row > 0 else "Under", axis=1)
df3

Unnamed: 0,A,B,C,D,sum_row,result
2013-01-01,1.934018,0.240168,-0.85657,-1.013682,0.303933,Over
2013-01-02,0.745105,-0.062544,0.662765,-1.792722,-0.447396,Under
2013-01-03,0.663529,0.437352,0.421723,0.032032,1.554636,Over
2013-01-04,-0.4668,-0.369758,1.25415,-0.115403,0.30219,Over
2013-01-05,-1.565627,1.063928,-0.144286,0.572975,-0.073009,Under
2013-01-06,-0.544321,0.439658,-1.073815,0.210661,-0.967817,Under


In [257]:
# bins=[-100,0,100]
# labels =['under','over']      # one fewer label than bin edges (one label per interval between edges)
pd.cut(df3.sum_row, bins=[-100,0,100], labels=['under','over'] )

2013-01-01     over
2013-01-02    under
2013-01-03     over
2013-01-04     over
2013-01-05    under
2013-01-06    under
Freq: D, Name: sum_row, dtype: category
Categories (2, object): ['under' < 'over']

In [254]:
# Remove the 'result' column; the returned frame is bound back to df3.
df3 = df3.drop(columns='result')
df3

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,1.934018,0.240168,-0.85657,-1.013682,0.303933
2013-01-02,0.745105,-0.062544,0.662765,-1.792722,-0.447396
2013-01-03,0.663529,0.437352,0.421723,0.032032,1.554636
2013-01-04,-0.4668,-0.369758,1.25415,-0.115403,0.30219
2013-01-05,-1.565627,1.063928,-0.144286,0.572975,-0.073009
2013-01-06,-0.544321,0.439658,-1.073815,0.210661,-0.967817


In [270]:
df3.where((df3.sum_row>0) & (df3.B>0))    # returns values rather than a boolean mask (rows failing the condition become NaN); dropna() can be chained

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,1.934018,0.240168,-0.85657,-1.013682,0.303933
2013-01-02,,,,,
2013-01-03,0.663529,0.437352,0.421723,0.032032,1.554636
2013-01-04,,,,,
2013-01-05,,,,,
2013-01-06,,,,,


In [276]:
df3[(df3.sum_row>0) & (df3.B>0)]

# Same rows as the where() result with dropna() applied.

Unnamed: 0,A,B,C,D,sum_row
2013-01-01,1.934018,0.240168,-0.85657,-1.013682,0.303933
2013-01-03,0.663529,0.437352,0.421723,0.032032,1.554636


### 4. Missing 

``` 
dropna()
fillna()
notnull()
notna()   
isnull()
isna() 
```

In [283]:
# Replace the NaN values produced by where() with 0.
df3.where((df3.sum_row>0) & (df3.B>0)).fillna(0)


Unnamed: 0,A,B,C,D,sum_row
2013-01-01,1.934018,0.240168,-0.85657,-1.013682,0.303933
2013-01-02,0.0,0.0,0.0,0.0,0.0
2013-01-03,0.663529,0.437352,0.421723,0.032032,1.554636
2013-01-04,0.0,0.0,0.0,0.0,0.0
2013-01-05,0.0,0.0,0.0,0.0,0.0
2013-01-06,0.0,0.0,0.0,0.0,0.0


### 5. Operations

```
sum, mean, std, var
sort, count
apply
str
```

In [284]:
print(df.sort_index())                                   # order rows by index label
print(df.sort_values('A'),'\n')                          # order rows by column A, ascending
# df.sort_values(by='A')
print(df.sort_values(by= ['A','B'],ascending=False))     # order by A then B, descending

                   A         B         C         D
2013-01-01  1.934018  0.240168 -0.856570 -1.013682
2013-01-02  0.745105 -0.062544  0.662765 -1.792722
2013-01-03  0.663529  0.437352  0.421723  0.032032
2013-01-04 -0.466800 -0.369758  1.254150 -0.115403
2013-01-05 -1.565627  1.063928 -0.144286  0.572975
2013-01-06 -0.544321  0.439658 -1.073815  0.210661
                   A         B         C         D
2013-01-05 -1.565627  1.063928 -0.144286  0.572975
2013-01-06 -0.544321  0.439658 -1.073815  0.210661
2013-01-04 -0.466800 -0.369758  1.254150 -0.115403
2013-01-03  0.663529  0.437352  0.421723  0.032032
2013-01-02  0.745105 -0.062544  0.662765 -1.792722
2013-01-01  1.934018  0.240168 -0.856570 -1.013682 

                   A         B         C         D
2013-01-01  1.934018  0.240168 -0.856570 -1.013682
2013-01-02  0.745105 -0.062544  0.662765 -1.792722
2013-01-03  0.663529  0.437352  0.421723  0.032032
2013-01-04 -0.466800 -0.369758  1.254150 -0.115403
2013-01-06 -0.544321  0.439658 -1.073815  0.210661
2013-01-05 -1.565627  1.063928 -0.144286  0.572975

In [285]:
print(df.sum())          # sum down the rows: one total per column (axis=0, the default)
print(df.sum(axis=1))    # sum across the columns: one total per row (axis=1)

A    0.765905
B    1.748804
C    0.263968
D   -2.106140
dtype: float64
2013-01-01    0.303933
2013-01-02   -0.447396
2013-01-03    1.554636
2013-01-04    0.302190
2013-01-05   -0.073009
2013-01-06   -0.967817
Freq: D, dtype: float64


### 6. Merging

```
concat
join
merge
```

### 7. Grouping

```
Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure

```

In [290]:
# Sample data for grouping: two key columns (A, B) and two columns of
# unseeded standard-normal draws (C, D), eight rows in total.
df_g = pd.DataFrame(
    dict(
        A=["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        B=["one", "one", "two", "three", "two", "two", "one", "three"],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)

df_g

Unnamed: 0,A,B,C,D
0,foo,one,0.020773,-0.47419
1,bar,one,-0.955413,0.290576
2,foo,two,-0.070888,0.993231
3,bar,three,-0.767378,0.195597
4,foo,two,1.346626,0.270152
5,bar,two,1.233851,-0.167259
6,foo,one,0.640297,0.560557
7,foo,three,-1.128363,-2.381524


In [311]:
# Sum of C for each (A, B) group; [['C']] keeps the result as a DataFrame.
df_g.groupby(['A','B'])[['C']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C
A,B,Unnamed: 2_level_1
bar,one,-0.955413
bar,three,-0.767378
bar,two,1.233851
foo,one,0.66107
foo,three,-1.128363
foo,two,1.275739


### 8. Reshaping

```
stack / unstack 
pivot table
```

In [340]:
df_g.set_index(['A','B'])[['C']]#.unstack()

# With .unstack(): "Index contains duplicate entries, cannot reshape" because 'foo one' occurs more than once.
# Resolved with df.drop_duplicates(subset=['index','column']).

Unnamed: 0_level_0,Unnamed: 1_level_0,C
A,B,Unnamed: 2_level_1
foo,one,0.020773
bar,one,-0.955413
foo,two,-0.070888
bar,three,-0.767378
foo,two,1.346626
bar,two,1.233851
foo,one,0.640297
foo,three,-1.128363


In [343]:
# df_g2 = df_g.drop([6,2]).reset_index(drop=True) 
# (manual alternative: delete the duplicated rows 2 and 6)

# Keep the first row of every (A, B) pair and renumber the index from 0.
df_g2 = df_g.drop_duplicates(subset=['A','B'],keep='first', ignore_index=True)  # with inplace = True the original is modified instead

df_g2

Unnamed: 0,A,B,C,D
0,foo,one,0.020773,-0.47419
1,bar,one,-0.955413,0.290576
2,foo,two,-0.070888,0.993231
3,bar,three,-0.767378,0.195597
4,bar,two,1.233851,-0.167259
5,foo,three,-1.128363,-2.381524


In [339]:
# With unique (A, B) pairs, unstack() moves index level B into the columns.
df_g2.set_index(['A','B'])[['C']].unstack()

Unnamed: 0_level_0,C,C,C
B,one,three,two
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,-0.955413,-0.767378,1.233851
foo,0.020773,-1.128363,-0.070888


In [344]:
# The same table via pivot: A as rows, B as columns, C as the cell values.
df_g2.pivot(index='A',columns='B',values='C')

B,one,three,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-0.955413,-0.767378,1.233851
foo,0.020773,-1.128363,-0.070888


### 9. Time Series

### 10. Categoricals

### 11. Plotting