<a href="https://colab.research.google.com/github/rldckd0103/pdm02/blob/master/py-pandas/pandas_2_handling_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Python module 3. **pandas**

# Using pandas

* [10 Minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)
* [Pandas tutorial with interactive exercises](https://www.kaggle.com/pistak/pandas-tutorial-with-interactive-exercises)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  # work for Jupyter notebook or lab



---



## [2] Handling DataFrame
- head()
- tail()
- describe()
- info()

In [2]:
# Build a DatetimeIndex of six consecutive calendar days starting 2020-09-28.
# It is used below as the row index of the example DataFrame.
dates = pd.date_range(start='2020-09-28', periods=6, freq='D')
dates

DatetimeIndex(['2020-09-28', '2020-09-29', '2020-09-30', '2020-10-01',
               '2020-10-02', '2020-10-03'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Build the example frame: 6 rows (one per entry in `dates`) x 4 columns A-D,
# filled with standard-normal samples.
# A seeded generator is used so that every Restart & Run All produces the same
# values; with an unseeded np.random.randn the outputs differ between runs.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
# head() with no argument shows the first 5 rows.
df.head()

Unnamed: 0,A,B,C,D
2020-09-28,-0.064682,1.797514,-1.626794,-0.101611
2020-09-29,-0.440401,-0.090169,0.358529,-0.276705
2020-09-30,-0.748833,0.848576,0.184475,1.044607
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-02,-0.421078,-0.03844,1.156815,1.663254


In [4]:
# head(n) limits the preview to the first n rows (here 2).
df.head(n=2)

Unnamed: 0,A,B,C,D
2020-09-28,-0.064682,1.797514,-1.626794,-0.101611
2020-09-29,-0.440401,-0.090169,0.358529,-0.276705


In [5]:
# tail(n) shows the last n rows (here 3).
df.tail(n=3)

Unnamed: 0,A,B,C,D
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-02,-0.421078,-0.03844,1.156815,1.663254
2020-10-03,1.307164,0.548673,-0.132844,-1.072158


In [6]:
# Display the row labels (index) of the DataFrame.
df.index

DatetimeIndex(['2020-09-28', '2020-09-29', '2020-09-30', '2020-10-01',
               '2020-10-02', '2020-10-03'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# info() prints a concise summary: index range, per-column non-null counts
# and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-09-28 to 2020-10-03
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [None]:
# describe() gives a quick statistical summary of each column:
# count, mean, std, min, quartiles (25%/50%/75%) and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.079644,-0.138168,0.46509,-0.323308
std,0.692649,0.703952,0.631826,1.354269
min,-1.234122,-0.912862,-0.42679,-1.923248
25%,-0.392637,-0.758391,0.011331,-1.020181
50%,0.145044,-0.154443,0.797605,-0.526307
75%,0.312614,0.439275,0.821175,0.024047
max,0.669561,0.715266,1.043851,2.013323


In [None]:
# Column labels of the frame, returned as an Index.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
# Transpose the frame: rows become columns and columns become rows.
# (df.T is the shorthand for the same operation.)
df.transpose()

Unnamed: 0,2020-09-28,2020-09-29,2020-09-30,2020-10-01,2020-10-02,2020-10-03
A,-0.540402,-1.234122,0.050658,0.239431,0.337008,0.669561
B,0.715266,-0.386537,0.077652,-0.912862,-0.882342,0.559816
C,1.043851,-0.250567,0.798188,0.797022,-0.42679,0.828837
D,-0.065435,-0.987179,0.053874,-1.923248,2.013323,-1.031182


In [None]:
# After transposing, the row index is made of the original column labels.
df.transpose().index

Index(['A', 'B', 'C', 'D'], dtype='object')

### Sorting

#### Sort by index
- sort_index(axis=0, ascending=False)
- sort_index(axis=1, ascending=False)

> `axis=0` sorts by the row labels (the index), so the rows are reordered (수직으로).

> `axis=1` sorts by the column labels, so the columns are reordered (수평으로).

In [None]:
# Show the original frame next to a copy whose rows are ordered by index
# label in descending order (axis='index' is the named form of axis=0).
(df, df.sort_index(axis='index', ascending=False))

(                   A         B         C         D
 2020-09-28 -0.540402  0.715266  1.043851 -0.065435
 2020-09-29 -1.234122 -0.386537 -0.250567 -0.987179
 2020-09-30  0.050658  0.077652  0.798188  0.053874
 2020-10-01  0.239431 -0.912862  0.797022 -1.923248
 2020-10-02  0.337008 -0.882342 -0.426790  2.013323
 2020-10-03  0.669561  0.559816  0.828837 -1.031182,
                    A         B         C         D
 2020-10-03  0.669561  0.559816  0.828837 -1.031182
 2020-10-02  0.337008 -0.882342 -0.426790  2.013323
 2020-10-01  0.239431 -0.912862  0.797022 -1.923248
 2020-09-30  0.050658  0.077652  0.798188  0.053874
 2020-09-29 -1.234122 -0.386537 -0.250567 -0.987179
 2020-09-28 -0.540402  0.715266  1.043851 -0.065435)

In [None]:
# Show the original frame next to a copy whose columns are ordered by
# column label in descending order (axis='columns' is the named form of axis=1).
(df, df.sort_index(axis='columns', ascending=False))

(                   A         B         C         D
 2020-09-28 -0.540402  0.715266  1.043851 -0.065435
 2020-09-29 -1.234122 -0.386537 -0.250567 -0.987179
 2020-09-30  0.050658  0.077652  0.798188  0.053874
 2020-10-01  0.239431 -0.912862  0.797022 -1.923248
 2020-10-02  0.337008 -0.882342 -0.426790  2.013323
 2020-10-03  0.669561  0.559816  0.828837 -1.031182,
                    D         C         B         A
 2020-09-28 -0.065435  1.043851  0.715266 -0.540402
 2020-09-29 -0.987179 -0.250567 -0.386537 -1.234122
 2020-09-30  0.053874  0.798188  0.077652  0.050658
 2020-10-01 -1.923248  0.797022 -0.912862  0.239431
 2020-10-02  2.013323 -0.426790 -0.882342  0.337008
 2020-10-03 -1.031182  0.828837  0.559816  0.669561)

#### Sort by value
- sort_values(by='column')

In [8]:
# Show the original frame next to a copy whose rows are ordered by the
# values in column B, largest first.
(df, df.sort_values('B', ascending=False))

(                   A         B         C         D
 2020-09-28 -0.064682  1.797514 -1.626794 -0.101611
 2020-09-29 -0.440401 -0.090169  0.358529 -0.276705
 2020-09-30 -0.748833  0.848576  0.184475  1.044607
 2020-10-01  0.213169  0.175949  0.403203  0.203424
 2020-10-02 -0.421078 -0.038440  1.156815  1.663254
 2020-10-03  1.307164  0.548673 -0.132844 -1.072158,
                    A         B         C         D
 2020-09-28 -0.064682  1.797514 -1.626794 -0.101611
 2020-09-30 -0.748833  0.848576  0.184475  1.044607
 2020-10-03  1.307164  0.548673 -0.132844 -1.072158
 2020-10-01  0.213169  0.175949  0.403203  0.203424
 2020-10-02 -0.421078 -0.038440  1.156815  1.663254
 2020-09-29 -0.440401 -0.090169  0.358529 -0.276705)

## indexing and slicing of DataFrame

#### Selecting data by indexing and slicing
- indexing
- slicing


In [None]:
# Show the whole frame for reference before indexing and slicing it.
df

Unnamed: 0,A,B,C,D
2020-09-28,-0.540402,0.715266,1.043851,-0.065435
2020-09-29,-1.234122,-0.386537,-0.250567,-0.987179
2020-09-30,0.050658,0.077652,0.798188,0.053874
2020-10-01,0.239431,-0.912862,0.797022,-1.923248
2020-10-02,0.337008,-0.882342,-0.42679,2.013323
2020-10-03,0.669561,0.559816,0.828837,-1.031182


In [9]:
# Selecting a single column with [] yields a Series that keeps the frame's
# index and is named after the column.
df['A']

2020-09-28   -0.064682
2020-09-29   -0.440401
2020-09-30   -0.748833
2020-10-01    0.213169
2020-10-02   -0.421078
2020-10-03    1.307164
Freq: D, Name: A, dtype: float64

### 열 A, B를 모두 선택하려면?

In [None]:
# Pass a list of labels to select several columns; the result is a DataFrame.
# df['A','B'] does not work: it is looked up as the single key ('A', 'B')
# and raises KeyError with these column labels.
df[['A','B']]

Unnamed: 0,A,B
2020-09-28,-0.540402,0.715266
2020-09-29,-1.234122,-0.386537
2020-09-30,0.050658,0.077652
2020-10-01,0.239431,-0.912862
2020-10-02,0.337008,-0.882342
2020-10-03,0.669561,0.559816


In [None]:
# Slicing with [] selects rows. An integer slice is positional and
# excludes the stop position, so 0:3 returns the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2020-09-28,-0.540402,0.715266,1.043851,-0.065435
2020-09-29,-1.234122,-0.386537,-0.250567,-0.987179
2020-09-30,0.050658,0.077652,0.798188,0.053874


In [None]:
# Slicing by label (here date strings) rather than by position includes
# both endpoints of the range.
df['20200928':'20201001']

Unnamed: 0,A,B,C,D
2020-09-28,-0.540402,0.715266,1.043851,-0.065435
2020-09-29,-1.234122,-0.386537,-0.250567,-0.987179
2020-09-30,0.050658,0.077652,0.798188,0.053874
2020-10-01,0.239431,-0.912862,0.797022,-1.923248


#### Selecting data by label

> **loc** selects by label; **iloc** (covered below) selects by integer position.


In [None]:
# Show the whole frame for reference before selecting by label.
df

Unnamed: 0,A,B,C,D
2020-09-28,-0.540402,0.715266,1.043851,-0.065435
2020-09-29,-1.234122,-0.386537,-0.250567,-0.987179
2020-09-30,0.050658,0.077652,0.798188,0.053874
2020-10-01,0.239431,-0.912862,0.797022,-1.923248
2020-10-02,0.337008,-0.882342,-0.42679,2.013323
2020-10-03,0.669561,0.559816,0.828837,-1.031182


In [10]:
# First entry of the date index: a Timestamp, used below as a row label.
dates[0]

Timestamp('2020-09-28 00:00:00', freq='D')

In [None]:
# loc[] with a single row label returns that row as a Series.
df.loc[dates[0]]

A   -0.540402
B    0.715266
C    1.043851
D   -0.065435
Name: 2020-09-28 00:00:00, dtype: float64

In [None]:
# Selecting on both axes by label: all rows (:) and columns A and B.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2020-09-28,-0.540402,0.715266
2020-09-29,-1.234122,-0.386537
2020-09-30,0.050658,0.077652
2020-10-01,0.239431,-0.912862
2020-10-02,0.337008,-0.882342
2020-10-03,0.669561,0.559816


#### [도전코딩]

> Select data for the first two days AND columns 3 and 4 (`C`, `D`) from df.

In [12]:
# Show the whole frame for reference before attempting the challenge.
df

Unnamed: 0,A,B,C,D
2020-09-28,-0.064682,1.797514,-1.626794,-0.101611
2020-09-29,-0.440401,-0.090169,0.358529,-0.276705
2020-09-30,-0.748833,0.848576,0.184475,1.044607
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-02,-0.421078,-0.03844,1.156815,1.663254
2020-10-03,1.307164,0.548673,-0.132844,-1.072158


In [11]:
# Alternatives considered:
#   df.loc[0:2, ['C','D']]
#       loc is label-based; integer positions are not labels of this
#       date index, so positions belong with iloc instead.
#   df.loc['20200928':'20200929', ['C','D']]
#       label slice by date string.
#       NOTE(review): expected to select the same two rows; confirm.
# Chosen form: the first two labels of `dates` as row keys, columns C and D.
df.loc[dates[:2],['C','D']]

Unnamed: 0,C,D
2020-09-28,-1.626794,-0.101611
2020-09-29,0.358529,-0.276705


#### Selecting data by position (`iloc[]`)
- index 사용

In [13]:
# Show the whole frame for reference before selecting by position.
df

Unnamed: 0,A,B,C,D
2020-09-28,-0.064682,1.797514,-1.626794,-0.101611
2020-09-29,-0.440401,-0.090169,0.358529,-0.276705
2020-09-30,-0.748833,0.848576,0.184475,1.044607
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-02,-0.421078,-0.03844,1.156815,1.663254
2020-10-03,1.307164,0.548673,-0.132844,-1.072158


In [14]:
# iloc[] with a single integer returns that row as a Series,
# i.e. one dimension fewer than the frame.
df.iloc[3]

A    0.213169
B    0.175949
C    0.403203
D    0.203424
Name: 2020-10-01 00:00:00, dtype: float64

In [16]:
# [Try again]
# Select the first two days AND columns 3 and 4 (C, D) from df,
# this time by integer position with iloc.
df.iloc[0:2, 2:4]

Unnamed: 0,C,D
2020-09-28,-1.626794,-0.101611
2020-09-29,0.358529,-0.276705


In [17]:
# Select a single scalar: row position 1, column position 1.
df.iloc[1,1]

-0.0901694706711372

#### Selecting data by Boolean indexing

In [18]:
# Show the whole frame for reference before filtering with boolean masks.
df

Unnamed: 0,A,B,C,D
2020-09-28,-0.064682,1.797514,-1.626794,-0.101611
2020-09-29,-0.440401,-0.090169,0.358529,-0.276705
2020-09-30,-0.748833,0.848576,0.184475,1.044607
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-02,-0.421078,-0.03844,1.156815,1.663254
2020-10-03,1.307164,0.548673,-0.132844,-1.072158


In [23]:
# Keep only the rows whose value in column A is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-03,1.307164,0.548673,-0.132844,-1.072158


In [20]:
# A boolean frame as the key keeps the original shape; cells where the
# condition is False are shown as NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2020-09-28,,1.797514,,
2020-09-29,,,0.358529,
2020-09-30,,0.848576,0.184475,1.044607
2020-10-01,0.213169,0.175949,0.403203,0.203424
2020-10-02,,,1.156815,1.663254
2020-10-03,1.307164,0.548673,,
