In [1]:
#| code-summary: Load Packages
#| code-fold: true

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
import plotly.express as px

# statistics
import statsmodels.api as sm

In [2]:
#| echo: false
from IPython.display import display, HTML
HTML('<style>.output {flex-direction: row;}</style>')

In [3]:
#| code-summary: Options
#| code-fold: true

# pandas options
pd.options.display.precision = 2
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7

# Numpy options
np.set_printoptions(precision = 2, suppress=True)

Data: *On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR) in 2013*

In [4]:
# import the dataset
flights_data = sm.datasets.get_rdataset("flights", "nycflights13")
flights = flights_data.data
flights = flights.drop(columns="time_hour") # drop the "time_hour" column

In [5]:
#| output: false
# Description
print(flights_data.__doc__)

.. container::

   flights R Documentation

   .. rubric:: Flights data
      :name: flights-data

   .. rubric:: Description
      :name: description

   On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR)
   in 2013.

   .. rubric:: Usage
      :name: usage

   ::

      flights

   .. rubric:: Format
      :name: format

   Data frame with columns

   year, month, day
      Date of departure.

   dep_time, arr_time
      Actual departure and arrival times (format HHMM or HMM), local tz.

   sched_dep_time, sched_arr_time
      Scheduled departure and arrival times (format HHMM or HMM), local
      tz.

   dep_delay, arr_delay
      Departure and arrival delays, in minutes. Negative times represent
      early departures/arrivals.

   carrier
      Two letter carrier abbreviation. See ``airlines`` to get name.

   flight
      Flight number.

   tailnum
      Plane tail number. See ``planes`` for additional metadata.

   origin, dest
      Origin and destination. Se

In [6]:
flights

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
0,2013,1,1,517.00,515,2.00,830.00,819,11.00,UA,1545,N14228,EWR,IAH,227.00,1400,5,15
1,2013,1,1,533.00,529,4.00,850.00,830,20.00,UA,1714,N24211,LGA,IAH,227.00,1416,5,29
2,2013,1,1,542.00,540,2.00,923.00,850,33.00,AA,1141,N619AA,JFK,MIA,160.00,1089,5,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59
336775,2013,9,30,,840,,,1020,,MQ,3531,N839MQ,LGA,RDU,,431,8,40


  <br />
  **Subsetting options**

- Bracket []
- Dot-notation .
- iloc
- loc

## Bracket [ ]

*Bracket안에 **labels**이 있는 경우 **columns**을 select*

- A single string: Series로 반환  
- A list of a single string: DataFrame으로 반환  
- A list of strings

In [7]:
flights['dest'] # return as a Series

0         IAH
1         IAH
2         MIA
         ... 
336773    BNA
336774    CLE
336775    RDU
Name: dest, Length: 336776, dtype: object

In [8]:
#| layout-ncol: 4
flights[['dest']] # return as a DataFrame

Unnamed: 0,dest
0,IAH
1,IAH
2,MIA
...,...
336773,BNA
336774,CLE
336775,RDU


In [9]:
#| layout-ncol: 3
flights[['origin', 'dest']]

Unnamed: 0,origin,dest
0,EWR,IAH
1,LGA,IAH
2,JFK,MIA
...,...,...
336773,LGA,BNA
336774,LGA,CLE
336775,LGA,RDU


*Bracket안에 **numbers**가 있는 경우 **rows**를 select - position-based*

- Slicing만 허용
- First index는 포함, last index는 제외
- [1, 5, 8]과 같이 특정 rows를 선택하는 것은 허용안됨

In [10]:
flights[2:5]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0


<br />
만약, 아래와 같이 index가 number일 때 out of order가 된 경우에도 row position으로 적용됨

In [11]:
#| echo: false
#| label: tbl-df
#| tbl-cap: 
#|  - df_outoforder
#|  - .
#|  - .
#| layout-ncol: 3
df_outoforder = flights[:50][["origin", "dest", "arr_delay"]].nlargest(5, "arr_delay")
df_outoforder

Unnamed: 0,origin,dest,arr_delay
42,LGA,DFW,48.0
2,JFK,MIA,33.0
25,EWR,ORD,32.0
14,LGA,DFW,31.0
33,EWR,MSP,29.0


In [12]:
#| layout-ncol: 3
df_outoforder[2:4]

Unnamed: 0,origin,dest,arr_delay
25,EWR,ORD,32.0
14,LGA,DFW,31.0


<br />
*Chaining with brackets*

In [13]:
#| layout-ncol: 4
flights[['origin', 'dest']][2:5]
# 순서 바꿔어도 동일: flights[2:5][['origin', 'dest']]

Unnamed: 0,origin,dest
2,JFK,MIA
3,JFK,BQN
4,LGA,ATL


## Dot notation .
편리하나 주의해서 사용할 필요가 있음  

::: {.callout-note}
- *space* 또는 *.* 이 있는 변수명 사용 불가
- methods와 동일한 이름의 변수명 사용 불가: 예) 변수명이 `count`인 경우 `df.count`는 `df`의 method로 인식
- 새로운 변수를 만들어 값을 assgin할 수 없음: 예) `df.new_var = 1` 불가, 대신 `df["new_var"] = 1`
- 만약, 다음과 같이 변수을 지정했을 때 `vars_names=["origin", "dest"]`,
  - `df[vars_names]`는 `"orign"`과 `"dest"` columns을 선택
  - `df.vars_names`는 `vars_names`이라는 이름의 column을 의미
:::

In [14]:
flights.dest # flihgts["dest"]와 동일

0         IAH
1         IAH
2         MIA
         ... 
336773    BNA
336774    CLE
336775    RDU
Name: dest, Length: 336776, dtype: object

## loc & iloc
각각 location, integer location의 약자  
`df.(i)loc[row_indexer, column_indexer]`

### loc: label-based indexing
- Index가 number인 경우도 label로 처리
- Slicing의 경우 first, last index 모두 inclusive

In [15]:
#| layout-ncol: 4
flights.loc[2:5, ['origin', 'dest']] # 2:5는 index의 label, not position

Unnamed: 0,origin,dest
2,JFK,MIA
3,JFK,BQN
4,LGA,ATL
5,EWR,ORD


다음과 같이 index가 labels인 경우는 혼동의 염려 없음

In [16]:
#| layout-ncol: 4
#| echo: false
df_labels = flights.loc[2:5, ['origin', 'dest']]
df_labels.index = pd.Index(["red", "blue", "green", "yellow"])
df_labels

Unnamed: 0,origin,dest
red,JFK,MIA
blue,JFK,BQN
green,LGA,ATL
yellow,EWR,ORD


In [17]:
#| layout-ncol: 4
df_labels.loc["blue":"green", :]

Unnamed: 0,origin,dest
blue,JFK,BQN
green,LGA,ATL


하지만, index가 number인 경우는 혼동이 있음  
앞서 본 예에서처럼 index가 out of order인 경우 loc은 다르게 작동

In [18]:
#| echo: false
#| label: tbl-df2
#| tbl-cap: 
#|  - df_outoforder
#|  - .
#|  - .
#| layout-ncol: 3
df_outoforder = flights[:50][["origin", "dest", "arr_delay"]].nlargest(5, "arr_delay")
df_outoforder

Unnamed: 0,origin,dest,arr_delay
42,LGA,DFW,48.0
2,JFK,MIA,33.0
25,EWR,ORD,32.0
14,LGA,DFW,31.0
33,EWR,MSP,29.0


In [19]:
#| layout-ncol: 3
df_outoforder.loc[2:14, :] # position 아님

Unnamed: 0,origin,dest,arr_delay
2,JFK,MIA,33.0
25,EWR,ORD,32.0
14,LGA,DFW,31.0


In [20]:
#| layout-ncol: 3
df_outoforder.loc[[25, 33], :] # slicing이 아닌 특정 index 선택

Unnamed: 0,origin,dest,arr_delay
25,EWR,ORD,32.0
33,EWR,MSP,29.0


In [21]:
flights.loc[2:5, 'dest'] # returns as a Series

2    MIA
3    BQN
4    ATL
5    ORD
Name: dest, dtype: object

In [22]:
#| layout-ncol: 5
flights.loc[2:5, ['dest']] # return as a DataFrame

Unnamed: 0,dest
2,MIA
3,BQN
4,ATL
5,ORD


In [85]:
flights.loc[2:5, :] # ':' means all
# 다음 모두 가능
# flights.loc[2:5]
# flights.loc[2:5, ]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0
5,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58


In [23]:
# select a single row
flights.loc[2, :] # returns as a Series, column names as its index

year        2013
month          1
day            1
            ... 
distance    1089
hour           5
minute        40
Name: 2, Length: 18, dtype: object

<br />

### iloc: position-based indexing
- Slicing의 경우 as usual: first index는 inclusive, last index는 exclusive

In [95]:
#| layout-ncol: 4
flights.iloc[2:5, 12:14] # 2:5는 index의 position, last index는 미포함

Unnamed: 0,origin,dest
2,JFK,MIA
3,JFK,BQN
4,LGA,ATL


In [100]:
flights.iloc[2:5, 12] # return as a Series

2    JFK
3    JFK
4    LGA
Name: origin, dtype: object

In [32]:
flights.iloc[2:5, :]
# 다음 모두 가능
# flights.iloc[2:5]
# flights.iloc[2:5, ]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0


In [99]:
#| layout-ncol: 4
flights.iloc[2:5, [12]] # return as a DataFrame

Unnamed: 0,origin
2,JFK
3,JFK
4,LGA


In [96]:
#| layout-ncol: 4
flights.iloc[[2, 5, 7], 12:14] # 특정 위치의 rows 선택

Unnamed: 0,origin,dest
2,JFK,MIA
5,EWR,ORD
7,LGA,IAD


## Series의 indexing
DataFrame과 같은 방식으로 이해

*Index가 numbers인 경우*

In [248]:
#| echo: false
s = df_outoforder["dest"]
s

42    DFW
2     MIA
25    ORD
14    DFW
33    MSP
Name: dest, dtype: object

In [249]:
s.loc[25:14]

25    ORD
14    DFW
Name: dest, dtype: object

In [250]:
s.iloc[2:4]

25    ORD
14    DFW
Name: dest, dtype: object

In [251]:
s[:3]

42    DFW
2     MIA
25    ORD
Name: dest, dtype: object

::: {.callout-note}
다음과 같은 경우 혼동스러움
```python
s[3] # 3번째? label 3?
```
#> errors occur
:::

<br />
*Index가 lables인 경우*

In [130]:
#| echo: false
s = df_labels["dest"]
s

red       MIA
blue      BQN
green     ATL
yellow    ORD
Name: dest, dtype: object

In [133]:
s[["red", "green"]]

red      MIA
green    ATL
Name: dest, dtype: object

## Boolean indexing

- Bracket [ ] 이나 loc을 이용
- iloc은 적용 안됨

### Bracket [ ]

In [253]:
#| layout-ncol: 2
np.random.seed(123)
flights_6 = flights[:100][["dep_delay", "arr_delay", "origin", "dest"]].sample(6)
flights_6

Unnamed: 0,dep_delay,arr_delay,origin,dest
8,-3.0,-8.0,JFK,MCO
70,9.0,20.0,LGA,ORD
82,-1.0,-26.0,JFK,SFO
28,0.0,-21.0,JFK,SJU
63,-2.0,2.0,JFK,LAX
0,2.0,11.0,EWR,IAH


In [266]:
#| layout-ncol: 2
flights_6[flights_6["dep_delay"] < 0]

Unnamed: 0,dep_delay,arr_delay,origin,dest,delayed
8,-3.0,-8.0,JFK,MCO,delayed
82,-1.0,-26.0,JFK,SFO,delayed
63,-2.0,2.0,JFK,LAX,delayed


In [255]:
idx = flights_6["dep_delay"] < 0
idx # bool type의 Series

8      True
70    False
82     True
28    False
63     True
0     False
Name: dep_delay, dtype: bool

In [256]:
# Select a column with the boolean indexing
flights_6[idx]["dest"]

8     MCO
82    SFO
63    LAX
Name: dest, dtype: object

::: {.callout-note}
사실, boolean indexing을 할때, DataFrame/Series의 index와 match함  
대부분 염려하지 않아도 되나 다음과 같은 결과 참고

```python
# Reset index
idx_reset = idx.reset_index(drop=True)
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: dep_delay, dtype: bool

flights_6[idx_reset]["dest"]
#> IndexingError: Unalignable boolean Series provided as indexer 
#> (index of the boolean Series and of the indexed object do not match)

# Index가 없는 numpy array로 boolean indexing을 하는 경우 문제없음
flights_6[idx_reset.to_numpy()]["dest"]
# 8     MCO
# 82    SFO
# 63    LAX
# Name: dest, dtype: object

```
:::

In [267]:
#| layout-ncol: 3
bool_idx = flights_6[["dep_delay", "arr_delay"]] > 0
bool_idx

Unnamed: 0,dep_delay,arr_delay
8,False,False
70,True,True
82,False,False
28,False,False
63,False,True
0,True,True


In [268]:
idx_any = bool_idx.any(axis=1)
idx_any

8     False
70     True
82    False
28    False
63     True
0      True
dtype: bool

### `np.where()` 활용
`np.where(`*boolean condition, value if True, value if False*`)`

In [269]:
flights_6["delayed"] = np.where(idx, "delayed", "on-time")
flights_6

Unnamed: 0,dep_delay,arr_delay,origin,dest,delayed
8,-3.0,-8.0,JFK,MCO,delayed
70,9.0,20.0,LGA,ORD,on-time
82,-1.0,-26.0,JFK,SFO,delayed
28,0.0,-21.0,JFK,SJU,on-time
63,-2.0,2.0,JFK,LAX,delayed
0,2.0,11.0,EWR,IAH,on-time


In [271]:
np.where(flights_6["dest"].str.startswith("S"), "S", "T") # str method: "S"로 시작하는지 여부

array(['T', 'T', 'S', 'S', 'T', 'T'], dtype='<U1')

In [263]:
bool_idx.all(axis=1)

8      True
70    False
82     True
28    False
63    False
0     False
dtype: bool

### loc

In [264]:
flights_6.loc[idx, "dest"] # flights_6[idx]["dest"]과 동일

8     MCO
82    SFO
63    LAX
Name: dest, dtype: object

만약 column 이름에 "time"을 포함하는 columns만 선택하고자 하면

> Series/Index object는 str method 존재  
> `str.contains()`, `str.startswith()`, `str.endswith()`
>
자세한 사항은 [7.4 String Manipulation/String Functions in pandas](https://wesmckinney.com/book/data-cleaning.html#text_string_manip_vectorized) by Wes McKinney

In [274]:
cols = flights.columns.str.contains("time") # str method: "time"을 포함하는지 여부
cols

array([False, False, False,  True,  True, False,  True,  True, False,
       False, False, False, False, False,  True, False, False, False])

In [275]:
# Columns 쪽으로 boolean indexing
flights.loc[:, cols]

Unnamed: 0,dep_time,sched_dep_time,arr_time,sched_arr_time,air_time
0,517.00,515,830.00,819,227.00
1,533.00,529,850.00,830,227.00
2,542.00,540,923.00,850,160.00
...,...,...,...,...,...
336773,,1210,,1330,
336774,,1159,,1344,
336775,,840,,1020,


::: {.callout-warning}

Chained indexing으로 값을 assign하는 경우 copy vs. view 경고 메세지

```python
flights[flights["arr_delay"] < 0]["arr_delay"] = 0
```
`/var/folders/mp/vcywncl97ml2q4c_5k2r573m0000gn/T/ipykernel_96692/3780864177.py:1`: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.  
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: `https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy`

<br />
경고가 제시하는데로 .loc을 이용하여 assign
```python
flights.loc[flights["arr_delay"] < 0, "arr_delay"] = 0
```

:::

## Summary

- Bracket [ ]의 경우
  - 간단히 columns을 선택하고자 할때 column labels: `df[["var1", "var2"]]`
  - 간단히 rows를 선택하고자 할때 numerical indexing: `df[:10]`
- Dot-notation은 
  - pandas의 methods와 중복된 이름을 피하고, 
  - assignment의 왼편에는 사용을 피할 것
- 가능하면 분명한 loc 또는 iloc을 사용
  - `loc[:, ["var1", "var2"]]`는 `df[["var1", "var2"]]`과 동일
  - `iloc[:10, :]`은 `df[:10]`와 동일
  - loc의 경우, index가 숫자라 할지라도 label로 처리됨
  - loc은 iloc과는 다른게 first, last index 모두 inclusive
- Boolean indexing의 경우 
  - Bracket [ ]: `df[bool_idx]`
  - loc: `df.loc[bool_idx, :]`
  - iloc 불가
- Assignment를 할때는, 
  - chained indexing을 피하고: `df[:5]["dest"]`
  - loc or iloc 사용: 
    - `df.loc[:4, "dest"]`: index가 0부터 정렬되어 있다고 가정했을 때, slicing에서 위치 하나 차이남
    - `df.iloc[:5, 13]`: "dest"의 column 위치 13

- 한 개의 column 혹은 row을 선택하면 Series로 반환: `df["var1"]` 또는 `df.loc[2, :]`