In [1]:
#| code-summary: Load Packages
#| code-fold: true

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
import plotly.express as px

# statistics
import statsmodels.api as sm

In [2]:
#| echo: false
from IPython.display import display, HTML

# Emit a <style> rule that sets `flex-direction: row` on `.output` elements
# (presumably so that several outputs of one cell sit side by side -- confirm
# against the rendered notebook).
_row_layout_css = '<style>.output {flex-direction: row;}</style>'
HTML(_row_layout_css)

In [46]:
#| code-summary: Options
#| code-fold: true

# pandas display settings: two decimals for floats, at most 7 rows per repr
pd.set_option("display.precision", 2)
# undo with: pd.reset_option('display.float_format')
pd.set_option("display.float_format", '{:.2f}'.format)
pd.set_option("display.max_rows", 7)

# NumPy print settings: two decimals, no scientific notation for small values
np.set_printoptions(suppress=True, precision=2)

이번 장에서는 시각화에 앞서 데이터의 중요한 패턴을 살펴보기 위해, 새로운 변수를 만들거나 요약 통계치를 계산하는 데 필요한 핵심 함수들을 익힙니다.  
더 자세한 데이터 가공 방법은 추후에 다룰 예정입니다.

___

Data: *On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR) in 2013*

In [5]:
# Load the "flights" dataset of the R package "nycflights13" through
# statsmodels' R-datasets helper; the returned object carries both the
# DataFrame (.data) and its documentation (.__doc__).
flights_data = sm.datasets.get_rdataset("flights", "nycflights13")

# Work with a copy of the frame that leaves out the "time_hour" column.
flights = flights_data.data.drop(columns="time_hour")

In [6]:
#| output: false
# Print the documentation text that ships with the dataset
dataset_doc = flights_data.__doc__
print(dataset_doc)

.. container::

   flights R Documentation

   .. rubric:: Flights data
      :name: flights-data

   .. rubric:: Description
      :name: description

   On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR)
   in 2013.

   .. rubric:: Usage
      :name: usage

   ::

      flights

   .. rubric:: Format
      :name: format

   Data frame with columns

   year, month, day
      Date of departure.

   dep_time, arr_time
      Actual departure and arrival times (format HHMM or HMM), local tz.

   sched_dep_time, sched_arr_time
      Scheduled departure and arrival times (format HHMM or HMM), local
      tz.

   dep_delay, arr_delay
      Departure and arrival delays, in minutes. Negative times represent
      early departures/arrivals.

   carrier
      Two letter carrier abbreviation. See ``airlines`` to get name.

   flight
      Flight number.

   tailnum
      Plane tail number. See ``planes`` for additional metadata.

   origin, dest
      Origin and destination. Se

In [7]:
# Display the frame; the repr is truncated according to pd.options.display.max_rows
flights

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
0,2013,1,1,517.00,515,2.00,830.00,819,11.00,UA,1545,N14228,EWR,IAH,227.00,1400,5,15
1,2013,1,1,533.00,529,4.00,850.00,830,20.00,UA,1714,N24211,LGA,IAH,227.00,1416,5,29
2,2013,1,1,542.00,540,2.00,923.00,850,33.00,AA,1141,N619AA,JFK,MIA,160.00,1089,5,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59
336775,2013,9,30,,840,,,1020,,MQ,3531,N839MQ,LGA,RDU,,431,8,40


In [8]:
# Print a per-column summary of the frame: non-null counts and dtypes
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            336776 non-null  int64  
 1   month           336776 non-null  int64  
 2   day             336776 non-null  int64  
 3   dep_time        328521 non-null  float64
 4   sched_dep_time  336776 non-null  int64  
 5   dep_delay       328521 non-null  float64
 6   arr_time        328063 non-null  float64
 7   sched_arr_time  336776 non-null  int64  
 8   arr_delay       327346 non-null  float64
 9   carrier         336776 non-null  object 
 10  flight          336776 non-null  int64  
 11  tailnum         334264 non-null  object 
 12  origin          336776 non-null  object 
 13  dest            336776 non-null  object 
 14  air_time        327346 non-null  float64
 15  distance        336776 non-null  int64  
 16  hour            336776 non-null  int64  
 17  minute    

## Rows
### `query()`

> Conditional and logical operators  
>   `>`, `>=`, `<`, `<=`,  
>   `==` (equal to), `!=` (not equal to)  
>   `&` (and)  
>   `|` (or)  
>   `~` (not)  
>   `in` (is contained in)

In [9]:
# Keep only flights whose arrival delay exceeds 120 minutes (two hours)
flights.query("arr_delay > 120")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
119,2013,1,1,811.00,630,101.00,1047.00,830,137.00,MQ,4576,N531MQ,LGA,CLT,118.00,544,6,30
151,2013,1,1,848.00,1835,853.00,1001.00,1950,851.00,MQ,3944,N942MQ,JFK,BWI,41.00,184,18,35
218,2013,1,1,957.00,733,144.00,1056.00,853,123.00,UA,856,N534UA,EWR,BOS,37.00,200,7,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336724,2013,9,30,2053.00,1815,158.00,2310.00,2054,136.00,EV,5292,N600QX,EWR,ATL,91.00,746,18,15
336757,2013,9,30,2159.00,1845,194.00,2344.00,2030,194.00,9E,3320,N906XJ,JFK,BUF,50.00,301,18,45
336763,2013,9,30,2235.00,2001,154.00,59.00,2249,130.00,B6,1083,N804JB,JFK,MCO,123.00,944,20,1


In [31]:
# Same filter as the query() version, written as boolean indexing
late_mask = flights["arr_delay"] > 120
flights[late_mask]

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
119,2013,1,1,811.00,630,101.00,1047.00,830,137.00,MQ,4576,N531MQ,LGA,CLT,118.00,544,6,30
151,2013,1,1,848.00,1835,853.00,1001.00,1950,851.00,MQ,3944,N942MQ,JFK,BWI,41.00,184,18,35
218,2013,1,1,957.00,733,144.00,1056.00,853,123.00,UA,856,N534UA,EWR,BOS,37.00,200,7,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336724,2013,9,30,2053.00,1815,158.00,2310.00,2054,136.00,EV,5292,N600QX,EWR,ATL,91.00,746,18,15
336757,2013,9,30,2159.00,1845,194.00,2344.00,2030,194.00,9E,3320,N906XJ,JFK,BUF,50.00,301,18,45
336763,2013,9,30,2235.00,2001,154.00,59.00,2249,130.00,B6,1083,N804JB,JFK,MCO,123.00,944,20,1


::: {.callout-note}
위의 query 방식의 filtering은 다음과 같은 boolean indexing의 결과와 같음  
`flights[flights["arr_delay"] > 120]`
:::

In [10]:
# Flights with a departure date of January 1 (both conditions must hold)
flights.query("(month == 1) & (day == 1)")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
0,2013,1,1,517.00,515,2.00,830.00,819,11.00,UA,1545,N14228,EWR,IAH,227.00,1400,5,15
1,2013,1,1,533.00,529,4.00,850.00,830,20.00,UA,1714,N24211,LGA,IAH,227.00,1416,5,29
2,2013,1,1,542.00,540,2.00,923.00,850,33.00,AA,1141,N619AA,JFK,MIA,160.00,1089,5,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,2013,1,1,,1935,,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35
840,2013,1,1,,1500,,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0
841,2013,1,1,,600,,,901,,B6,125,N618JB,JFK,FLL,,1069,6,0


In [11]:
# Flights whose departure month is January or February (either condition)
flights.query("(month == 1) | (month == 2)")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
0,2013,1,1,517.00,515,2.00,830.00,819,11.00,UA,1545,N14228,EWR,IAH,227.00,1400,5,15
1,2013,1,1,533.00,529,4.00,850.00,830,20.00,UA,1714,N24211,LGA,IAH,227.00,1416,5,29
2,2013,1,1,542.00,540,2.00,923.00,850,33.00,AA,1141,N619AA,JFK,MIA,160.00,1089,5,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136244,2013,2,28,,1115,,,1310,,MQ,4485,N725MQ,LGA,CMH,,479,11,15
136245,2013,2,28,,830,,,1205,,UA,1480,,EWR,SFO,,2565,8,30
136246,2013,2,28,,840,,,1147,,UA,443,,JFK,LAX,,2475,8,40


In [12]:
# Membership test: a more compact way to pick January or February departures
flights.query("month in [1, 2]")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
0,2013,1,1,517.00,515,2.00,830.00,819,11.00,UA,1545,N14228,EWR,IAH,227.00,1400,5,15
1,2013,1,1,533.00,529,4.00,850.00,830,20.00,UA,1714,N24211,LGA,IAH,227.00,1416,5,29
2,2013,1,1,542.00,540,2.00,923.00,850,33.00,AA,1141,N619AA,JFK,MIA,160.00,1089,5,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136244,2013,2,28,,1115,,,1310,,MQ,4485,N725MQ,LGA,CMH,,479,11,15
136245,2013,2,28,,830,,,1205,,UA,1480,,EWR,SFO,,2565,8,30
136246,2013,2,28,,840,,,1147,,UA,443,,JFK,LAX,,2475,8,40


In [13]:
# Flights that arrived more than two hours late and whose origin is not JFK
flights.query('(arr_delay > 120) & ~(origin == "JFK")')

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
119,2013,1,1,811.00,630,101.00,1047.00,830,137.00,MQ,4576,N531MQ,LGA,CLT,118.00,544,6,30
218,2013,1,1,957.00,733,144.00,1056.00,853,123.00,UA,856,N534UA,EWR,BOS,37.00,200,7,33
268,2013,1,1,1114.00,900,134.00,1447.00,1222,145.00,UA,1086,N76502,LGA,IAH,248.00,1416,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336529,2013,9,30,1738.00,1529,129.00,1906.00,1649,137.00,EV,4580,N12563,EWR,MKE,110.00,725,15,29
336668,2013,9,30,1951.00,1649,182.00,2157.00,1903,174.00,EV,4294,N13988,EWR,SAV,95.00,708,16,49
336724,2013,9,30,2053.00,1815,158.00,2310.00,2054,136.00,EV,5292,N600QX,EWR,ATL,91.00,746,18,15


### `sort_values()`

In [47]:
# Sort by year, then month, day and dep_time, every key in descending order
# (sort_values defaults to ascending=True)
sort_keys = ["year", "month", "day", "dep_time"]
flights.sort_values(by=sort_keys, ascending=False)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
111279,2013,12,31,2356.00,2359,-3.00,436.00,445,-9.00,B6,745,N665JB,JFK,PSE,200.00,1617,23,59
111278,2013,12,31,2355.00,2359,-4.00,430.00,440,-10.00,B6,1503,N509JB,JFK,SJU,195.00,1598,23,59
111277,2013,12,31,2332.00,2245,47.00,58.00,3,55.00,B6,486,N334JB,JFK,ROC,60.00,264,22,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,2013,1,1,,1935,,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35
840,2013,1,1,,1500,,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0
841,2013,1,1,,600,,,901,,B6,125,N618JB,JFK,FLL,,1069,6,0


In [48]:
# Per-key direction: dep_time descending, arr_delay ascending
flights.sort_values(
    by=["dep_time", "arr_delay"],
    ascending=[False, True],
)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
150301,2013,3,15,2400.00,2359,1.00,324.00,338,-14.00,B6,727,N636JB,JFK,BQN,186.00,1576,23,59
87893,2013,12,5,2400.00,2359,1.00,427.00,440,-13.00,B6,1503,N587JB,JFK,SJU,182.00,1598,23,59
212941,2013,5,21,2400.00,2359,1.00,339.00,350,-11.00,B6,739,N527JB,JFK,PSE,199.00,1617,23,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59
336775,2013,9,30,,840,,,1020,,MQ,3531,N839MQ,LGA,RDU,,431,8,40


<br />
`query()`와 `sort_values()`를 함께 이용하여 좀 더 복잡한 문제를 해결할 수 있음  
예를 들어, 다음과 같이 거의 제시간에 출발한(±10분 이내) 항공편들 중 도착 지연이 가장 큰 항공편을 찾을 수 있음

In [49]:
# Among flights that departed within 10 minutes of schedule (early or late),
# order by arrival delay so the largest delays come first.
near_on_time = flights.query('(dep_delay >= -10) & (dep_delay <= 10)')
near_on_time.sort_values(by="arr_delay", ascending=False)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute
55985,2013,11,1,658.00,700,-2.00,1329.00,1015,194.00,VX,399,N629VA,JFK,LAX,336.00,2475,7,0
181270,2013,4,18,558.00,600,-2.00,1149.00,850,179.00,AA,707,N3EXAA,LGA,DFW,234.00,1389,6,0
256340,2013,7,7,1659.00,1700,-1.00,2050.00,1823,147.00,US,2183,N948UW,LGA,DCA,64.00,214,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334354,2013,9,28,847.00,839,8.00,1130.00,959,,EV,4510,N14542,EWR,MKE,,725,8,39
334412,2013,9,28,1010.00,1020,-10.00,1344.00,1222,,EV,4412,N12175,EWR,DSM,,1017,10,20
335805,2013,9,30,559.00,600,-1.00,,715,,WN,464,N411WN,EWR,MDW,,711,6,0


::: {.callout-note}
행을 재정렬하는 연산을 하면 index 순서도 함께 바뀌는데, index를 0부터 다시 매기려면 `ignore_index=True`를 지정함  
`.sort_values(..., ignore_index=True)`
:::

### `unique()`
Series method

In [17]:
# Distinct values of one column; the result is a NumPy array here, though the
# return type depends on the column's dtype
origin_col = flights["origin"]
origin_col.unique()

array(['EWR', 'LGA', 'JFK'], dtype=object)

In [32]:
# Every distinct (origin, dest) pair together with its number of flights;
# rows containing missing values are excluded by default (dropna=True)
route_cols = ["origin", "dest"]
flights[route_cols].value_counts()

origin  dest
JFK     LAX     11262
LGA     ATL     10263
        ORD      8857
                ...  
        LEX         1
JFK     MEM         1
        BHM         1
Length: 224, dtype: int64

In [50]:
#| layout-ncol: 3
# Turn the pair counts into a regular DataFrame with the count column named "n"
(
    flights[["origin", "dest"]]
    .value_counts()
    .reset_index(name="n")
)

Unnamed: 0,origin,dest,n
0,JFK,LAX,11262
1,LGA,ATL,10263
2,LGA,ORD,8857
...,...,...,...
221,LGA,LEX,1
222,JFK,MEM,1
223,JFK,BHM,1


In [51]:
# Equivalent spelling: count (origin, dest) pairs by naming the columns via `subset`
flights.value_counts(subset=["origin", "dest"])

origin  dest
JFK     LAX     11262
LGA     ATL     10263
        ORD      8857
                ...  
        LEX         1
JFK     MEM         1
        BHM         1
Length: 224, dtype: int64

## Columns
### `assign`

### select

index selection: reindex

### 

### `rename()`

• unique(): Returns the distinct values of the column.

• value_counts(): Returns a frequency table of the number of times each unique value in a given column appears, or, alternatively, the percentage of times each unique value appears when passed normalize=True.

• mode(): Returns the most common value of the column.

`isin()`

## Groups

`map`, `applymap`, `apply`