# R 데이터 처리 관련 함수
- tidyverse 패키지(dplyr)를 이용해 데이터를 처리한다. 아래 함수들은 모두 group_by()와 함께 사용할 수 있으며, 데이터프레임을 입력받아 데이터프레임을 반환한다.
  - 필터링 : filter()
  - 행 정렬 : arrange()
  - 변수 선택 : select()
  - 새로운 변수 생성 : mutate()
  - 값 요약 : summarize()
  

In [34]:
library(nycflights13)
library(tidyverse)
# v ggplot2 3.3.5     v purrr   0.3.4
# v tibble  3.1.5     v dplyr   1.0.6
# v tidyr   1.1.3     v stringr 1.4.0
# v readr   1.4.0     v forcats 0.5.1

# Work with the nycflights13::flights data set.
# Column type abbreviations: int = integer, dbl = double (real number),
# chr = character string, dttm = date-time.
flights %>% str()

# Per-column summary statistics; the NA (missing value) counts show up here.
flights %>% summary()

# Store the flights of January 1st, 2013.
jan1 <- filter(flights, year == 2013 & month == 1 & day == 1)

# Keep only flights whose departure month is November or December.
nov_dec <- flights %>%
  filter(month %in% 11:12)

# Filtering with logical operators: flights delayed by at most 120 minutes on
# both arrival and departure. Rows where the condition evaluates to NA are
# dropped by filter(), exactly as with the negated-OR form of this condition.
test01 <- flights %>%
  filter(arr_delay <= 120 & dep_delay <= 120)

# The same kind of query written as a pipeline; show the first five rows.
flights %>%
  filter(year == 2013 & month == 1 & day == 1) %>%
  head(n = 5)

# Sorting rows (ascending example, left disabled):
# arrange(flights, year, month, day)

# Sort in descending order of arrival delay.
# NA (missing) values are always placed at the end.
flights %>%
  arrange(desc(arr_delay)) %>%
  head()

# Column selection: every column from year through day, inclusive.
flights %>%
  select(year:day) %>%
  head()

# Select everything except the year..day range.
flights %>%
  select(-(year:day)) %>%
  head()

# Helper functions that can be used when selecting columns:
# starts_with()
# contains()

# To change the column order, list the columns to move first and finish with
# everything().
flights %>%
  select(dep_time, everything()) %>%
  head()

# Create new columns; they are appended after the existing columns.
#   gain  = arrival delay minus departure delay
#   speed = distance / air_time * 60
#           (presumably miles per hour, assuming air_time is in minutes)
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60
  ) %>%
  head()

# Mean departure delay for each calendar day.

# Group the flights by date; by_day stays grouped by year, month and day.
by_day <- flights %>%
  group_by(year, month, day)

# Average the departure delay within each day, ignoring missing values.
by_day %>%
  summarize(delay = mean(dep_delay, na.rm = TRUE)) %>%
  head()

# The same kind of analysis written as a single pipeline.

delays <- flights %>%
  # One group per destination airport.
  group_by(dest) %>%
  # Per destination: number of flights, mean distance, mean arrival delay.
  summarize(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  # Drop noisy destinations (20 flights or fewer) and Honolulu airport.
  filter(count > 20 & dest != "HNL")

delays %>% head()



tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
 $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
 $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
 $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
 $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
 $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
 $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
 $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
 $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
 $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
 $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
 $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
 $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
 $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
 $ dest          : ch

      year          month             day           dep_time    sched_dep_time
 Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
 1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
 Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
 Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
 3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
 Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
                                                 NA's   :8255                 
   dep_delay          arr_time    sched_arr_time   arr_delay       
 Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
 1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
 Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
 Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
 3rd Qu.:  11.00   3rd Qu.:1

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
2013,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
2013,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
2013,1,1,544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
2013,1,1,554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,9,641,900,1301,1242,1530,1272,HA,51,N384HA,JFK,HNL,640,4983,9,0,2013-01-09 09:00:00
2013,6,15,1432,1935,1137,1607,2120,1127,MQ,3535,N504MQ,JFK,CMH,74,483,19,35,2013-06-15 19:00:00
2013,1,10,1121,1635,1126,1239,1810,1109,MQ,3695,N517MQ,EWR,ORD,111,719,16,35,2013-01-10 16:00:00
2013,9,20,1139,1845,1014,1457,2210,1007,AA,177,N338AA,JFK,SFO,354,2586,18,45,2013-09-20 18:00:00
2013,7,22,845,1600,1005,1044,1815,989,MQ,3075,N665MQ,JFK,CVG,96,589,16,0,2013-07-22 16:00:00
2013,4,10,1100,1900,960,1342,2211,931,DL,2391,N959DL,JFK,TPA,139,1005,19,0,2013-04-10 19:00:00


year,month,day
<int>,<int>,<int>
2013,1,1
2013,1,1
2013,1,1
2013,1,1
2013,1,1
2013,1,1


dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00
554,558,-4,740,728,12,UA,1696,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00


dep_time,year,month,day,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
517,2013,1,1,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
533,2013,1,1,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
542,2013,1,1,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
544,2013,1,1,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
554,2013,1,1,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00
554,2013,1,1,558,-4,740,728,12,UA,1696,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,...,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,gain,speed
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,...,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>,<dbl>,<dbl>
2013,1,1,517,515,2,830,819,11,UA,...,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00,9,370.0441
2013,1,1,533,529,4,850,830,20,UA,...,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00,16,374.2731
2013,1,1,542,540,2,923,850,33,AA,...,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00,31,408.375
2013,1,1,544,545,-1,1004,1022,-18,B6,...,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00,-17,516.7213
2013,1,1,554,600,-6,812,837,-25,DL,...,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00,-19,394.1379
2013,1,1,554,558,-4,740,728,12,UA,...,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00,16,287.6


`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.



year,month,day,delay
<int>,<int>,<int>,<dbl>
2013,1,1,11.548926
2013,1,2,13.858824
2013,1,3,10.987832
2013,1,4,8.951595
2013,1,5,5.732218
2013,1,6,7.148014


dest,count,dist,delay
<chr>,<int>,<dbl>,<dbl>
ABQ,254,1826.0,4.38189
ACK,265,199.0,4.852273
ALB,439,143.0,14.397129
ATL,17215,757.1082,11.300113
AUS,2439,1514.253,6.019909
AVL,275,583.5818,8.003831
