In [3]:
# 라이브러리를 불러옴
if (!require(rvest)) install.packages('rvest')
library(rvest)

if (!require(tidyverse)) install.packages('tidyverse')
library(tidyverse)


# 날짜와 페이지를 입력하면 조건에 따른 영화 코드를 반환하는 함수를 정의함
get_movie_code <- function(date, page){ # ex) date = 20220502, page = 1
    
    base_url <- 'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date='
    target_url <- paste0(base_url, date, '&page=', page)
    
    tables <- target_url %>%
        read_html(encoding = 'UTF-8') %>%
        html_nodes('table')
    
    hrefs <- tables[[1]] %>%
        html_nodes('a') %>%
        html_attr('href')
    
    hrefs_odd <- hrefs[c(TRUE, FALSE)] # 같은 코드이나 서로 다른링크가 짝으로 존재함을 확인했다. 홀수번째 원소만 인덱싱한다.
    codes <- substr(hrefs_odd, unlist(gregexpr('=', hrefs_odd)) + 1, nchar(hrefs_odd)) # 다섯자리 코드와 여섯자리 코드가 혼재하므로 등호를 기준으로 인덱싱한다.
    
    return(codes)
}

# 영화의 코드를 입력하면 영화의 정보를 반환하는 함수를 정의함
get_movie_info <- function(code){
    base_url <- 'https://movie.naver.com/movie/bi/mi/point.nhn?code='
    target_url <- paste0(base_url, code)
    html <- read_html(target_url)
    
    title_unclean <- html %>%
    html_nodes("title") %>%
    html_text()

    title <- substr(title_unclean, 1, unlist(gregexpr(' : 네이버 영화', title_unclean))-1)
    exist <- html %>% html_nodes('dl[class=info_spec]') %>% html_nodes('dt') %>% html_text()

    steps <- html %>%
        html_nodes('dl[class=info_spec]') %>%
        html_nodes('dd')

    # 개요, 감독, 출연, 등급 중 결측값이 존재하는 경우를 대비함
    step1 = NA
    step2 = NA
    step3 = NA
    step4 = NA
    for (i in 1:length(exist)){
        if (exist[i] == '개요()'){
            step1_unclean <- steps[i] %>%
                html_nodes('p') %>%
                    html_nodes('span') %>%
                html_text()
            step1 <- gsub('\\t|\\n|\\r', '', step1_unclean)
        }else if (exist[i] == '감독'){
            step2 <- steps[i] %>%
                html_nodes('p') %>%
                html_text()
        }else if (exist[i] == '출연'){
            step3 <- steps[i] %>%
                html_text()
        }else if (exist[i] == '등급'){
            step4_unclean <- steps[i] %>%
                html_nodes('p') %>%
                html_text()
            step4 <- gsub('\\t|\\n|\\r', '', step4_unclean)
        }
    }
    
    tdt <- html %>%
        html_nodes('div[class=viewing_graph]')
    
    # 성별, 나이별 관람추이가 존재하지 않는 경우 관람객 통계가 존재하지 않으므로 결측값으로 처리함
    if (length(tdt) == 0){
        audience_age_10 <- NA
        audience_age_20 <- NA
        audience_age_30 <- NA
        audience_age_40 <- NA
        audience_age_50 <- NA
        audience_score <- NA
        audience_count <- NA
        audience_male <- NA
        audience_female <- NA
        audience_10 <- NA
        audience_20 <- NA
        audience_30 <- NA
        audience_40 <- NA
        audience_50 <- NA
    } else {
        audi_age <- html %>%
            html_nodes('strong[class=graph_percent]') %>%
            html_text()
        audience_age_10 <- audi_age[1]
        audience_age_20 <- audi_age[2]
        audience_age_30 <- audi_age[3]
        audience_age_40 <- audi_age[4]
        audience_age_50 <- audi_age[5]

        audience_score <- html %>% 
            html_nodes('div[class=grade_audience]') %>%
            html_nodes('div[class=star_score]') %>%
            html_nodes('em') %>%
            html_text() %>% paste(collapse='')

        audience_count <- html %>% 
            html_nodes('div[class=grade_audience]') %>%
            html_nodes('span[class=user_count]') %>%
            html_nodes('em') %>%
            html_text() %>% paste(collapse='')
        
        audience_male <- (html %>%
            html_nodes('div[class=graph_area]') %>%
            html_nodes('div[class=grp_male]') %>%
            html_nodes('strong[class=graph_point]') %>%
            html_text())[2]

        audience_female <- (html %>%
            html_nodes('div[class=graph_area]') %>%
            html_nodes('div[class=grp_female]') %>%
            html_nodes('strong[class=graph_point]') %>%
            html_text())[2]

        audience_age <- html %>%
            html_nodes('div[class=grp_age]') %>%
            html_nodes('strong[class=graph_point]') %>%
            html_text()
        
        audience_10 <- audience_age[6]
        audience_20 <- audience_age[7]
        audience_30 <- audience_age[8]
        audience_40 <- audience_age[9]
        audience_50 <- audience_age[10]
        
    }
    
    netizen_score <- html %>% 
        html_nodes('div[class=grade_netizen]') %>%
        html_nodes('div[class=star_score]') %>%
        html_nodes('em') %>%
        html_text() %>% paste(collapse='')
    
    netizen_count <- html %>% 
        html_nodes('div[class=grade_netizen]') %>%
        html_nodes('span[class=user_count]') %>%
        html_nodes('em') %>%
        html_text() %>% paste(collapse='')
    
    ntz_male <- (html %>%
        html_nodes('div[class=graph_area]') %>%
        html_nodes('div[class=grp_male]') %>%
        html_nodes('strong[class=graph_point]') %>%
        html_text())[1]
    
    ntz_female <- (html %>%
        html_nodes('div[class=graph_area]') %>%
        html_nodes('div[class=grp_female]') %>%
        html_nodes('strong[class=graph_point]') %>%
        html_text())[1]
    
    ntz_age <- html %>%
        html_nodes('div[class=grp_age]') %>%
        html_nodes('strong[class=graph_point]') %>%
        html_text()
    ntz_10 <- ntz_age[1]
    ntz_20 <- ntz_age[2]
    ntz_30 <- ntz_age[3]
    ntz_40 <- ntz_age[4]
    ntz_50 <- ntz_age[5]
    
    return(c(title, code, step1, step2, step3, step4, audience_age_10, audience_age_20, audience_age_30,
     audience_age_40, audience_age_50, netizen_score, netizen_count, ntz_male, ntz_female,
      ntz_10, ntz_20, ntz_30, ntz_40, ntz_50, audience_score, audience_count, audience_male,
     audience_female, audience_10, audience_20, audience_30, audience_40, audience_50))
}

In [4]:
info <- vector('list', 100)
top_100_codes <- c(get_movie_code(20220502, 1), get_movie_code(20220502, 2))

In [6]:
get_movie_info(top_100_codes[1])

In [None]:
for (i in 1:length(top_100_codes)){ # 한 줄씩 차곡차곡 쌓는다.
    info[[i]] <- get_movie_info(top_100_codes[i])
}

In [22]:
for(i in 1:100){
    print(i)
    print(((length(info[[i]])==32)))
}

[1] 1
[1] TRUE
[1] 2
[1] TRUE
[1] 3
[1] TRUE
[1] 4
[1] TRUE
[1] 5
[1] TRUE
[1] 6
[1] TRUE
[1] 7
[1] TRUE
[1] 8
[1] TRUE
[1] 9
[1] TRUE
[1] 10
[1] TRUE
[1] 11
[1] FALSE
[1] 12
[1] TRUE
[1] 13
[1] TRUE
[1] 14
[1] TRUE
[1] 15
[1] TRUE
[1] 16
[1] TRUE
[1] 17
[1] TRUE
[1] 18
[1] TRUE
[1] 19
[1] TRUE
[1] 20
[1] TRUE
[1] 21
[1] TRUE
[1] 22
[1] TRUE
[1] 23
[1] TRUE
[1] 24
[1] TRUE
[1] 25
[1] TRUE
[1] 26
[1] TRUE
[1] 27
[1] TRUE
[1] 28
[1] TRUE
[1] 29
[1] TRUE
[1] 30
[1] TRUE
[1] 31
[1] TRUE
[1] 32
[1] TRUE
[1] 33
[1] TRUE
[1] 34
[1] TRUE
[1] 35
[1] TRUE
[1] 36
[1] TRUE
[1] 37
[1] TRUE
[1] 38
[1] TRUE
[1] 39
[1] TRUE
[1] 40
[1] TRUE
[1] 41
[1] FALSE
[1] 42
[1] TRUE
[1] 43
[1] TRUE
[1] 44
[1] TRUE
[1] 45
[1] TRUE
[1] 46
[1] TRUE
[1] 47
[1] TRUE
[1] 48
[1] TRUE
[1] 49
[1] TRUE
[1] 50
[1] TRUE
[1] 51
[1] TRUE
[1] 52
[1] TRUE
[1] 53
[1] TRUE
[1] 54
[1] TRUE
[1] 55
[1] TRUE
[1] 56
[1] TRUE
[1] 57
[1] TRUE
[1] 58
[1] TRUE
[1] 59
[1] TRUE
[1] 60
[1] TRUE
[1] 61
[1] TRUE
[1] 62
[1] TRUE
[1] 63
[1] TRUE

In [23]:
info[[41]]

In [21]:
info[[10]]

In [None]:
info

In [20]:
info[[11]]

In [2]:
final_info <- do.call('rbind', info)
# 컬럼명을 지정함
colnames(final_info) <- c("title","code","genre","country","runtime","release",
        "director","actor","view_class","audience_age_10","audience_age_20",
        "audience_age_30","audience_age_40","audience_age_50",
        "netizen_score","netizen_count","ntz_male","ntz_female","ntz_10",
        "ntz_20","ntz_30","ntz_40","ntz_50","audience_score",
        "audience_count","audience_male","audience_female","audience_10",
        "audience_20","audience_30","audience_40","audience_50")

write.csv(final_info, 'movie.csv', row.names=T) # 최종 csv파일 생성함

"number of columns of result is not a multiple of vector length (arg 11)"


In [None]:
info <- vector