# **Load the tidyverse and rvest Libraries**

In [None]:
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [None]:
library(rvest)


Attaching package: ‘rvest’


The following object is masked from ‘package:readr’:

    guess_encoding




# **Import Data From IMDB Website**

In [None]:
# IMDb title-search URL; the query string asks for the "top_100" group
# sorted by user rating in descending order.
url <- "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"

In [None]:
# Fetch the search page and keep the text of every title header
# (rank, title and year arrive together in one string).
names <- html_text2(
  html_elements(read_html(url), "h3.lister-item-header")
)

In [None]:
# Fetch the search page, read the text of each rating element and
# convert it to a number.
ratings <- as.numeric(
  html_text2(
    html_elements(read_html(url), "div.ratings-imdb-rating")
  )
)

In [None]:
# Fetch the search page and keep the certificate text (e.g. "R", "PG-13")
# of each listed title.
movie_rate <- html_text2(
  html_elements(read_html(url), "span.certificate ")
)

In [None]:
# Fetch the search page and keep the runtime text (e.g. "142 min")
# of each listed title.
times <- html_text2(
  html_elements(read_html(url), "span.runtime ")
)


In [None]:
# Fetch the search page and keep the genre text (e.g. "Crime, Drama")
# of each listed title.
type <- html_text2(
  html_elements(read_html(url), "span.genre ")
)

In [None]:
# Fetch the search page and keep the combined votes line of each title,
# e.g. "Votes: 2,794,416 | Gross: $28.34M | Top 250: #1".
votes <- html_text2(
  html_elements(read_html(url), "p.sort-num_votes-visible ")
)

# **Create Dataframe**

In [None]:
# Combine the scraped vectors into one data frame, one row per title.
# data.frame() silently recycles a shorter vector whose length divides the
# longest one, which would misalign the fields if a selector matched fewer
# nodes than there are titles. Stop with a clear message unless every field
# has the same length as `names`.
if (any(
  lengths(list(ratings, movie_rate, times, type, votes)) != length(names)
)) {
  stop(
    "Scraped fields differ in length; cannot align them row by row.",
    call. = FALSE
  )
}
df_imdb <- data.frame(names, ratings, movie_rate, times, type, votes)
df_imdb

names,ratings,movie_rate,times,type,votes
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
1. The Shawshank Redemption (1994),9.3,R,142 min,Drama,"Votes: 2,794,416 | Gross: $28.34M | Top 250: #1"
2. The Godfather (1972),9.2,R,175 min,"Crime, Drama","Votes: 1,946,579 | Gross: $134.97M | Top 250: #2"
3. The Dark Knight (2008),9.0,PG-13,152 min,"Action, Crime, Drama","Votes: 2,775,029 | Gross: $534.86M | Top 250: #3"
4. Schindler's List (1993),9.0,R,195 min,"Biography, Drama, History","Votes: 1,405,120 | Gross: $96.90M | Top 250: #6"
5. The Lord of the Rings: The Return of the King (2003),9.0,PG-13,201 min,"Action, Adventure, Drama","Votes: 1,913,750 | Gross: $377.85M | Top 250: #7"
6. 12 Angry Men (1957),9.0,Approved,96 min,"Crime, Drama","Votes: 830,263 | Gross: $4.36M | Top 250: #5"
7. The Godfather Part II (1974),9.0,R,202 min,"Crime, Drama","Votes: 1,322,285 | Gross: $57.30M | Top 250: #4"
8. Pulp Fiction (1994),8.9,R,154 min,"Crime, Drama","Votes: 2,143,599 | Gross: $107.93M | Top 250: #8"
9. Spider-Man: Across the Spider-Verse (2023),8.8,PG,140 min,"Animation, Action, Adventure","Votes: 250,642 | Top 250: #19"
10. Inception (2010),8.8,PG-13,148 min,"Action, Adventure, Sci-Fi","Votes: 2,463,620 | Gross: $292.58M | Top 250: #14"


# **Clean Data**

In [None]:
# Break the combined "Votes | Gross | Top 250" string into three columns.
# Each piece is located by its label rather than by its position: some
# titles have no "Gross" entry, and a positional split then puts the
# "Top 250: #n" text into `gross` and leaves `top` as NA.
# `votes` is overwritten last so the two other extractions still see the
# full original string. Column order (votes, gross, top) and the
# "Label: value" format of each piece are unchanged.
df <- df_imdb %>%
  mutate(
    gross = str_extract(votes, "Gross: [^|]*") %>% str_trim(),
    top = str_extract(votes, "Top 250: [^|]*") %>% str_trim(),
    votes = str_extract(votes, "Votes: [^|]*") %>% str_trim()
  )

“Expected 3 pieces. Missing pieces filled with `NA` in 4 rows [9, 21, 32, 33].”


In [None]:
# A title without a "Gross" entry can end up with its "Top 250: #n" text
# in `gross` and NA in `top`. Locate such rows from the data instead of
# hard-coding row numbers and ranks, which go stale whenever the chart
# changes, then move the text into `top` and blank out `gross`.
# which() drops the NA results str_detect() gives for missing `gross`.
no_gross <- which(str_detect(df$gross, "^Top 250"))
df$top[no_gross] <- df$gross[no_gross]
df$gross[no_gross] <- NA

In [None]:
# Split "Votes: <count>" into its label (o_votes) and its value (n_votes).
df <- separate(
  df,
  col = votes,
  into = c("o_votes", "n_votes"),
  sep = "\\: "
)

In [None]:
# Split "Gross: <amount>" into its label (o_gross) and its value (n_gross).
df <- separate(
  df,
  col = gross,
  into = c("o_gross", "n_gross"),
  sep = "\\: "
)

In [None]:
# Split "<rank>. <title> (<year>)" into the rank (id) and the title (n_names).
# extra = "merge" limits the split to the first ". " only; with the default,
# a title that itself contains ". " (e.g. one starting with "Dr. ") would be
# cut off after its first piece.
df <- df %>%
  separate(
    names,
    into = c("id", "n_names"),
    sep = "\\. +",
    extra = "merge"
  )

In [None]:
# Store the rank as a number instead of text.
df[["id"]] <- as.numeric(df[["id"]])

In [None]:
# Drop the " min" suffix and store the runtime as a number.
df$times <- as.numeric(str_replace_all(df$times, " min", ""))

In [None]:
# Remove the thousands separators and store the vote count as a number.
df$n_votes <- as.numeric(str_replace_all(df$n_votes, ",", ""))

In [None]:
# Strip the "Top 250: #" prefix and store the remaining rank as a number.
df$top <- as.numeric(str_replace_all(df$top, "Top 250: #", ""))

In [None]:
# Drop the first character of every gross value (the leading "$")
# and preview the result.
sub <- df$n_gross %>%
  substring(2)
glimpse(sub)

 chr [1:50] "28.34M" "134.97M" "534.86M" "96.90M" "377.85M" "4.36M" ...


In [None]:
# Remove the "M" suffix so the gross (in millions) can be stored as a number.
sub <- as.numeric(str_replace_all(sub, "M", ""))
View(sub)

In [None]:
# Row index used as the join key for the cleaned gross values.
# Derived from the number of rows in `df` rather than a fixed 1:50, so the
# later data.frame(id, sub) still lines up if the scrape returns a
# different number of titles.
id <- seq_len(nrow(df))
View(id)

In [None]:
# Pair each row index with its numeric gross value for joining back onto df.
sub2 <- data.frame(id = id, sub = sub)
View(sub2)

id,sub
<int>,<dbl>
1,28.34
2,134.97
3,534.86
4,96.9
5,377.85
6,4.36
7,57.3
8,107.93
9,19.0
10,292.58


In [None]:
# Attach the numeric gross column to the cleaned data, matching rows on id.
new_df <- left_join(df, sub2, by = "id")
View(new_df)

id,n_names,ratings,movie_rate,times,type,o_votes,n_votes,o_gross,n_gross,top,sub
<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
1,The Shawshank Redemption (1994),9.3,R,142,Drama,Votes,2794416,Gross,$28.34M,1.0,28.34
2,The Godfather (1972),9.2,R,175,"Crime, Drama",Votes,1946579,Gross,$134.97M,2.0,134.97
3,The Dark Knight (2008),9.0,PG-13,152,"Action, Crime, Drama",Votes,2775029,Gross,$534.86M,3.0,534.86
4,Schindler's List (1993),9.0,R,195,"Biography, Drama, History",Votes,1405120,Gross,$96.90M,6.0,96.9
5,The Lord of the Rings: The Return of the King (2003),9.0,PG-13,201,"Action, Adventure, Drama",Votes,1913750,Gross,$377.85M,7.0,377.85
6,12 Angry Men (1957),9.0,Approved,96,"Crime, Drama",Votes,830263,Gross,$4.36M,5.0,4.36
7,The Godfather Part II (1974),9.0,R,202,"Crime, Drama",Votes,1322285,Gross,$57.30M,4.0,57.3
8,Pulp Fiction (1994),8.9,R,154,"Crime, Drama",Votes,2143599,Gross,$107.93M,8.0,107.93
9,Spider-Man: Across the Spider-Verse (2023),8.8,PG,140,"Animation, Action, Adventure",Votes,250642,Top 250,#19,,19.0
10,Inception (2010),8.8,PG-13,148,"Action, Adventure, Sci-Fi",Votes,2463620,Gross,$292.58M,14.0,292.58


# **Summary Data**

In [None]:
# Build the final summary table: give the kept columns their reporting
# names first, then select them in display order.
top_50 <- new_df %>%
  rename(
    name = n_names,
    runtimes_Minutes = times,
    gross_Millions = sub,
    range = top
  ) %>%
  select(id, name, ratings, type, runtimes_Minutes, gross_Millions, range)
View(top_50)

id,name,ratings,type,runtimes_Minutes,gross_Millions,range
<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1,The Shawshank Redemption (1994),9.3,Drama,142,28.34,1.0
2,The Godfather (1972),9.2,"Crime, Drama",175,134.97,2.0
3,The Dark Knight (2008),9.0,"Action, Crime, Drama",152,534.86,3.0
4,Schindler's List (1993),9.0,"Biography, Drama, History",195,96.9,6.0
5,The Lord of the Rings: The Return of the King (2003),9.0,"Action, Adventure, Drama",201,377.85,7.0
6,12 Angry Men (1957),9.0,"Crime, Drama",96,4.36,5.0
7,The Godfather Part II (1974),9.0,"Crime, Drama",202,57.3,4.0
8,Pulp Fiction (1994),8.9,"Crime, Drama",154,107.93,8.0
9,Spider-Man: Across the Spider-Verse (2023),8.8,"Animation, Action, Adventure",140,19.0,
10,Inception (2010),8.8,"Action, Adventure, Sci-Fi",148,292.58,14.0
