# Project Proposal

In [1]:
# set up
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

In [2]:
# reading in data
url <- "https://raw.githubusercontent.com/karlie-tr/dataset/main/Highest%20Holywood%20Grossing%20Movies.csv"
download.file(url, destfile = "highest-hollywood-grossing-movies.csv")
movies_data <- read_csv("highest-hollywood-grossing-movies.csv")

# remove white space in headers
colnames(movies_data) <- make.names(colnames(movies_data))
movies_data

“Missing column names filled in: 'X1' [1]”
Parsed with column specification:
cols(
  X1 = [32mcol_double()[39m,
  Title = [31mcol_character()[39m,
  `Movie Info` = [31mcol_character()[39m,
  Distributor = [31mcol_character()[39m,
  `Release Date` = [31mcol_character()[39m,
  `Domestic Sales (in $)` = [32mcol_double()[39m,
  `International Sales (in $)` = [32mcol_double()[39m,
  `World Sales (in $)` = [32mcol_double()[39m,
  Genre = [31mcol_character()[39m,
  `Movie Runtime` = [31mcol_character()[39m,
  License = [31mcol_character()[39m
)



X1,Title,Movie.Info,Distributor,Release.Date,Domestic.Sales..in...,International.Sales..in...,World.Sales..in...,Genre,Movie.Runtime,License
<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
0,Star Wars: Episode VII - The Force Awakens (2015),"As a new threat to the galaxy rises, Rey, a desert scavenger, and Finn, an ex-stormtrooper, must join Han Solo and Chewbacca to search for the one hope of restoring peace.",Walt Disney Studios Motion Pictures,"December 16, 2015",936662225,1132859475,2069521700,"['Action', 'Adventure', 'Sci-Fi']",2 hr 18 min,PG-13
1,Avengers: Endgame (2019),"After the devastating events of Avengers: Infinity War, the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.",Walt Disney Studios Motion Pictures,"April 24, 2019",858373000,1939128328,2797501328,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",3 hr 1 min,PG-13
2,Avatar (2009),A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home.,Twentieth Century Fox,"December 16, 2009",760507625,2086738578,2847246203,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2 hr 42 min,PG-13
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
915,Eat Pray Love (2010),"A married woman realizes how unhappy her marriage really is, and that her life needs to go in a different direction. After a painful divorce, she takes off on a round-the-world journey to ""find herself"".",Sony Pictures Entertainment (SPE),"August 13, 2010",80574010,124020006,204594016,"['Biography', 'Drama', 'Romance']",2 hr 13 min,PG-13
916,The Texas Chainsaw Massacre (2003),"After picking up a traumatized young hitchhiker, five friends find themselves stalked and hunted by a deformed chainsaw-wielding loon and his family of equally psychopathic killers.",New Line Cinema,"October 17, 2003",80571655,26792250,107363905,"['Crime', 'Horror']",1 hr 38 min,R
917,Zookeeper (2011),"A group of zoo animals decide to break their code of silence in order to help their lovable zookeeper find love, without opting to leave his current job for something more illustrious.",Sony Pictures Entertainment (SPE),"July 6, 2011",80360843,89491916,169852759,"['Comedy', 'Family', 'Fantasy', 'Romance']",1 hr 42 min,PG


In [3]:
# remove unusable columns
vars <- c("Title","Distributor","Release.Date","World.Sales..in...","Genre","Movie.Runtime") # vars that we want to keep
movies_data_filtered <- movies_data %>%
                        select(all_of(vars))

# change column name
colnames(movies_data_filtered) <- c("Title", "Distributor","Release_Date","World_Sales","Genre","Runtime")

# remove all white space in dataframe
movies_data_no_space <- as.data.frame(apply(movies_data_filtered, 2, str_remove_all, " "))

# separate release year from Release_Date
movies_release_date <- separate(movies_data_no_space,
                            col = Release_Date,
                            into = c("date", "Release_Year"),
                            sep = ","
                           )

# separate release year from movie title
movies_release_date <- separate(movies_release_date,
                            col = Title,
                            into = c("Title", "year"),
                            sep = "[()]"
                           )

# filter for movies released since 2010, remove NAs
movies_2010 <- movies_release_date %>%
                    select(-date, -year) %>%
                    filter(Release_Year >= 2010) %>%
                    drop_na()
      

movies_2010                 

“Expected 2 pieces. Additional pieces discarded in 918 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”


Title,Distributor,Release_Year,World_Sales,Genre,Runtime
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
StarWars:EpisodeVII-TheForceAwakens,WaltDisneyStudiosMotionPictures,2015,2069521700,"['Action','Adventure','Sci-Fi']",2hr18min
Avengers:Endgame,WaltDisneyStudiosMotionPictures,2019,2797501328,"['Action','Adventure','Drama','Sci-Fi']",3hr1min
JurassicWorld,UniversalPictures,2015,1670516444,"['Action','Adventure','Sci-Fi']",2hr4min
⋮,⋮,⋮,⋮,⋮,⋮
Ted2,UniversalPictures,2015,215863606,['Comedy'],1hr55min
EatPrayLove,SonyPicturesEntertainment(SPE),2010,204594016,"['Biography','Drama','Romance']",2hr13min
Zookeeper,SonyPicturesEntertainment(SPE),2011,169852759,"['Comedy','Family','Fantasy','Romance']",1hr42min


In [4]:
# tidying genre column

genre_list <- movies_data %>%
    select(Genre) %>%
    # remove brackets, quotation marks, and spaces from string
    mutate(Genre = str_replace_all(string = Genre, pattern = regex("[]\\[\' ]"), replacement = "")) %>% 
    # change string into list
    mutate(Genre = as.list(strsplit(Genre, ","))) %>%
    # make a list of all unique values
    pull() %>%
    flatten() %>%
    unique()

genre_list

In [5]:
# tidying Runtime column
movies_2010_separate_runtime <- separate(movies_2010,
                                            col = Runtime,
                                            into = c("Runtime_hr", "Runtime_min"),
                                            sep = "hr"
                                         )
# remove min from Runtime_min
movies_2010_separate_minute <- separate(movies_2010_separate_runtime,
                                        col = Runtime_min,
                                        into = c("Runtime_min"),
                                        sep = "min"
                                       )
movies_2010_separate_minute

# change hr and time into numeric
numeric_duration <- movies_2010_separate_minute %>%
                    select(Runtime_hr, Runtime_min)

numeric_duration <- as.data.frame(apply(numeric_duration, 2, as.numeric)) %>%
                    mutate(Duration = Runtime_hr * 60 + Runtime_min)
numeric_duration




“Expected 1 pieces. Additional pieces discarded in 352 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”


Title,Distributor,Release_Year,World_Sales,Genre,Runtime_hr,Runtime_min
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
StarWars:EpisodeVII-TheForceAwakens,WaltDisneyStudiosMotionPictures,2015,2069521700,"['Action','Adventure','Sci-Fi']",2,18
Avengers:Endgame,WaltDisneyStudiosMotionPictures,2019,2797501328,"['Action','Adventure','Drama','Sci-Fi']",3,1
JurassicWorld,UniversalPictures,2015,1670516444,"['Action','Adventure','Sci-Fi']",2,4
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Ted2,UniversalPictures,2015,215863606,['Comedy'],1,55
EatPrayLove,SonyPicturesEntertainment(SPE),2010,204594016,"['Biography','Drama','Romance']",2,13
Zookeeper,SonyPicturesEntertainment(SPE),2011,169852759,"['Comedy','Family','Fantasy','Romance']",1,42


Runtime_hr,Runtime_min,Duration
<dbl>,<dbl>,<dbl>
2,18,138
3,1,181
2,4,124
⋮,⋮,⋮
1,55,115
2,13,133
1,42,102
