# **Mengumpulkan dataset film dengan web-scraping**


## **1. Kode untuk Google Colab**


Install Selenium

In [None]:
# Install Chromium, chromedriver, and Selenium
!apt update
!apt install chromium-chromedriver
!pip install selenium

# import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Chrome so that Selenium drives it headless (no visible browser window);
# every flag below is passed to Chrome unchanged
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(chrome_flag)

In [None]:
# Install tqdm and import its notebook progress bar (wrapped around the page loop of the scraper)
! pip install tqdm
from tqdm.notebook import tqdm

Web-scrape

In [None]:
# Scrape the IMDb search results for feature films
# (URL filters: num_votes=10000, sorted by user rating, descending).
# Each film becomes one row in list_of_movies:
#   [title, year, runtime, genre list, rating, director, stars list, vote count]
driver = webdriver.Chrome(options=options)

list_of_movies = []
page = 1  # value of the `start` URL parameter; advanced by one page worth of films per iteration

# total number of films = 9370
# films per page = 50
# number of pages = ceil(9370 / 50) = 188; rounding down to 187 would skip the
# final, partially filled page (the last 20 films)
total_movies = 9370
movies_per_page = 50
total_pages = (total_movies + movies_per_page - 1) // movies_per_page
page_num = list(range(total_pages))

try:
  driver.get("https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&sort=user_rating,desc&ref_=adv_prv")

  for page_index in tqdm(page_num):
    # every film on the current page is a "lister-item" element
    movies = driver.find_elements(By.CLASS_NAME, "lister-item")

    for movie in movies:
      content = movie.find_element(By.CLASS_NAME, "lister-item-content")

      # title and year: only the last whitespace-separated token of the year
      # text is kept, with its surrounding parentheses stripped
      header = content.find_element(By.CLASS_NAME, "lister-item-header")
      title = header.find_element(By.TAG_NAME, "a").text
      date = header.find_element(By.CLASS_NAME, "lister-item-year").text.split()
      year = date[-1].strip("()")

      # the <p> elements of the entry hold the remaining fields
      data = content.find_elements(By.TAG_NAME, "p")

      # runtime (first token of the runtime text) and the list of genres
      runtime = data[0].find_element(By.CLASS_NAME, "runtime").text.split()
      genre = data[0].find_element(By.CLASS_NAME, "genre").text.split(", ")

      # rating
      rating = content.find_element(By.NAME, "ir").get_attribute("data-value")

      # director and cast
      # NOTE(review): the first link is taken as the director and every other
      # link as a star; presumably a film with several directors gets the extra
      # directors listed among the stars -- confirm against the page markup
      data2 = data[2].find_elements(By.TAG_NAME, "a")
      director = data2[0].text
      stars = [link.text for link in data2[1:]]

      # number of people who voted
      vote = data[3].find_element(By.NAME, "nv").get_attribute("data-value")

      data_movies = [title, year, runtime[0], genre, rating, director, stars, vote]
      list_of_movies.append(data_movies)

    # go to the next results page while a "next" link exists, otherwise stop early
    if driver.find_elements(By.CSS_SELECTOR, "a.next-page"):
      page += movies_per_page
      url = "https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&sort=user_rating,desc&start={}&ref_=adv_nxt".format(page)
      driver.get(url)
    else:
      break
finally:
  # always shut the browser down, even when scraping fails half-way,
  # so that no Chrome process is left running in the runtime
  driver.quit()

# **Membuat dataset dalam bentuk Pandas DataFrame**

In [None]:
import pandas as pd

In [None]:
# Turn the scraped rows into a DataFrame; the column names follow the order
# in which the values of each row were collected
column_names = ["Title", "Year", "Runtime", "Genre", "Rating", "Director", "Stars", "Vote"]
df = pd.DataFrame(data=list_of_movies, columns=column_names)

In [None]:
# Cast the scraped values to the numeric type each column should have
numeric_dtypes = {"Year": int, "Runtime": int, "Rating": float, "Vote": int}
for column, dtype in numeric_dtypes.items():
  df[column] = df[column].astype(dtype)
df

Impor-Ekspor dataset

In [None]:
# Mount Google Drive at the relative path 'drive'
from google.colab import drive
drive.mount('drive')

In [None]:
# Export the DataFrame to .csv, then copy the file into the Google Drive folder.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first (unnamed) column of the file.
df.to_csv('IMDb_Movie_Dataset.csv')
!cp IMDb_Movie_Dataset.csv "drive/My Drive/Colab Notebooks/"

In [None]:
# Load the dataset back from Google Drive.
# The export cell writes the DataFrame index as the first CSV column; index_col=0
# restores it as the index instead of loading it as a spurious "Unnamed: 0"
# data column (which would otherwise show up in df.info() / df.describe()).
# NOTE(review): the list-valued columns (Genre, Stars) presumably come back as
# plain strings after the CSV round trip -- parse them if lists are needed.
df = pd.read_csv('drive/My Drive/Colab Notebooks/IMDb_Movie_Dataset.csv', index_col=0)

# **Telaah Data**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Overview of the columns: dtypes and non-null counts
df.info()

In [None]:
# Descriptive statistics (pandas summarises the numeric columns by default)
df.describe()

In [None]:
# Distribution of films by release year: one bar per year
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=df, x='Year', ax=ax)
ax.set_title('Distribusi Film setiap tahun')
# turn the year labels vertical so that they do not overlap
plt.xticks(rotation=90)

In [None]:
# Film with the highest number of votes (most popular).
# idxmax returns the index label of the row, which .loc then uses to read
# the title and year of that same row.
popular = df['Vote'].idxmax()
print("Film yang paling populer yaitu {} yang dirilis pada tahun {}".format(df.loc[popular, 'Title'], df.loc[popular, 'Year']))

# Film with the lowest number of votes
unpopular = df['Vote'].idxmin()
print("Film yang kurang populer yaitu {} yang dirilis pada tahun {}".format(df.loc[unpopular, 'Title'], df.loc[unpopular, 'Year']))

In [None]:
# Film with the longest runtime: look the row up once, then report its title and runtime
longest = df['Runtime'].idxmax()
longest_film = df.loc[longest]
print("Film dengan durasi terpanjang adalah {} dengan durasi {} menit".format(longest_film['Title'], longest_film['Runtime']))

# Film with the shortest runtime
sortest = df['Runtime'].idxmin()
shortest_film = df.loc[sortest]
print("Film dengan durasi terpendek adalah {} dengan durasi {} menit".format(shortest_film['Title'], shortest_film['Runtime']))

In [None]:
# Number of films per director (value_counts sorts by count, highest first);
# the first ten entries are the ten directors with the most films
director_film = df['Director'].value_counts()
director_film.iloc[:10]