#### Importing the relevant libraries

In [21]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

#### Accessing each webpage and parsing its content with BeautifulSoup

In [22]:
# Five result pages of 200 titles each cover the full 1,000-title list
soups = []

# Search URL without the value of `start`, the 1-based position of the
# first title shown on a page
base_url = 'https://www.imdb.com/search/title/?count=200&groups=top_1000&sort=user_rating&start='

# Create one BeautifulSoup object per page
# start=1 / start=201 / ... / start=801
for page_num in range(5):
    url = base_url + str(page_num*200 + 1)
    # Without a timeout, a stalled connection would block this cell forever
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error status instead of parsing an error page
    # as if it were a page of results
    response.raise_for_status()
    soups.append(BeautifulSoup(response.text, 'html.parser'))

#### Grabbing the relevant information for each movie on each parsed page

In [23]:
# One list per attribute; position i in every list refers to the same movie
titles = []
ratings = []
genres = []
directors = []
castings = []  # never populated in this cell
imdb_ids = []
years = []
plots = []

for soup in soups:
    # Each movie on a results page sits in its own "lister-item" div
    for movie in soup.find_all("div", "lister-item"):
        # Extract every field before appending anything, so that a lookup
        # failing part-way through a movie cannot leave the lists with
        # different lengths
        title_link = movie.h3.a
        title = title_link.text
        rating = float(movie.find('div', 'ratings-bar').find('strong').text)
        # The hyphenated alternative comes first so that a name such as
        # "Sci-Fi" is kept as one genre rather than split in two
        genre_list = re.findall(r'\w+(?:-\w+)+|[a-zA-Z]+', movie.find('span', 'genre').text)
        # Strip the '/title/' prefix and the last character of the href
        imdb_id = title_link['href'].replace('/title/', '')[:-1]
        year_text = movie.find('span', 'lister-item-year').text
        # The first run of digits in the year label is used as the year
        year = int(re.findall(r'[0-9]+', year_text)[0])
        # The second "text-muted" paragraph is used as the plot, newlines removed
        plot = movie.find_all("p", "text-muted")[1].text.replace('\n', '')
        # The first link inside the third <p> is used as the director
        director = movie.find_all("p")[2].a.text

        titles.append(title)
        ratings.append(rating)
        genres.append(genre_list)
        imdb_ids.append(imdb_id)
        years.append(year)
        plots.append(plot)
        directors.append(director)

#### Creating a Pandas DataFrame

In [24]:
# Pair each column name with the list scraped for it; the order of the names
# below is the column order of the resulting frame
movies_df = pd.DataFrame(dict(zip(
    ('imdb_id', 'title', 'rate', 'year', 'director', 'genres', 'plot'),
    (imdb_ids, titles, ratings, years, directors, genres, plots),
)))

# Preview the first rows
movies_df.head()

Unnamed: 0,imdb_id,title,rate,year,director,genres,plot
0,tt0111161,Um Sonho de Liberdade,9.3,1994,Frank Darabont,[Drama],Two imprisoned men bond over a number of years...
1,tt0068646,O Poderoso Chefão,9.2,1972,Francis Ford Coppola,"[Crime, Drama]",The aging patriarch of an organized crime dyna...
2,tt9263550,Rocketry: The Nambi Effect,9.0,2022,Madhavan,"[Biography, Drama]",Based on the life of Indian Space Research Org...
3,tt0468569,Batman: O Cavaleiro das Trevas,9.0,2008,Christopher Nolan,"[Action, Crime, Drama]",When the menace known as the Joker wreaks havo...
4,tt0167260,O Senhor dos Anéis: O Retorno do Rei,9.0,2003,Peter Jackson,"[Action, Adventure, Drama]",Gandalf and Aragorn lead the World of Men agai...


#### Saving the DataFrame in a CSV file to use in the Machine Learning model

In [25]:
# index=False leaves the DataFrame's row index out of the written file
movies_df.to_csv(path_or_buf='imdb_top_1000.csv', index=False)