# Crawling

In [48]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [49]:
# Browser-like HTTP headers passed to requests.get() by get_movies.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

#### The get_movies function takes the URL of a page listing movies (generally 50 movies per page) and retrieves the data about them.

In [50]:
def _parse_movie(movie):
    """Extract the fields of one movie from a ``.lister-item-content`` element.

    Args:
        movie: A BeautifulSoup tag holding a single search-result entry.

    Returns:
        A dict with the keys ``title``, ``time``, ``genre``, ``rating``,
        ``description``, ``votes``, optionally ``directors`` and ``stars``,
        then ``year`` and ``gross`` (``None`` when no gross is listed or it
        cannot be parsed). Returns ``None`` when a required element is
        missing or the year cannot be read, so the caller can skip the entry.
    """
    try:
        # The votes paragraph is split into lines once and reused: index 1 is
        # the vote count and, when present, index 3 is the gross figure.
        votes_lines = movie.select(".sort-num_votes-visible")[0].get_text().strip().split("\n")

        data = {
            "title": movie.select(".lister-item-header")[0].get_text().strip().split("\n")[1],
            "time": movie.select(".runtime")[0].get_text().strip().split()[0],
            # Only the first listed genre is kept.
            "genre": movie.select(".genre")[0].get_text().strip().replace(" ", "").split(",")[0],
            "rating": movie.select(".ratings-imdb-rating")[0].get_text().strip(),
            "description": movie.select(".text-muted")[2].get_text().strip(),
            "votes": votes_lines[1].replace(",", ""),
        }

        year_text = movie.select(".lister-item-year")[0].get_text().strip()
    except IndexError:
        # A required element is absent from this entry.
        return None

    # Only direct child tags that actually contain person links are treated as
    # the credits line; a title or description that merely mentions the word
    # "Director" can no longer be mistaken for it.
    for child in movie.find_all(True, recursive=False):
        person_links = child.select('a[href^="/name"]')
        credits_text = child.get_text()
        if not person_links or "Director" not in credits_text:
            continue
        # The number of directors is inferred from the line breaks that precede
        # the "|" separator (two of them are not names). Clamped at zero so an
        # unexpected layout cannot produce a negative slice bound.
        num_directors = max(credits_text.split("|")[0].count("\n") - 2, 0)
        names = [link.get_text() for link in person_links]
        data["directors"] = names[:num_directors]
        data["stars"] = names[num_directors:]

    # Keep the first four digits only, so extra digits in the year text are not
    # concatenated into one bogus number; skip the entry if no year is present.
    year_digits = "".join(ch for ch in year_text if ch.isdigit())[:4]
    if len(year_digits) < 4:
        return None
    try:
        data["year"] = int(year_digits)
    except ValueError:
        return None

    gross = None
    if len(votes_lines) > 3:
        try:
            # [1:-1] drops the first and last character (presumably a currency
            # sign and an "M" suffix, e.g. "$1.27M" - confirm against the page).
            # round() rather than int(): the float product can land just below
            # the intended integer (2.01 * 1e6 -> 2009999.9999999998).
            gross = round(float(votes_lines[3][1:-1]) * 1e6)
        except ValueError:
            # Unexpected gross format: keep the movie, leave gross empty.
            gross = None
    data["gross"] = gross

    return data


def get_movies(url, timeout=30):
    """Download one search-result page and return the movies listed on it.

    Args:
        url: Address of the result page to fetch.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Without it a stalled connection would block
            forever.

    Returns:
        A list of dicts, one per movie that could be parsed (see
        ``_parse_movie`` for the keys). Entries missing a required field are
        skipped. The list is empty when the page contains no entries.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, instead of silently parsing an error page into ``[]``.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    content = BeautifulSoup(resp.content, "lxml")

    movies = []
    for item in content.select(".lister-item-content"):
        data = _parse_movie(item)
        if data is not None:
            movies.append(data)

    return movies


#### Use the get_movies function to crawl the data about all movies on IMDb with at least 10,000 votes.

In [52]:
MINIMUM_VOTES = 10000

# Walk the alphabetically sorted result pages, advancing the `start` offset by
# 50 each time, and stop at the first page that yields no movies.
movies = []
for i in range(1, 10000, 50):
    url = f"https://www.imdb.com/search/title/?title_type=feature&num_votes={MINIMUM_VOTES},&sort=alpha,asc&start={i}&ref_=adv_nxt"
    new_movies = get_movies(url)
    if not new_movies:
        break
    movies.extend(new_movies)

# Build the table once from the collected records and save it to disk.
dataframe = pd.DataFrame(movies)
dataframe.to_csv("movies.csv", index=False)

dataframe

Unnamed: 0,title,time,genre,rating,description,votes,directors,stars,year,gross
0,#Home,158,Drama,8.9,Oliver Twist (Indrans) wants to be tech-savvy ...,12361,[Rojin Thomas],"[Indrans, Sreenath Bhasi, Manju Pillai, Naslen]",2021,
1,#Saraitda,98,Action,6.3,The rapid spread of an unknown infection has l...,37072,[Il Cho],"[Yoo Ah-in, Park Shin-Hye, Jeon Bae-soo, Hyun-...",2020,
2,'71,99,Action,7.2,"In 1971, a young and disoriented British soldi...",55642,[Yann Demange],"[Jack O'Connell, Sam Reid, Sean Harris, Paul P...",2014,1270000.0
3,'A' gai wak,105,Action,7.3,Fighting against pirates at the turn of the 20...,16657,"[Jackie Chan, Jackie Chan]","[Sammo Kam-Bo Hung, Jackie Chan, Sammo Kam-Bo ...",1983,
4,'A' gai wak 2,106,Action,7.0,Dragon is now transferred to be the police hea...,10180,[Jackie Chan],"[Jackie Chan, Maggie Cheung, Rosamund Kwan, Ca...",1987,
...,...,...,...,...,...,...,...,...,...,...
9684,È stata la mano di Dio,130,Drama,7.3,"In 1980s Naples, young Fabietto pursues his lo...",37807,[Paolo Sorrentino],"[Filippo Scotti, Toni Servillo, Teresa Saponan...",2021,
9685,Ôdishon,115,Drama,7.1,A widower takes an offer to screen girls at a ...,79633,[Takashi Miike],"[Ryo Ishibashi, Eihi Shiina, Tetsu Sawaki, Jun...",1999,
9686,Ôkami kodomo no Ame to Yuki,117,Animation,8.1,After her werewolf lover unexpectedly dies in ...,43791,[Mamoru Hosoda],"[Aoi Miyazaki, Takao Osawa, Haru Kuroki, Yukit...",2012,
9687,Ölümlü Dünya,107,Action,7.6,Mermer Family lives a double life working at t...,26066,[Ali Atay],"[Ahmet Mümtaz Taylan, Alper Kul, Sarp Apak, Ir...",2018,
