# Scrape the Top 250 Movies from IMDb (31/03/2021)

In [1]:
from htmldownloader import download

In [2]:
# Fetch the IMDb "Top 250" chart page with the imported `download` helper.
# The returned markup is kept in `html`; per the original note the helper
# also saves it to a file (its implementation is not visible here -- confirm).
URL = "http://www.imdb.com/chart/top"
html = download(URL)

In [3]:
# Build the BeautifulSoup tree from the downloaded markup with the lxml parser
from bs4 import BeautifulSoup
soup = BeautifulSoup(markup=html, features="lxml")

## Get the Name, Director(s), Main Cast, Release Year and Rating

In [4]:
# Collect the table cells that hold each movie's details and the cells
# that hold the corresponding ratings, using CSS selectors
detailsList = soup.select("td.titleColumn")
ratingsList = soup.select("td.imdbRating")

# Equivalent keyword-filter form for the first lookup:
# soup.find_all('td', class_ = "titleColumn")

In [5]:
# Inspect the first details cell and the first rating cell, one per line
print(detailsList[0], ratingsList[0], sep="\n")

<td class="titleColumn">
                 1.
                 <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">
                  The Shawshank Redemption
                 </a>
<span class="secondaryInfo">
                  (1994)
                 </span>
</td>
<td class="ratingColumn imdbRating">
<strong title="9.2 based on 2,368,965 user ratings">
                  9.2
                 </strong>
</td>


In [6]:
# One independent empty list per output column: movie names, directors,
# casts, initial release years and IMDb ratings
names, directors, casts, release, ratings = ([] for _ in range(5))

In [7]:
# Fill the column lists from the scraped table cells, one movie per iteration.
DIRECTOR_TAG = "(dir.)"  # marker that follows the director's name in the link title

# The two cell lists are walked in lockstep, so they must be the same length.
assert len(detailsList) == len(ratingsList), "details/ratings cell counts differ"

# Start from empty lists so re-running this cell does not append duplicates.
for column in (names, directors, casts, release, ratings):
    column.clear()

# Pair the cells up instead of indexing a hard-coded range(250): this handles
# however many rows the page actually contains without an IndexError.
for details, rating in zip(detailsList, ratingsList):
    names.append(details.a.string.strip())

    # The link's title attribute looks like "<director> (dir.), <actor>, <actor>"
    fullCast = details.a.attrs.get("title")
    # or     ...a["title"]
    fullCastList = [person.strip() for person in fullCast.split(',')]

    # Remove the marker only when it is really there, rather than blindly
    # chopping a fixed number of characters off the end.
    director = fullCastList[0]
    if director.endswith(DIRECTOR_TAG):
        director = director[:-len(DIRECTOR_TAG)].strip()
    directors.append(director)

    # Every name was stripped above, so the joined string has exactly one
    # space after each comma (joining the raw pieces produced two).
    casts.append(", ".join(fullCastList[1:]))

    # The year is rendered as "(1994)": drop the surrounding parentheses
    release.append(int(details.span.string.strip()[1:-1]))

    ratings.append(float(rating.strong.string.strip()))

In [8]:
# Check the lists
print(len(names))
print(len(directors))
print(len(casts))
print(len(release))
print(len(ratings))

250
250
250
250
250


In [9]:
# Store the results in a Pandas DataFrame
import pandas as pd

In [10]:
# Assemble the five column lists into one table; the dict keys become the column headers
df = pd.DataFrame(
    {
        "Name": names,
        "Director": directors,
        "Cast": casts,
        "Initial Release": release,
        "Rating": ratings,
    }
)

In [11]:
# Label the rows 1..N so the first row is numbered 1 instead of 0.
# Assigning a fresh RangeIndex (rather than adding 1 to the current index)
# keeps this cell idempotent: re-running it does not shift the labels again.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

In [12]:
# Preview the DataFrame: a bare last expression is shown as the cell's output
df

Unnamed: 0,Name,Director,Cast,Initial Release,Rating
1,The Shawshank Redemption,Frank Darabont,"Tim Robbins, Morgan Freeman",1994,9.2
2,The Godfather,Francis Ford Coppola,"Marlon Brando, Al Pacino",1972,9.1
3,The Godfather: Part II,Francis Ford Coppola,"Al Pacino, Robert De Niro",1974,9.0
4,The Dark Knight,Christopher Nolan,"Christian Bale, Heath Ledger",2008,9.0
5,12 Angry Men,Sidney Lumet,"Henry Fonda, Lee J. Cobb",1957,8.9
...,...,...,...,...,...
246,Shin seiki Evangelion Gekijô-ban: Air/Magokoro...,Hideaki Anno,"Megumi Ogata, Megumi Hayashibara",1997,8.0
247,Mandariinid,Zaza Urushadze,"Lembit Ulfsak, Elmo Nüganen",2013,8.0
248,Koe no katachi,Naoko Yamada,"Miyu Irino, Saori Hayami",2016,8.0
249,Drishyam,Nishikant Kamat,"Ajay Devgn, Shriya Saran",2015,8.0


In [13]:
# Write the table to disk as CSV, using ';' rather than ',' to separate fields
df.to_csv(
    path_or_buf="imdb-top-250-movies.csv",
    sep=";",
)

### Bonus: Load the CSV file into a new DataFrame

In [14]:
# Read the semicolon-separated file back, taking the column that pandas
# labels "Unnamed: 0" as the row index, then display the result
df2 = pd.read_csv(
    filepath_or_buffer="imdb-top-250-movies.csv",
    sep=";",
    index_col="Unnamed: 0",
)
df2

Unnamed: 0,Name,Director,Cast,Initial Release,Rating
1,The Shawshank Redemption,Frank Darabont,"Tim Robbins, Morgan Freeman",1994,9.2
2,The Godfather,Francis Ford Coppola,"Marlon Brando, Al Pacino",1972,9.1
3,The Godfather: Part II,Francis Ford Coppola,"Al Pacino, Robert De Niro",1974,9.0
4,The Dark Knight,Christopher Nolan,"Christian Bale, Heath Ledger",2008,9.0
5,12 Angry Men,Sidney Lumet,"Henry Fonda, Lee J. Cobb",1957,8.9
...,...,...,...,...,...
246,Shin seiki Evangelion Gekijô-ban: Air/Magokoro...,Hideaki Anno,"Megumi Ogata, Megumi Hayashibara",1997,8.0
247,Mandariinid,Zaza Urushadze,"Lembit Ulfsak, Elmo Nüganen",2013,8.0
248,Koe no katachi,Naoko Yamada,"Miyu Irino, Saori Hayami",2016,8.0
249,Drishyam,Nishikant Kamat,"Ajay Devgn, Shriya Saran",2015,8.0
