### Imports

In [5]:
import requests
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup

### Constants & Initializations

In [6]:
URL = "https://web.archive.org/web/20230902185655/https://en.everybodywiki.com/100_Most_Highly-Ranked_Films"
DB_NAME = "Movies.db"
DB_TABLE_NAME = "Top_50"
CSV_PATH = "top_50_films.csv"

df = pd.DataFrame(columns=["Average Rank", "Film", "Year"])
count = 0

### Loading Webpage

In [7]:
html_page = requests.get(URL).text
data = BeautifulSoup(html_page, "html.parser")

### Load Table & Rows

In [22]:
tables = data.find_all("tbody")
rows = tables[0].find_all("tr")

for row in rows:
    print(row)
    # limit to 50 rows
    if count < 50:
        col = row.find_all("td")

        if len(col) != 0:
            data_dict = {
                "Average Rank": col[0].contents[0],
                "Film": col[1].contents[0],
                "Year": col[2].contents[0]
            }

            df1 = pd.DataFrame(data_dict, index=[0])

            df = pd.concat(
                [df, df1],
                ignore_index=True
            )

            count += 1
    else:
        break

<tr>
<th>Average Rank</th>
<th>Film</th>
<th>Year</th>
<th>Rotten Tomatoes' Top 100<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup></th>
<th>IMDb's Top 250 <sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup></th>
<th>Empire's Top 100 <sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[3]</a></sup></th>
<th>AFI's Top 100 <sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup></th>
<th>BFI's Top 100 <sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[5]</a></sup>
</th></tr>


In [23]:
df

Unnamed: 0,Average Rank,Film,Year
0,1,The Godfather,1972
1,2,Citizen Kane,1941
2,3,Casablanca,1942
3,4,"The Godfather, Part II",1974
4,5,Singin' in the Rain,1952
5,6,Psycho,1960
6,7,Rear Window,1954
7,8,Apocalypse Now,1979
8,9,2001: A Space Odyssey,1968
9,10,Seven Samurai,1954


### Save to CSV

In [25]:
df.to_csv(CSV_PATH, index=False)

### Save to SQLite Database

In [26]:
conn = sqlite3.connect(DB_NAME)
df.to_sql(DB_TABLE_NAME, conn, if_exists="replace", index=False)
conn.close()

<hr>

### Extract Film, Year, and Rotten Tomatoes columns instead

<i>

Modify the code to extract Film, Year, and Rotten Tomatoes' Top 100 headers.

Restrict the results to only the top 25 entries.

Filter the output to print only the films released in the 2000s (year 2000 included).

</i>

In [45]:
tables2 = data.find_all("tbody")
rows2 = tables2[0].find_all("tr")
count2 = 0
df3 = pd.DataFrame(columns=["Film", "Year", "Rotten Tomatoes' Top 100"])

for row in rows2:
    if count2 >= 25:
        break
    
    col = row.find_all("td")
    
    if len(col) > 0:

        data_dict2 = {
            "Film": col[1].contents[0],
            "Year": col[2].contents[0],
            "Rotten Tomatoes' Top 100": col[3].contents[0]
        }

        df4 = pd.DataFrame(data_dict2, index=[0])

        df3 = pd.concat(
            [df3, df4],
            ignore_index=True
        )

        count += 1

df3

Unnamed: 0,Film,Year,Rotten Tomatoes' Top 100
0,The Godfather,1972,17
1,Citizen Kane,1941,2
2,Casablanca,1942,8
3,"The Godfather, Part II",1974,99
4,Singin' in the Rain,1952,52
...,...,...,...
103,Titanic,1997,unranked
104,Toy Story,1995,unranked
105,Reservoir Dogs,1992,unranked
106,Paddington 2,2018,93


In [53]:
df3["Year"] = df3["Year"]

df3_2000 = df3[df3["Year"].str.startswith("200")]
df3_2000

Unnamed: 0,Film,Year,Rotten Tomatoes' Top 100
18,Lord of the Rings: The Fellowship of the Ring,2001,unranked
36,The Dark Knight,2008,unranked
42,Lord of the Rings: Return of the King,2003,unranked
48,Lord of the Rings: The Two Towers,2002,unranked
59,Gladiator,2000,unranked
63,Spirited Away,2001,unranked
75,Mulholland Drive,2001,unranked
76,In the Mood for Love,2000,unranked
89,Eternal Sunshine of the Spotless Mind,2004,unranked
96,Inglourious Basterds,2009,unranked
