In [42]:
import urllib.request
from bs4 import BeautifulSoup

In [43]:
def get_all_tables():
    """Download the yearly film-list pages and collect their sortable wikitables.

    Requests, in order, the English Wikipedia pages "2017 in film",
    "2018 in film" and "List of American films of 2019". From each page it
    gathers the ``<table>`` elements selected by ``class="wikitable sortable"``.
    When a page yields exactly five such tables the first one is discarded
    (presumably it is not a release table -- TODO confirm).

    Returns:
        list: the selected table elements, grouped by page in the order above.
    """
    page_urls = (
        "https://en.wikipedia.org/wiki/2017_in_film#2017_films",
        "https://en.wikipedia.org/wiki/2018_in_film#2018_films",
        "https://en.wikipedia.org/wiki/List_of_American_films_of_2019",
    )
    collected = []
    for page_url in page_urls:
        # Only the download needs the open connection; parsing works on the bytes.
        with urllib.request.urlopen(page_url) as response:
            markup = response.read()
        parsed = BeautifulSoup(markup, "html.parser")
        matches = parsed.find_all("table", attrs={"class": "wikitable sortable"})
        if len(matches) == 5:
            # Five matches: keep only the last four.
            matches = matches[1:]
        collected.extend(matches)
    return collected

In [44]:
# Download all release tables once (network access); the parsing cell below
# walks this list by index.
tables = get_all_tables()

In [45]:
# Walk every release table and build two lookups:
#   movies_dict[movie_title]   -> [movie_url, release_year, studio_title]
#   studios_dict[studio_title] -> studio_url
# Titles come from the link's `title` attribute, so a later row with the same
# title overwrites an earlier one.
movies_dict = {}
studios_dict = {}
for i, table in enumerate(tables):
    # Floor division keeps the year an int (2017, 2018, 2019); the previous
    # `i/4` is true division in Python 3 and produced values like 2017.25.
    # NOTE(review): assumes `tables` holds exactly four tables per page, in
    # page order -- confirm against get_all_tables().
    year = 2017 + i // 4
    # Fall back to the table itself when the markup has no explicit <tbody>
    # (find() would return None and the row lookup would raise).
    body = table.find("tbody") or table
    # Only rows 5..99 are considered, as in the original cell.
    # NOTE(review): the reason for skipping the first five rows is not visible here.
    rows = body.find_all('tr')[5:100]
    for row in rows:
        cols = row.find_all('td')[0:3]
        if cols and cols[0].get("rowspan"):
            # A leading cell carrying `rowspan` is skipped; title and studio follow it.
            cols = cols[1:3]
        else:
            cols = cols[0:2]
        if len(cols) < 2:
            # Header/short rows have no (title, studio) pair; indexing cols[1]
            # here used to raise IndexError.
            continue
        movie_link = cols[0].find('a')
        studio_link = cols[1].find('a')
        if movie_link is None or studio_link is None:
            # Both cells must link to an article to be recorded.
            continue
        movie_url = "https://en.wikipedia.org" + movie_link.get('href')
        movie_title = movie_link.get('title')
        studio_url = "https://en.wikipedia.org" + studio_link.get('href')
        studio_title = studio_link.get('title')
        movies_dict[movie_title] = [movie_url, year, studio_title]
        studios_dict[studio_title] = studio_url

In [46]:
def get_studio_details(studio_url):
    """Scrape founder, headquarters and website from a studio page's infobox.

    Args:
        studio_url: absolute URL of the studio's article.

    Returns:
        dict: a subset of the keys ``"founder"``, ``"head_quarters"`` and
        ``"website"``, each mapped to text taken from the first
        ``<table>`` selected by ``class="infobox"``. Returns ``{}`` when the
        page has no such table.

        ``"founder"`` is the text before the first comma of the first row whose
        heading is one of Founder / Founders / Founder(s) / Parent / Key people /
        Owned by, so it may name a parent company or an executive rather than
        an actual founder.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable URL.
    """
    founder_headings = ("Founder", "Founders", "Founder(s)", "Parent", "Key people", "Owned by")

    with urllib.request.urlopen(studio_url) as response:
        page = response.read()

    soup = BeautifulSoup(page, "html.parser")
    info_box = soup.find("table", attrs={"class": "infobox"})
    if info_box is None:
        return {}

    # The rows were previously looked up on info_box.find("tbody"), which is
    # None (-> AttributeError) when the table has no explicit <tbody>.
    body = info_box.find("tbody") or info_box

    details_map = {}
    for row in body.find_all("tr"):
        heading = row.find("th")
        data = row.find("td")
        if heading is None or data is None:
            # Only label/value rows carry usable data.
            continue
        heading = heading.getText()
        data = data.getText()
        # First matching heading wins; later ones do not overwrite it.
        if "founder" not in details_map and heading in founder_headings:
            details_map["founder"] = data.split(",")[0]
        if heading == "Headquarters":
            details_map["head_quarters"] = data
        elif heading in ("Website", "website"):
            details_map["website"] = data
    return details_map

In [47]:
# Accumulator for one record (dict) per studio; the crawl loop in the next cell
# appends to it, so re-run this cell first to avoid duplicate records.
studios_data = []

In [48]:
# Fetch the infobox details of every studio and append one flat record per
# studio to studios_data; fields the scraper did not return become "Not found".
count = 0     # not updated in this cell
progress = 0  # not updated in this cell
for studio_name, studio_page in studios_dict.items():
    scraped = get_studio_details(studio_page)
    studios_data.append({
        'studio_name': studio_name,
        'founder': scraped.get('founder', "Not found"),
        'head_quarters': scraped.get('head_quarters', "Not found"),
        'website': scraped.get('website', "Not found"),
    })

In [49]:
import pickle
# Serialize the studios_data list to studios_data.pkl (path is relative to the
# current working directory); an existing file is overwritten.
with open('studios_data.pkl', 'wb') as f:
    pickle.dump(studios_data, f)

In [398]:
def get_movie_details(movie_url):
    """Scrape director, producer, cast and language from a film page's infobox.

    Args:
        movie_url: absolute URL of the film's article.

    Returns:
        dict: a subset of the keys below, taken from the first ``<table>``
        selected by ``class="infobox vevent"`` (``{}`` when there is none):

        * ``"Directed by"`` / ``"Produced by"`` -- the first listed value.
        * ``"Starring"`` -- all listed values joined with ``","``.
        * ``"Language"`` -- ``"English"`` if the first listed value contains
          "English", otherwise ``"Non English"``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable URL.
    """
    details_dict = {}
    with urllib.request.urlopen(movie_url) as response:
        page = response.read()

    soup = BeautifulSoup(page, "html.parser")
    info_box = soup.find("table", attrs={"class": "infobox vevent"})
    if info_box is None:
        return {}

    # Tolerate an infobox without an explicit <tbody>; find() returns None then
    # and the row lookup used to raise AttributeError.
    body = info_box.find("tbody") or info_box

    # The first row is skipped, as before (presumably the title row -- TODO confirm).
    for row in body.find_all("tr")[1:]:
        heading = row.find("th")
        data = row.find("td")
        if heading is None or data is None:
            # A heading-only row has no value cell; calling data.find() on it
            # used to raise AttributeError.
            continue
        field = heading.getText()

        # Multi-valued cells list their entries as <li> items inside a
        # "plainlist" <div>.
        details = []
        plainlist = data.find('div', attrs={"class": "plainlist"})
        if plainlist is not None:
            item_list = plainlist.find('ul')
            if item_list is not None:
                details = [item.getText() for item in item_list.find_all('li')]
        if not details:
            # No usable list: treat the whole cell text as the single value.
            # Cells holding some other <div> used to leave `details` empty, so
            # `details[0]` below raised IndexError.
            details = [data.getText()]

        if field in ("Directed by", "Produced by"):
            details_dict[field] = details[0]
        elif field == "Starring":
            details_dict[field] = ",".join(details)
        elif field == "Language":
            details_dict[field] = "English" if 'English' in details[0] else "Non English"
    return details_dict

In [399]:
# Counters used by the movie-scraping loop below.
valid_movies = 0  # movies whose scraped details contained all four fields
progress = 0  # movies processed so far, accepted or not

In [400]:
# Accumulator for one record (dict) per accepted movie; the scraping loop below
# appends to it, so re-run this cell first to avoid duplicate records.
movies_data = []

In [401]:
# Scrape every movie page and keep only the movies for which all four infobox
# fields (director, producer, cast, language) were found. A running
# "processed accepted" pair is printed every 30 movies.
for title, listing in movies_dict.items():
    # listing is [movie_url, release_year, studio_title]
    scraped = get_movie_details(listing[0])
    if len(scraped) == 4:
        movies_data.append({
            'movie_name': title,
            'production_studio': listing[2],
            'release_year': listing[1],
            'director': scraped['Directed by'],
            'producer': scraped['Produced by'],
            'cast_and_crew': scraped['Starring'],
            'language': scraped['Language'],
            'movie_category': "Not updated",
        })
        valid_movies += 1
    progress += 1
    if progress % 30 == 0:
        print(progress, valid_movies)

30 26
60 56
90 82
120 112
150 141
180 170
210 198
240 227
270 256
300 285
330 315
360 344
390 374
420 404
450 434
480 462
510 492
540 520
570 549
600 578


In [404]:
import pickle
# Serialize the movies_data list to movies_data.pkl (path is relative to the
# current working directory); an existing file is overwritten.
with open('movies_data.pkl', 'wb') as f:
    pickle.dump(movies_data, f)

In [405]:
# Read movies_data.pkl back into memory; `mynewlist` is not referenced again in
# the cells visible here.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles produced by this notebook, never untrusted ones.
with open('movies_data.pkl', 'rb') as f:
    mynewlist = pickle.load(f)

In [39]:
import requests
import time
# Endpoint that receives one POST per studio record (a service on localhost:5000).
url = 'http://localhost:5000/studios'

In [40]:
# json.dumps is called below, but `json` was not imported in any visible cell,
# so this cell raised NameError on a fresh kernel.
import json

# POST every studio record to `url` as a JSON body.
count = 0  # not updated in the visible part of this loop
# The headers are identical for every request, so build them once.
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
for data in studios_data:
    payload = json.dumps(data)
    r = requests.post(url, data=payload, headers=headers)