<a href="https://colab.research.google.com/github/rodgersxy/Web-Scraping/blob/main/moviescraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing relevant libraries
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS

sns.set()

In [2]:
# Create a list for all pages
pages = list(range(1,7))

# Create lists for columns needed for data frame
title = []
runtime = []
genre = []
imdb_rating = []
year = []
metascore = []
director_names = []
actors_names = []
gross = []

# Loop through all ages to get information
for page in tqdm(pages):
    params = {
        'st_dt': '',
        'mode': 'detail',
        'page': page,
        'sort': 'list_order,asc'
    }

    web_page_url = 'https://www.imdb.com/list/ls062911411/'

    response = requests.get(web_page_url, params=params)


    # create an instance of beautiful soup for film information
    film_soup = BeautifulSoup(response.content, 'html.parser')

    # Get the divs where the movie information is located
    film_info = film_soup.find_all('div', class_= 'lister-item-content')



    # Loop through film_info object to extract necessary information
    for item in tqdm(film_info):
        title.append((item.a.string))

        time_ = item.find('span', class_ = 'runtime')
        runtime.append(time_.string)

        genre_= item.find('span', class_ = 'genre')
        genre.append(((genre_.string).replace('\n', '')).strip())

        rate = item.find('span',class_ = 'ipl-rating-star__rating')
        imdb_rating.append(float(rate.string))

        year_ = item.find('span', class_ = 'lister-item-year text-muted unbold').string.split()
        if len(year_) <= 1:
            year.append(int(year_[0][1:5]))
        else:
            year.append(int(year_[1][1:5]))

        if item.find('span', class_ = 'metascore favorable') == None:
            metascore.append(np.nan)
        else:
            metascore_ = item.find('span', class_ = 'metascore favorable').string.strip()
            metascore.append(int(metascore_))


    # Create an instance of beautiful soup for directors and actors information
    directors_actors_soup = BeautifulSoup(response.content, 'html.parser')

    # Retrieve all tags and links that have directors and actors information
    director_links = directors_actors_soup.find_all('p', {'class': 'text-muted text-small'})

    directors_info = []
    for links in director_links:
        directors_info.append(links.a)

    # Retreving director names. Ensuring the list is 100
    for name in tqdm(directors_info):
        if name != None:
            director_names.append(name.string)

    # Retreive actors information
    actors_links = []
    for each_tag in directors_actors_soup.find_all('p', {'class': 'text-muted text-small'}):
        if each_tag.find_all('a') != []:
            actors_links.append(each_tag.findAll('a')[1:])


    for links in actors_links:
        each_name = []
        for each_link in tqdm(links):
            each_name.append(each_link.string)
        actors_names.append(each_name)


    # Retrieve list for gross
    list_of_grossEarnings = []
    for i in directors_actors_soup.find_all('p', {'class': 'text-muted text-small'}):
        list_of_grossEarnings.append(i.findAll('span', {'name': 'nv' }))


    for lists in tqdm(list_of_grossEarnings):
        if len(lists) > 1:
            gross.append(lists[1].string)
        elif len(lists) == 1:
            gross.append('N/A')

print('Done Scrapping!')

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
 48%|████▊     | 48/100 [00:00<00:00, 472.74it/s][A
100%|██████████| 100/100 [00:00<00:00, 453.95it/s]

100%|██████████| 300/300 [00:00<00:00, 455242.84it/s]

100%|██████████| 4/4 [00:00<00:00, 30066.70it/s]

100%|██████████| 4/4 [00:00<00:00, 33825.03it/s]

100%|██████████| 5/5 [00:00<00:00, 42974.43it/s]

100%|██████████| 4/4 [00:00<00:00, 36002.61it/s]

100%|██████████| 4/4 [00:00<00:00, 35544.95it/s]

100%|██████████| 4/4 [00:00<00:00, 33893.37it/s]

100%|██████████| 4/4 [00:00<00:00, 37200.04it/s]

100%|██████████| 4/4 [00:00<00:00, 24244.53it/s]

100%|██████████| 4/4 [00:00<00:00, 26420.81it/s]

100%|██████████| 4/4 [00:00<00:00, 38043.57it/s]

100%|██████████| 4/4 [00:00<00:00, 36631.48it/s]

100%|██████████| 5/5 [00:00<00:00, 47662.55it/s]

100%|██████████| 4/4 [00:00<00:00, 26011.19it/s]

100%|██████████| 4/4 [00:00<00:00, 38043.57it/s]

100%|██████████| 4/4 [00:00<00:00, 40136.88it/s]

100%|████

Done Scrapping!





In [3]:
# Check each list to ensure we have 517 entries
check = [title, runtime, genre, imdb_rating, year, metascore, director_names, actors_names,
         gross]
for each_list in check:
    print(len(each_list))

517
517
517
517
517
517
517
517
517


In [4]:
# Putting our data in a data frame
data_frame = pd.DataFrame({'Movie_title': title, 'Genre': genre, 'Director': director_names, 'Actors': actors_names, 'Duration': runtime, 'Year': year, 'IMDB Rating': imdb_rating, 'Meta Score': metascore, 'Gross earnings': gross}, index=range(1, len(title)+1))
data_frame

Unnamed: 0,Movie_title,Genre,Director,Actors,Duration,Year,IMDB Rating,Meta Score,Gross earnings
1,Citizen Kane,"Drama, Mystery",Orson Welles,"[Orson Welles, Joseph Cotten, Dorothy Comingor...",119 min,1941,8.3,100.0,$1.59M
2,The Godfather,"Crime, Drama",Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K...",175 min,1972,9.2,100.0,$134.97M
3,The Wizard of Oz,"Adventure, Family, Fantasy",Victor Fleming,"[King Vidor, Judy Garland, Frank Morgan, Ray B...",102 min,1939,8.1,92.0,$2.08M
4,The Shawshank Redemption,Drama,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",142 min,1994,9.3,82.0,$28.34M
5,Pulp Fiction,"Crime, Drama",Quentin Tarantino,"[John Travolta, Uma Thurman, Samuel L. Jackson...",154 min,1994,8.9,95.0,$107.93M
...,...,...,...,...,...,...,...,...,...
513,From Here to Eternity,"Drama, Romance, War",Fred Zinnemann,"[Burt Lancaster, Montgomery Clift, Deborah Ker...",118 min,1953,7.6,85.0,$30.50M
514,Walk the Line,"Biography, Drama, Music",James Mangold,"[Joaquin Phoenix, Reese Witherspoon, Ginnifer ...",136 min,2005,7.8,72.0,$119.52M
515,The Last Emperor,"Biography, Drama, History",Bernardo Bertolucci,"[John Lone, Joan Chen, Peter O'Toole, Ruocheng...",163 min,1987,7.7,76.0,$43.98M
516,Mutiny on the Bounty,"Adventure, Biography, Drama",Frank Lloyd,"[Charles Laughton, Clark Gable, Franchot Tone,...",132 min,1935,7.6,87.0,
