# The IMDB Top 1000 Data will allow us to see top 1000 movies in term of popularity based on submitted user ratings
Below is commented code for the webscraping

In [1]:
# import the needed packages
import pandas as pd
import numpy as np
import requests
from requests import get
from bs4 import BeautifulSoup


from time import sleep # websites will react negatively if the scraping is seemingly done by a machine
                        # so the sleep function allows for pauses
from random import randint # going to need this for the sleep function

headers = {"Accept-Language": "en-US,en;q=0.5"} # had to look this up, hopefully pull english titles if there is one

# set up the empty lists

metascores = []
votes = []
us_gross = []
titles = []
years = []
time = []
imdb_ratings = []


pages = np.arange(1, 1001, 50)

for page in pages: # run through every page on the website that has the list. the sleep function will cause a delay so the website does not stop the scraping

  page = requests.get("https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page) + "&ref_=adv_nxt", headers=headers) # this is the url we are pulling from, the results are spread over several pages so this iterates over them

  soup = BeautifulSoup(page.text, 'html.parser') # beautiful soup is used to go through the html code
  movie_div = soup.find_all('div', class_='lister-item mode-advanced')
  
  sleep(randint(2,10)) # creates a random space of time before the next pull

  for container in movie_div: # iterates through every container in the movie divider
        
        #the below code combs through and collects the data, sorting the data as it goes, each of these lists will become series in a dataframe eventually
        
        name = container.h3.a.text
        titles.append(name)
        
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)

        runtime = container.p.find('span', class_='runtime') if container.p.find('span', class_='runtime') else ''
        time.append(runtime)

        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)

        m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else ''
        metascores.append(m_score)

        nv = container.find_all('span', attrs={'name': 'nv'})
        
        vote = nv[0].text
        votes.append(vote)
        
        grosses = nv[1].text if len(nv) > 1 else ''
        us_gross.append(grosses)

        # creation of a dataframe titled 'movies'. This can be changed after it is created but may be a chore to do so in the code itself
movies = pd.DataFrame({
'movie': titles,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes,
'us_grossMillions': us_gross,
'timeMin': time
})

# cleaning up the data

movies['votes'] = movies['votes'].str.replace(',', '').astype(int) # take out commas  and make into integers

movies.loc[:, 'year'] = movies['year'].str[-5:-1].astype(int)

movies['timeMin'] = movies['timeMin'].astype(str) # change to a string to make it easier to manipulate
movies['timeMin'] = movies['timeMin'].str.extract('(\d+)').astype(int) # strip it and convert to an integer

movies['metascore'] = movies['metascore'].str.extract('(\d+)') 
movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')

movies['us_grossMillions'] = movies['us_grossMillions'].map(lambda x: x.lstrip('$').rstrip('M')) # strip the characters
movies['us_grossMillions'] = pd.to_numeric(movies['us_grossMillions'], errors='coerce') # turn into a number 


# to see your dataframe
print(movies)

# to see the datatypes of your columns
print(movies.dtypes)

# to see where you're missing data and how much data is missing 
print(movies.isnull().sum())

# to move all your scraped data to a CSV file
movies.to_csv('movies.csv')

                                          movie  year  imdb  metascore  \
0                                    Knives Out  2019   7.9       82.0   
1              Once Upon a Time... in Hollywood  2019   7.7       83.0   
2                                 The Gentlemen  2019   7.9       51.0   
3                                      Parasite  2019   8.6       96.0   
4                                Ford v Ferrari  2019   8.1       81.0   
..                                          ...   ...   ...        ...   
995                        Carry On, Munna Bhai  2006   8.1        NaN   
996                            Le Dîner de Cons  1998   7.7       73.0   
997  And Now for Something Completely Different  1971   7.6        NaN   
998                                  The Breath  2009   8.0        NaN   
999                                   Vizontele  2001   8.0        NaN   

      votes  us_grossMillions  timeMin  
0    340943            165.36      130  
1    487755            142.50

In [3]:
# while I have the movies df, imdb_1000 is much more descriptive 
# and will allow for better tracking of what data I am using


imdb_1000 = pd.read_csv('movies.csv')
imdb_1000



Unnamed: 0.1,Unnamed: 0,movie,year,imdb,metascore,votes,us_grossMillions,timeMin
0,0,Knives Out,2019,7.9,82.0,340943,165.36,130
1,1,Once Upon a Time... in Hollywood,2019,7.7,83.0,487755,142.50,161
2,2,The Gentlemen,2019,7.9,51.0,157213,,113
3,3,Parasite,2019,8.6,96.0,441832,53.37,132
4,4,Ford v Ferrari,2019,8.1,81.0,231742,117.62,152
...,...,...,...,...,...,...,...,...
995,995,"Carry On, Munna Bhai",2006,8.1,,41455,2.22,144
996,996,Le Dîner de Cons,1998,7.7,73.0,36386,4.07,80
997,997,And Now for Something Completely Different,1971,7.6,,28293,,88
998,998,The Breath,2009,8.0,,31047,,128


In [6]:
imdb_1000['year'].min()

1920

In [4]:
imdb_1000['year'].max()

2019

In [8]:
imdb_1000['us_grossMillions'].max()

936.66

In [10]:
imdb_1000[imdb_1000['us_grossMillions'] == imdb_1000['us_grossMillions'].max()]

Unnamed: 0.1,Unnamed: 0,movie,year,imdb,metascore,votes,us_grossMillions,timeMin
191,191,Star Wars: Episode VII - The Force Awakens,2015,7.9,80.0,841179,936.66,138
