# IMDB Top 500 Movies Scraper

In [84]:
from IPython.display import Javascript, display
from ipywidgets import widgets

def run_all(ev):
    """Button click handler: run every notebook cell below the selected one.

    Sends a JavaScript snippet to the notebook front end that executes the
    cell range starting one past the selected cell and ending at the last
    cell. The ``ev`` argument supplied by the click callback is not used.
    """
    script = 'IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.ncells())'
    display(Javascript(script))

# Create a button labelled "Run all below" and register run_all as its click handler.
button = widgets.Button(description="Run all below")
button.on_click(run_all)
# Render the button in the cell output.
display(button)

Button(description='Run all below', style=ButtonStyle())

### Importing all libraries

In [68]:
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import re
import time
import pandas as pd
#pd.set_option('display.max_rows', 700)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

### 1. Index Extraction Function

In [69]:
def index_extract(movie):
    """Return the movie's position in the list as an int.

    Looks up the element with class 'lister-item-index unbold text-primary'
    and parses the part of its text that precedes the first '.'.
    Returns None when the element is not present.
    """
    position = movie.find(class_='lister-item-index unbold text-primary')
    if position is None:
        return None
    rank, _, _ = position.text.partition('.')
    return int(rank)

### 2. Name Extraction Function

In [70]:
def name_extract(movie):
    """Return the movie title, or None when the header element is missing.

    The title is the text of the first <a> inside the element with class
    'lister-item-header'.
    """
    header = movie.find(class_='lister-item-header')
    return None if header is None else header.find('a').text

### 3. Year of Release Extraction Function

In [85]:
def year_extract(movie):
    """Return the release year of a movie as an int.

    Reads the element with class 'lister-item-year text-muted unbold' and
    takes the first run of digits wrapped in parentheses, so a text such as
    '(I) (2019)' yields 2019.

    Returns None when the year element is missing or contains no
    parenthesised number.
    """
    year_elem = movie.find(class_='lister-item-year text-muted unbold')
    # Guard against a missing element, like the other extractors do,
    # instead of failing with AttributeError on `.text`.
    if year_elem is None:
        return None
    match = re.search(r'\((\d+)\)', year_elem.text)
    return int(match.group(1)) if match else None

### 4. Genre Extraction Function

In [72]:
def genre_extract(movie):
    """Return the genre text of a movie, or None when there is no genre element.

    The text of the element with class 'genre' is stripped of surrounding
    whitespace and any remaining newline characters are removed.
    """
    genre_elem = movie.find(class_='genre')
    if genre_elem is None:
        return None
    trimmed = genre_elem.text.strip()
    return trimmed.replace('\n', '')

### 5. IMDB Rating Extraction Function

In [86]:
def imdb_extract(movie):
    """Return the IMDB rating of a movie as a float.

    The rating is read from the <strong> tag inside the element with class
    'inline-block ratings-imdb-rating'. Returns None when that element is
    not present.
    """
    container = movie.find(class_='inline-block ratings-imdb-rating')
    if container is not None:
        return float(container.find('strong').text)
    return None

### 6. Metascore Extraction Function

In [74]:
def metascore_extract(movie):
    """Return the metascore of a movie as a stripped string.

    Returns None when the movie has no element with class 'metascore'.
    """
    score_elem = movie.find(class_='metascore')
    return None if score_elem is None else str(score_elem.text.strip())

### 7. Summary Extraction Function

In [75]:
def summary_extract(movie):
    """Return the summary text of a movie.

    The summary is taken from the last <p class="text-muted"> element of the
    movie block, with newlines removed and surrounding whitespace stripped.

    Returns None when the block contains no such paragraph.
    """
    paragraphs = movie.find_all('p', {'class': 'text-muted'})
    # find_all returns an empty list (never None) when nothing matches, so
    # test for emptiness to avoid an IndexError on paragraphs[-1].
    if not paragraphs:
        return None
    return str(paragraphs[-1].text.replace('\n', '').strip())

### 8. Director and Star Extraction Function

In [87]:
def director_and_star_extract(movie):
    """Return ``[directors, stars]`` for a movie block.

    The credits are read from the last class-less <p> element whose text
    contains the word 'Director'. After newlines are removed, that text is
    expected to look like 'Director:NAME | Stars:NAME, NAME'.

    Each entry is the text between its label and the '|' separator (or the
    end of the text), stripped of surrounding whitespace. Capturing up to
    the separator, rather than whitelisting characters, keeps names that
    contain hyphens, apostrophes or accented letters intact.

    An entry is None when its label is not found, so the result is
    ``[None, None]`` when no credits paragraph exists.
    """
    credits = ''
    # made by Kunal N Gohrani
    # The credits paragraph has no class attribute, so it is identified by
    # checking whether the word 'Director' is present in its text.
    for paragraph in movie.find_all('p', {'class': ''}):
        if 'Director' in paragraph.text:
            credits = paragraph.text
    credits = credits.replace('\n', '').strip()

    # Directors come first in the text; stars follow the '|' separator.
    director = re.search(r'^Directors?:?\s*([^|]*)', credits)
    star = re.search(r'Stars?:\s*([^|]*)', credits)
    return [
        director.group(1).strip() if director else None,
        star.group(1).strip() if star else None,
    ]

### 9. Votes Extraction Function

In [77]:
def votes_extract(movie):
    """Return the vote count of a movie as a string such as '2,345,678'.

    The count is parsed from the text of the element with class
    'sort-num_votes-visible'; thousands separators are kept as-is.

    Returns None when that element is missing or holds no vote count.
    """
    bar = movie.find(class_='sort-num_votes-visible')
    # Guard against a missing element, like the other extractors do,
    # instead of failing with AttributeError on `.text`.
    if bar is None:
        return None
    text = bar.text.strip().replace('\n', '')
    match = re.search(r'Vote[s:]+([0-9,]+)', text)
    return match.group(1) if match else None

### 10. Gross Extraction Function

In [78]:
def gross_extract(movie):
    """Return the gross earnings of a movie as a string such as '$28.34M'.

    The value is parsed from the text that follows 'Gross:' in the element
    with class 'sort-num_votes-visible'.

    Returns None when that element is missing or lists no gross figure.
    """
    bar = movie.find(class_='sort-num_votes-visible')
    # Guard against a missing element, like the other extractors do,
    # instead of failing with AttributeError on `.text`.
    if bar is None:
        return None
    text = bar.text.strip().replace('\n', '')
    match = re.search(r'Gross:([\$0-9.,A-Z]+)', text)
    return match.group(1) if match else None

### 11. Next URL Finder

In [79]:
def next_url_fetcher(soup):
    """Return the absolute URL of the next results page, or None on the last page.

    The relative link is read from the 'href' of the element with class
    'lister-page-next next-page' and prefixed with 'https://www.imdb.com'.
    """
    next_anchor = soup.find(class_='lister-page-next next-page')
    if next_anchor is None:
        return None
    return 'https://www.imdb.com' + next_anchor.get('href')


### 12. URL Fetcher

In [88]:
def url_fetcher(url=''):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address to fetch. When empty, the first page of the IMDB
            top-250 search results (sorted by user rating) is used.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend.

    Raises:
        urllib.error.URLError: if the request fails (propagated unchanged
            from urllib).
    """
    if url == '':
        url = 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&ref_=adv_prv'
    # Send a browser-like User-Agent and request English content.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
        'Accept-Language': 'en-US,en;q=0.8',
    }
    request = urllib.request.Request(url, None, headers)
    # The context manager closes the HTTP response once the body is read,
    # so the connection is not leaked.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    return BeautifulSoup(content, 'html.parser')

### 13. DataFrame Maker and Exporter

In [81]:
def DataFrame_Printer_and_Exporter():
    """Build a DataFrame from the module-level result lists and export it.

    Each column is taken from the corresponding module-level list and the
    rows are indexed by the module-level ``index`` list. The frame is
    written to 'IMDB.csv' in the current working directory and returned.
    """
    columns = {
        'Name': name,
        'Year of Release': year,
        'Genre': genre,
        'IMDB Rating': IMDB_rating,
        'Metascore': metascore,
        'Director(s)': directors,
        'Star(s)': stars,
        'Summary': summary,
        'Votes': votes,
        'Gross Earning': grosses,
    }
    frame = pd.DataFrame(data=columns, index=index)
    frame.to_csv('IMDB.csv')
    return frame

### 14. Main Function

In [82]:

# Main Cell Executor
# Module-level result lists, one per scraped field. main() appends one entry
# to each list per movie, and DataFrame_Printer_and_Exporter() reads them to
# build the DataFrame (with `index` as the row index).
index=[]
name=[]
year=[]
genre=[]
IMDB_rating=[]
metascore=[]
summary=[]
directors=[]
stars=[]
votes=[]
grosses=[]

def main(max_pages=5):
    """Scrape up to ``max_pages`` result pages and export them as a DataFrame.

    Prompts for the delay between page requests, then repeatedly scrapes the
    current page and follows its 'next' link. Every movie's fields are
    appended to the module-level result lists. Scraping stops when there is
    no next page or when ``max_pages`` pages have been scraped; no extra page
    is downloaded after the last one that is scraped.

    Args:
        max_pages: Maximum number of result pages to scrape (default 5).
            The first page is fetched even when this is 0 or negative,
            but nothing is scraped from it.

    Returns:
        The DataFrame produced by DataFrame_Printer_and_Exporter(), which
        also writes the CSV file.
    """
    default_wait = 25
    answer = input('Enter number of seconds to wait before fetching the next page (default: 25): ').strip()
    try:
        wait = int(answer) if answer else default_wait
        # time.sleep() rejects negative values, so treat them as invalid too.
        if wait < 0:
            raise ValueError(answer)
    except ValueError:
        print('Invalid number of seconds, using the default of', default_wait)
        wait = default_wait

    print('Fetching page')
    soup = url_fetcher()
    for page in range(1, max_pages + 1):
        print('Scraping.....')
        for movie in soup.find_all(class_='lister-item-content'):
            # Extract every field with its dedicated function and append it
            # to the matching result list.
            index.append(index_extract(movie))
            name.append(name_extract(movie))
            year.append(year_extract(movie))
            genre.append(genre_extract(movie))
            IMDB_rating.append(imdb_extract(movie))
            metascore.append(metascore_extract(movie))
            summary.append(summary_extract(movie))
            director_and_star = director_and_star_extract(movie)
            directors.append(director_and_star[0])
            stars.append(director_and_star[1])
            votes.append(votes_extract(movie))
            grosses.append(gross_extract(movie))
        next_link = next_url_fetcher(soup)
        # Stop before sleeping/fetching when nothing more will be scraped.
        if next_link is None or page == max_pages:
            print('Scraping completed..Exiting')
            break
        print('Scraping completed... sleeping for ', wait, ' seconds')
        time.sleep(wait)
        print('Fetching page')
        soup = url_fetcher(next_link)
    return DataFrame_Printer_and_Exporter()

In [83]:
# Call main(): it returns the DataFrame to be displayed below and also exports a CSV file.
df=main()

Enter number of seconds to wait before fetching the next page (default: 25): 25
Fetching page
Fetching page
Scraping.....
Scraping completed... sleeping for  25  seconds
Fetching page
Scraping.....
Scraping completed... sleeping for  25  seconds
Fetching page
Scraping.....
Scraping completed... sleeping for  25  seconds
Fetching page
Scraping.....
Scraping completed..Exiting
