# Task: Extract all relevant information about the movies from the webpage:https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating%27

In [1]:
# https://help.imdb.com/article/contribution/titles/certificates/GU757M8ZJ9ZPXB39?ref_=helpart_nav_27#

In [2]:
import json
import pandas as pd #pip install pandas
from datetime import date
from datetime import datetime
from bs4 import BeautifulSoup #pip install BeautifulSoup4
from tqdm import tqdm_notebook as tqdm #pip install tqdm
from urllib.request import urlopen, Request
from pprint import pprint

In [3]:
webpage='https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating%27'

In [4]:
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

In [5]:
def extract_soup_object(link):
    request=Request(link, headers=hdr)
    html = urlopen(request).read().decode()
    soup = BeautifulSoup(html,'html.parser')
    return soup

In [6]:
def extract_links(page):
    soup=extract_soup_object(page)
    data=soup.find_all('h3',{'class':"lister-item-header"})
    return ['https://www.imdb.com'+item.a.get('href') for item in data]

In [7]:
movie_links=extract_links(webpage)

## Step 2: Making Functions of all the attributes

In [8]:
# 1. Extract Scraping Time
def get_scraping_date():
    now = datetime.now()
    date=now.strftime("%d-%m-%Y %H:%M:%S")
    return date

In [9]:
# 2. Movie name
def movie_name(soup):
    return soup.find('div',{'class':'title_wrapper'}).find('h1').text.split('\xa0')[0]

In [10]:
# 3-7. 
def other_attributes_part1(soup):
    temp=soup.find('div',{'class':'title_wrapper'}).find('div',{'class':'subtext'}).text.replace(' ','').split('|')
    certification=temp[0].split('\n')[1]
    time=temp[1].split('\n')[1]
    genre=temp[2].replace('\n','')
    release_date=temp[3].split('\n')[1].split('(')[0]
    release_country=temp[3].split('\n')[1].split('(')[1].split(')')[0]
    return certification, time, genre, release_date, release_country

In [11]:
# 8 Rating
def movie_rating(soup):
    return soup.find('div',{'class':'ratingValue'}).text.replace('\n','')

In [12]:
# 9,10
def user_critic(soup):
    q=soup.find('div',{'class':'imdbRating'}).find('div',{'class':'hiddenImportant'}).find_all('span')
    temp=[item.text for item in q]
    user,critic=temp[0].split(' ')[0],temp[1].split(' ')[0]
    return user, critic

In [13]:
# 11. Summary
def summary(soup):
    return soup.find('div',{'class':'plot_summary'}).find('div',{'class':'summary_text'}).text.replace('\n','').replace('  ','')

In [14]:
#12-14
def writer_director_stars(soup):
    t=soup.find_all('div',{'class':'credit_summary_item'})
    temp=[item.text for item in t]
    director=temp[0].split(':')[1].replace('\n','')
    writer=temp[1].split(':')[1].replace('\n','')
    primary_actor=temp[2].split(':')[1].replace('\n','').split('|')[0]
    return director, writer, primary_actor
    

In [15]:
#15 meta score
def meta_score(soup):
    return soup.find('div',{'class':'metacriticScore score_favorable titleReviewBarSubItem'}).text.replace('\n','')

In [16]:
# 16 primary image
def primary_image(soup):
    return soup.find('div',{'class':'poster'}).find('img')['src']

In [17]:
# 17 primary video
def primary_video(soup):
    return 'https://www.imdb.com'+soup.find('div',{'class':'videoPreview__videoContainer'}).find('a')['href']

In [18]:
# 18,19 other images, video
def other_image_video(soup):
    temp=soup.find_all('div',{'class':'combined-see-more see-more'})
    temp_1=[item.find('a')['href'] for item in temp]
    image=temp_1[0]
    video=temp_1[1]
    return image, video

In [19]:
# 20 all actors
def all_actors(soup, value):
    temp=soup.find_all('tr', {'class':value})
    return [item.text for item in temp]

In [20]:
def beautify(temp_list):
    actor=[]
    character=[]
    for item in temp_list:
        actor.append(item.split('...')[0])
        character.append(item.split('...')[1])
    actors=[item.replace('\n','').replace(' ','') for item in actor]
    characters=[item.replace('\n','').replace(' ','') for item in character]
    return actors, characters

In [21]:
#21. plot
def plot(soup):
    return soup.find('div',{'class':'inline canwrap'}).find('span').text.replace('  ','')

In [22]:
## 22. 
def plot_keywords(soup):
    keywords=[]
    temp=soup.find('div',{'class':'see-more inline canwrap'}).text.replace('\n','')
    keywords.append(temp.split('|')[0].split(':')[1])
    keywords.extend(temp.split('|')[1:-1])
    return keywords

In [23]:
## 23-31
def other_attributes_part2(soup):
    h=soup.find('div',{'id':'titleDetails'}).find_all('div',{'class':'txt-block'})
    details=[]
    for item in h:
        try:
            details.append(item.text.split(':')[1])
        except:
            continue
    languages=details[2].replace('\n','').split('|')
    filming_location=details[5].replace('\n','').replace('\xa0»','')
    budget=details[6].replace('\n','').split('(')[0]
    opening_weekend=details[7].replace('\n','')
    gross_amount=details[8].replace('  ','')
    cumilative_gross=details[9].replace('  ','')
    production_company=details[10].replace('\n','').replace('\xa0»','')
    sound_mix=details[12].replace('\n','').split('|')
    aspect_ratio=details[-1]
    return languages, filming_location, budget, opening_weekend, gross_amount, cumilative_gross, production_company, sound_mix,aspect_ratio


In [24]:
def imdb_scrapper(link):
    data={}
    data['website']='IMDB'
    data['link']=link

    scrape_date=get_scraping_date()
    data['scrape_date']=scrape_date
    
    name=link.split('/')[4]
    data['unique_id']=name
    
    soup=extract_soup_object(link)
    
    try:
        data['movie_name']=movie_name(soup)
    except:
        data['movie_name']=None
        
    try:
        certification, time, genre, release_date, release_country=other_attributes_part1(soup)
        data['certification']=certification
        data['time']=time
        data['genre']=genre
        data['release_date']=release_date
        data['release_country']=release_country
    except:
        data['certification']=None
        data['time']=None
        data['genre']=None
        data['release_date']=None
        data['release_country']=None
        
    try:
        data['movie_rating']=movie_rating(soup)
    except:
        data['movie_rating']=None
        
    try:
        user, critic=user_critic(soup)
        data['user']=user
        data['critic']=critic
    except:
        data['user']=None
        data['critic']=None

        
    try:
        data['summary']=summary(soup)
    except:
        data['summary']=None
        
    try:
        director, writer, primary_actor=writer_director_stars(soup)
        data['director']=director
        data['writer']=writer
        data['primary_actor']=primary_actor
    except:
        data['director']=None
        data['writer']=None
        data['primary_actor']=None
        
    try:
        data['meta_score']=meta_score(soup)
    except:
        data['meta_score']=None
        
    try:
        data['primary_image']=primary_image(soup)
    except:
        data['primary_image']=None
        
    try:
        data['primary_video']=primary_video(soup)
    except:
        data['primary_video']=None
        
    try:
        image_links,video_links=other_image_video(soup)
        data['other_image_links']=image_links
        data['other_video_links']=video_links
    except:
        data['other_image_links']=None
        data['other_video_links']=None
        
    try:
        odd_actors=all_actors(soup, 'odd')
        even_actors=all_actors(soup, 'even')
        actors=odd_actors+even_actors
        actors, characters=beautify(actors)
        data['all_actors']=actors
        data['all_characters']=characters
    except:
        data['all_actors']=None
        data['all_characters']=None
        
    try:
        data['plot']=plot(soup)
    except:
        data['plot']=None
        
    try:
        data['plot_keywords']=plot_keywords(soup)
    except:
        data['plot_keywords']=None
        
    try:
        languages, filming_location, budget, opening_weekend, gross_amount, cumilative_gross, production_company, sound_mix,aspect_ratio=other_attributes_part2(soup)
        data['languages']=languages
        data['filming_location']=filming_location
        data['budget']=budget
        data['opening_weekend']=opening_weekend
        data['gross_amount']=gross_amount
        data['cumilative_gross']=cumilative_gross
        data['production_company']=production_company
        data['sound_mix']=sound_mix
        data['aspect_ratio']=aspect_ratio
    except:
        data['languages']=None
        data['filming_location']=None
        data['budget']=None
        data['opening_weekend']=None
        data['gross_amount']=None
        data['cumilative_gross']=None
        data['production_company']=None
        data['sound_mix']=None
        data['aspect_ratio']=None
        
    return data

In [25]:
imdb_scrapper('https://www.imdb.com/title/tt8946378/?ref_=adv_li_tt')

{'website': 'IMDB',
 'link': 'https://www.imdb.com/title/tt8946378/?ref_=adv_li_tt',
 'scrape_date': '19-06-2020 20:20:40',
 'unique_id': 'tt8946378',
 'movie_name': 'Knives Out',
 'certification': 'PG-13',
 'time': '2h10min',
 'genre': 'Comedy,Crime,Drama',
 'release_date': '27November2019',
 'release_country': 'USA',
 'movie_rating': '7.9/10 ',
 'user': '2,058',
 'critic': '437',
 'summary': 'A detective investigates the death of a patriarch of an eccentric, combative family.',
 'director': 'Rian Johnson ',
 'writer': 'Rian Johnson ',
 'primary_actor': 'Daniel Craig, Chris Evans, Ana de Armas ',
 'meta_score': '82',
 'primary_image': 'https://m.media-amazon.com/images/M/MV5BMGUwZjliMTAtNzAxZi00MWNiLWE2NzgtZGUxMGQxZjhhNDRiXkEyXkFqcGdeQXVyNjU1NzU3MzE@._V1_UX182_CR0,0,182,268_AL__QL50.jpg',
 'primary_video': None,
 'other_image_links': '/title/tt8946378/videogallery?ref_=tt_pv_vi_sm',
 'other_video_links': '/title/tt8946378/mediaindex?ref_=tt_pv_mi_sm',
 'all_actors': ['DanielCraig',
  

In [27]:
for item in tqdm(movie_links):
        data=imdb_scrapper(item)
        name=item.split('/')[4]
        with open(f'/home/shivangi/Desktop/movie_database/{name}.json', 'w',encoding='utf-8') as outfile:
            json.dump(data, outfile,indent=4, ensure_ascii=False)