In [167]:
! pip install BeautifulSoup4



In [19]:
## Import Statement
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import bs4
import re
import logging

# Global variables
max_movie_limit = 155
base_url = "https://www.imdb.com/"

In [20]:
## To Generate Log Files:
from datetime import datetime
# Log destination, relative to the current working directory.
LOG_FILENAME = "imdb_logfile.log"

# basicConfig() does nothing when the root logger already has handlers (for
# example when this cell is re-run), so detach every existing handler first.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)

logging.basicConfig(
    filename=LOG_FILENAME,
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.info('App is Starting!!...')
logging.debug('Testing Started...')

In [21]:
## URL of the IMDb "Top Rated Movies" chart to be scraped
myUrl = 'https://www.imdb.com/chart/top'

In [22]:
# Download the chart page. The context manager closes the connection even
# when read() raises, which the explicit close() call could not guarantee.
with uReq(myUrl) as uClient:
    page_html = uClient.read()

In [23]:
# Parse the downloaded chart HTML with Python's built-in "html.parser".
# NOTE: a later cell defines a function that is also called page_soup, which
# rebinds this name. In the cells shown here, everything that needs this parsed
# document (page_table and the row lists) is extracted before that happens.
page_soup = soup(page_html,"html.parser")

In [24]:
page_soup.h1

<h1 class="header">Top Rated Movies</h1>

In [25]:
# Step 1: Collect every <table> element on the page (the first one is used as the movie chart below)
page_table = page_soup.findAll("table")

In [26]:
# All <tr> rows of the first table on the page
page_row = page_table[0].findAll("tr")

In [28]:
# Example: read the movie title of a single row (index 2) from the anchor inside its "titleColumn" cell
page_row[2].find('td',attrs={'class':'titleColumn'}).find('a').text

'The Godfather'

In [29]:
# Rows of the first table on the page; _scrap_homeUrl() walks over these.
page_rows = page_table[0].findAll("tr")
# Collects one entry per scraped movie; filled in by _scrap_homeUrl().
_movie_list = []

## Scraping the site happens in 2 parts:
    # 1. Scrape the chart (home) page, which holds the Movie Title, Year of Release and Rating
    # 2. Scrape each movie's own page to obtain the Director, Genre, Box Office Collection and Story Line
    
# 1. Scrapping the home page
def _scrap_homeUrl():
    """Scrape title, year and rating from the chart rows, then each movie's own page.

    Reads the module-level ``page_rows`` and examines at most the first
    ``max_movie_limit`` rows. Rows without a ``titleColumn`` cell (presumably
    the header row) are skipped, so such a row inside that window reduces the
    number of movies scraped. For every movie row a dict keyed by
    ``Movie_Title<n>``, ``Year_Released<n>`` and ``Rating<n>`` is built, where
    ``n`` is the 1-based running movie count. The dict is passed to
    ``_scrap_movieDetails`` and the result is appended to the module-level
    ``_movie_list``.

    Returns:
        The module-level ``_movie_list`` on success, or ``None`` when an
        unexpected error interrupts the scrape. Movies gathered before the
        error stay in ``_movie_list``.
    """
    try:
        _scrap_count = 0
        # Slicing never raises IndexError, unlike indexing range(max_movie_limit)
        # into a table that has fewer rows than the limit. The slice also caps
        # the number of movies at max_movie_limit.
        for _row in page_rows[:max_movie_limit]:
            # Look the title cell up once and reuse it for link, title and year.
            _title_cell = _row.find('td', attrs={'class': 'titleColumn'})
            if _title_cell is None:
                continue

            _link = _title_cell.find('a')
            title = _link.text
            logging.info("Movie being scrapped::")
            logging.info(title)
            year = _title_cell.find('span').text.replace("(", "").replace(")", "")
            rating = _row.find('td', attrs={'class': 'ratingColumn'}).find('strong').text

            _scrap_count += 1
            _movie_details_dict = {
                'Movie_Title' + str(_scrap_count): title,
                'Year_Released' + str(_scrap_count): year,
                'Rating' + str(_scrap_count): rating,
            }

            # 2. Automating the flow to scrape each movie's internal data
            _data_bunch = _scrap_movieDetails(_link, _scrap_count, _movie_details_dict)
            if _data_bunch is None:
                logging.warning("No details could be scraped for movie #%s (%s)", _scrap_count, title)
            logging.info(_data_bunch)
            # Appended even when None: the CSV export derives each dict key from
            # the entry's position in the list, so positions must stay aligned
            # with the numeric key suffix.
            _movie_list.append(_data_bunch)
        return _movie_list
    except Exception:
        # Keep the traceback in the log file instead of discarding it.
        logging.exception("Scraping the chart page failed")
        print('Oops! Problem Occured please try again Later!')
        return None

def _scrap_movieDetails(links, count, _movie_details_dict):
    """Add director, story line, genre and box-office gross from a movie's own page.

    Args:
        links: The chart row's anchor element; its ``href`` is the movie page path.
        count: 1-based running number of the movie; used as the suffix of every
            key written to the dict.
        _movie_details_dict: Dict already holding the chart-page fields. It is
            updated in place.

    Returns:
        The same dict with ``Director<count>``, ``StoryLine<count>``,
        ``Genre<count>`` and ``BoxOfficeGross<count>`` added, or ``None`` when
        ``count`` exceeds ``max_movie_limit`` or the page could not be fetched
        or parsed (the dict may then be only partially filled).
    """
    # Use the shared limit instead of a second hard-coded 155 that could drift from it.
    if count > max_movie_limit:
        return None

    try:
        _moviehref_link = links['href']
        movie_page_soup = page_soup(_moviehref_link)
        if movie_page_soup is None:
            # page_soup() returns None on failure; fail with a clear message
            # rather than an AttributeError on the first .find() call.
            raise ValueError("movie page could not be loaded: %s" % _moviehref_link)

        ## Scrap director data
        movie_director = movie_page_soup.find('div', attrs={'class': 'credit_summary_item'}).find('a').text
        _movie_details_dict['Director' + str(count)] = movie_director

        ## Scrap story line (the section is looked up once and reused for the genre links)
        _storyline_div = movie_page_soup.find('div', attrs={'id': 'titleStoryLine'})
        movie_storyline = _storyline_div.find('p').find('span').text.strip()
        _movie_details_dict['StoryLine' + str(count)] = movie_storyline

        ## Scrap genre data
        movie_genre_dict = _scrap_movie_genre(count, _storyline_div.findAll('a'))
        _movie_details_dict['Genre' + str(count)] = movie_genre_dict.get('genre' + str(count))

        ## Scrap box office
        movie_boxofc_div_list = movie_page_soup.find('div', attrs={'id': 'titleDetails'}).findAll('div', attrs={'class': 'txt-block'})
        _gross_values = _scrap_boxoffice_gross(movie_boxofc_div_list, count)
        # An empty result falls back to '$0' instead of raising IndexError on [0].
        _movie_details_dict['BoxOfficeGross' + str(count)] = _gross_values[0].strip() if _gross_values else '$0'
        return _movie_details_dict
    except Exception:
        # The printed message points at the logs, so make sure the trace is there.
        logging.exception("Failed to scrape details for movie #%s", count)
        print('Oops! Problem occured scrapping the request. Check for Logs!')
        return None
                        
def page_soup(_href):
    """Download a movie page and return it as a parsed BeautifulSoup document.

    Note: defining this function rebinds the module-level name ``page_soup``,
    which an earlier cell assigned to the parsed chart page.

    Args:
        _href: Path of the movie page, as found in the ``href`` of a chart anchor.

    Returns:
        The parsed page, or ``None`` when the download or parsing fails.
    """
    # Local import: urljoin is not among the notebook's top-level imports.
    from urllib.parse import urljoin

    try:
        # urljoin avoids the doubled slash that plain concatenation produces
        # when base_url ends with "/" and the href starts with "/".
        _movie_base_url = urljoin(base_url, _href)
        # The context manager closes the connection even if read() raises.
        with uReq(_movie_base_url) as uClient:
            movie_link_html = uClient.read()
        return soup(movie_link_html, "html.parser")
    except Exception:
        logging.exception("Could not load movie page for href %r", _href)
        print("Oops! URL mismatched. Please check the URL!")
        return None
        
def _scrap_movie_genre(count,_div_details):
    # To scrap the movie genre; Sometime a movie can have muktiple genre as well
    genre_list = []
    for _div_link in _div_details:
        if('genres'in _div_link['href']):
                 genre_list.append(_div_link.text)
    _movie_dict={}
    for _each_genre in genre_list:
        if(_movie_dict.get('genre'+str(count)) != None):
                _movie_dict['genre'+str(count)] = _movie_dict.get('genre'+str(count))+"|"+_each_genre
        else:
            _movie_dict['genre'+str(count)]=_each_genre
    return _movie_dict


def _scrap_boxoffice_gross(movie_boxofc_div_list,count):
    # To scrap the box_office gross; Old Movies do not hold gross amount; In that case we assume total gross as $0
    final_gross = []
    for each_div in movie_boxofc_div_list:
                    if(each_div.find('h4')!=None and each_div.find('h4').text == 'Cumulative Worldwide Gross:'):
                               amount = each_div.text.strip()
                               final_gross = re.findall(r'\s[$]\w+.*',amount)
                    else:
                        final_gross.append('$0')
    return final_gross

def main():
    """Entry point: run the scrape that fills the module-level ``_movie_list``.

    Any error that reaches this level is reported on stdout. The traceback is
    also written to the log, because the printed messages tell the user to
    look there.
    """
    logging.info('Starting execution..')
    try:
        _scrap_homeUrl()
    except NameError:
        logging.exception("Undefined name while scraping")
        print("There's a problem with variable. Please check the log files for trace!!")
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit still stop the run.
        logging.exception("Scraping failed")
        print("There's a problem scrapping the URL. Please check the log files for trace!!")


if __name__ == "__main__":
    main()

In [30]:
_movie_list

[{'Movie_Title1': 'The Shawshank Redemption',
  'Year_Released1': '1994',
  'Rating1': '9.2',
  'Director1': 'Frank Darabont',
  'StoryLine1': "Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red.",
  'Genre1': ' Drama',
  'BoxOfficeGross1': '$28,815,291'},
 {'Movie_Title2': 'The Godfather',
  'Year_Released2': '1972',
  'Rating2': '9.1',
  'Director2': 'Francis Ford Coppola',
  'StoryLine2': 'The Godfather "Don" Vito Corleone is the head of the Corleone mafia family in New York. He is at the event of his daughter\'s wedding. Michael, Vito\'s youngest son and a decorated WW II Marine is also present at the wedding. Michael seems to be uninterested in being a part of the family business. Vito is

In [32]:
## Munging the Scrapped Data into a CSV, with Categories of Genre

import csv

# Column headers of the output file, in the order the values are written.
_csv_header = ('Ranking','Movie Title','Movie Release Year','IMDB Rating','Director','Genre','Box Office','Story Line')
# Dict-key prefixes in column order; each key is the prefix followed by the
# movie's 1-based position in _movie_list.
_field_prefixes = ('Movie_Title','Year_Released','Rating','Director','Genre','BoxOfficeGross','StoryLine')
count = 0

try:
    # Opening inside the try reports a failed open like any other write problem.
    # `with` closes the file on every path, and an explicit utf-8 encoding keeps
    # non-ASCII titles and story lines writable regardless of the platform default.
    with open("./imdb_scrapped_review_data.csv", 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        logging.info("Writing to the file")
        writer.writerow(_csv_header)
        for _each_movie in _movie_list:
            # Advance the rank first so skipped entries keep later keys aligned.
            count += 1
            if _each_movie is None:
                # The scraper stores None for a movie whose details failed to
                # load; skip that row instead of aborting the whole export.
                logging.warning("No data for movie #%s; row skipped", count)
                continue
            logging.info("munging into csv:")
            logging.info(_each_movie)
            _ranking = count
            _fields = tuple(_each_movie.get(_prefix + str(count)) for _prefix in _field_prefixes)
            writer.writerow((_ranking,) + _fields)
    # Logged once after the loop rather than after every row.
    logging.info("file Writing has stoppped! Please check the data!!")

except Exception:
    logging.exception("Writing the CSV file failed")
    print("There's a problem writing to the file.. Please check the permissions!!")