### Calling Libraries

In [1]:
# packages

# beautiful soup to work with html responses
from bs4 import BeautifulSoup

# packages to get requests 
import requests
import re

# data wrangling packages
import pandas as pd 
import numpy as np

# folder navigation package
import os

# package to create interactive dashboard
from bokeh.models import ColumnDataSource, CustomJS, Range1d, Select
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import column

# import packages for testing 
import unittest

### Scraping

In [2]:
# function to scrape the data 

def scrape_imdb_top_250(url, dest, count):
    url = url # define url for top 250 
    response = requests.get(url) # define response 
    top_soup = BeautifulSoup(response.text, 'html.parser') # parse the response 


    movies = top_soup.select('td.titleColumn') # get the title of a film 
    ratings = [b.attrs.get('data-value')  
            for b in top_soup.select('td.posterColumn span[name=ir]')] # get the ratings
    links = [a.attrs.get('href') for a in top_soup.select('td.titleColumn a')] # get the likns for futher sscraping

    # list for storing data 
    top_list = []
    # convert each movie title to string 
    for index in range(0, len(movies)):
        movie_string = movies[index].get_text() 
        movie = (' '.join(movie_string.split()).replace('.', ''))
        movie_title = movie[len(str(index))+1:-7]

        # capture the scaraped items     
        data = {'movie_title': movie_title,
               'rating': ratings[index],
               "link": links[index]}

        # add data to list 
        top_list.append(data)

    # convert to df 
    df1 = pd.DataFrame(top_list)

    # get the top 20 films 
    df1 = df1.head(count)

    # convert reviews to numeric 
    df1['rating'] = pd.to_numeric(df1['rating'])


    ### get the number of oscars 

    # creat list for scraped data 
    oscar_list = []
    # create loop to scrape additional data from individual title pages
    for i in df1['link']:
        url = 'http://www.imdb.com' + i # create the new links to access the page of each title
        response = requests.get(url) # define response 
        oscar_soup = BeautifulSoup(response.text, 'html.parser') # parse the response
    
    # get the oscars text 
        oscar = oscar_soup.find_all('a', class_ = 
                  'ipc-metadata-list-item__label ipc-metadata-list-item__label--link')[2].text
    # get the alternative oscars text     
        oscar_alt = oscar_soup.find_all('a', class_ = 
                  'ipc-metadata-list-item__label ipc-metadata-list-item__label--link')[4].text
# NOTE that for some of the titles, the oscar count comes from a different position. To account for that, we scrape both positions and then join the results after extracting numerical data
        # capture the scaraped items  
        data = {'oscars': oscar,
               'oscars_alt': oscar_alt} 
        # add data to list  
        oscar_list.append(data)

    # convert to dataframe 
    df2 = pd.DataFrame(oscar_list)

    # formatting 
    df2['oscars'] = df2['oscars'].str.replace("[^0-9]+", "") # keeping only numeric data 
    df2['oscars_alt'] = df2['oscars_alt'].str.replace("[^0-9]+", "") # keeping only numeric data
    df2['oscars'] = pd.to_numeric(df2['oscars']) # converting to numeric type
    df2['oscars_alt'] = pd.to_numeric(df2['oscars_alt'])# converting to numeric type
    df2['oscars'] = df2['oscars'].fillna(df2['oscars_alt']) # filling in the missing values with values from the alternative position 
    df2['oscars'] = df2['oscars'].fillna(0) # filling in remaining missing values with 0 
    df2 = df2.drop(['oscars_alt'], axis=1) # dropping the redundant columns



    ### get the review count

    reviews_list = []
    # create loop to scrape additional data from rating pages of each title
    for i in df1['link']:
        url = 'http://www.imdb.com' + i + 'ratings' # accessing the ratings page for each title, by adjusting the url 
        response = requests.get(url) # define response 
        review_soup = BeautifulSoup(response.text, 'html.parser') # parse the response
    # get the oscars text 
        rating = review_soup.find_all('div', class_ = 'smallcell')[0].text

    # create list data instance 
        data = {'ratings': rating}

    # append the list     
        reviews_list.append(data)

    # convert to dataframe 
    df3 = pd.DataFrame(reviews_list)

    # clean the reviews 
    df3['ratings'] = df3['ratings'].str.replace("[^0-9]+", "")

    # format as numeric 
    df3['ratings'] = pd.to_numeric(df3['ratings'])

    # merge data together 
    imdb_df = pd.concat([df1, df2, df3], axis=1)

    # rename columns 
    imdb_df = imdb_df.rename(columns={'movie_title': 'movie_title',
                                          'ratings': 'num_of_ratings',
                                          'rating': 'avg_rating', 
                                          'oscars': 'num_of_oscars', 
                                          'link': 'link'})

    # write data to csv
    imdb_df.to_csv(dest + '/imdb_scrape.csv')
    
    return imdb_df

In [3]:
# scrape the data 

# define the page to scrape 
url = c
# define the folder for saving the data 
destination = os.getcwd()

df.to_csv(path)
# define the number of titles to scarape
count = 20 

# call the function 
imdb_df = scrape_imdb_top_250(url, destination, count)

  df2['oscars'] = df2['oscars'].str.replace("[^0-9]+", "")
  df2['oscars_alt'] = df2['oscars_alt'].str.replace("[^0-9]+", "")
  df3['ratings'] = df3['ratings'].str.replace("[^0-9]+", "")


### Rating Adjustment

In [4]:
# function to analyze the data 

def calc_adjusted_scores(df, dest):
    imdb_df_sorted = imdb_df.sort_values('num_of_ratings', ascending=False)
    # assign higherst review count to variable 
    max_reviews = imdb_df_sorted['num_of_ratings'][0]
    # calculate the deviation from the max count 
    imdb_df_sorted['deviation'] = (max_reviews - imdb_df_sorted['num_of_ratings']) // 100000
    # calculate the penalty value 
    imdb_df_sorted['review_penalty'] = imdb_df_sorted['deviation'] / 10 
    # get the penalized value 
    imdb_df_sorted['avg_rating_pen'] = imdb_df_sorted['avg_rating'] - imdb_df_sorted['review_penalty']

    # boosting scores based on number of oscars 

    # create list to capture boost values 
    oscar_boost = []

    # define the value of boost vased on the number of oscars 
    for i in imdb_df_sorted['num_of_oscars']:
        # define reward value based on the oscar range
        if i in range(1,3):
            oscar_value = 0.3
        elif i in range(4,6):
            oscar_value = 0.5 
        elif i in range(7,11):
            oscar_value = 1 
        else:
            oscar_value = 1.5 
     # capture boost values        
        oscar_boost.append(oscar_value)

    # add the list as df column 
    imdb_df_sorted['oscar_boost'] = oscar_boost

    # calculate the final rating 
    imdb_df_sorted['rating_dp'] = imdb_df_sorted['avg_rating_pen'] + imdb_df_sorted['oscar_boost']

    # write data to csv
    imdb_df_sorted.to_csv(dest + 'imdb_fin.csv')
    # show a compariison of imdb and adjusted scores
    print(imdb_df_sorted[['movie_title', 'avg_rating', 'rating_dp']])
    # return the enriched df 
    return imdb_df_sorted

In [5]:
# analyze the data aand assign to a df 
df = calc_adjusted_scores(imdb_df, destination)

                         movie_title  avg_rating  rating_dp
0                    A remény rabjai    9.234579  10.234579
2                      A sötét lovag    8.986993   9.286993
12                            Eredet    8.732960   8.932960
11                   Harcosok klubja    8.749423   8.549423
10                      Forrest Gump    8.767604   9.767604
7                       Ponyvaregény    8.850069   8.550069
15                            Mátrix    8.670713   8.470713
8   A Gyűrűk Ura: A gyűrű szövetsége    8.804689   8.504689
1                       A keresztapa    9.156467   9.856467
6   A Gyűrűk Ura: A király visszatér    8.922855   9.622855
13        A Gyűrűk Ura: A két torony    8.729207   8.129207
18                           Hetedik    8.604356   7.904356
5                  Schindler listája    8.936119   8.736119
14             A Birodalom visszavág    8.701520   7.701520
3                    A keresztapa II    8.984624   9.184624
16                         Nagymenők    

### Visualization of the Rating Adjustment

In [28]:
# set output to the notebook 
output_notebook() 

# create plot with categorical data
p = figure(x_range=df['movie_title'], height=350,toolbar_location=None, tools="")

# formatting the plot 
p.xgrid.grid_line_color = None # adjust grigline color 
p.y_range.start = 0 # start range from 0 
p.xaxis.major_label_orientation = 1 # adjust the text on x-axis 
# name the axis
p.xaxis.axis_label='Titles'
p.yaxis.axis_label='Measure'

# shared datasource
source = ColumnDataSource(df)

# defined the two barplots
plot_1 = p.vbar(x='movie_title', top='avg_rating', color='teal', source=source,width=0.9 )
plot_2 = p.vbar(x='movie_title', top="rating_dp", color="firebrick", source=source, width=0.9)

# initialise the plot with only y1 visible - to match the dropdown default
plot_2.visible = False

# dropdown widget + Javascript code for interactivity
select = Select(title='IMDB Top 20 Ratings:', value='IMDB Rating', options=['IMDB Rating', 'Datapao Rating'])# create the dropdown 
select.js_on_change('value', CustomJS(args=dict(bar_1=plot_1, bar_2=plot_2), code='''

bar_1.visible = true
bar_2.visible = true

if (this.value === 'IMDB Rating') {
    bar_2.visible = false 
} else {
    bar_1.visible = false
}
    
''')) # define the alternating logic
# define the layout 
layout = column(select, p)
show(layout)

### Testing the Functions

In [9]:
# establish unit tests 

# the tests for the first function compares the output of the scraping sunction to a pre-defined test df (stored on Github)
# define class for testing dfs 
class TestDataFrame(unittest.TestCase):
    
# define function to test the output of the scrape     
    def test_scrape(self):
        df_test = pd.read_csv('https://raw.githubusercontent.com/steveJ34/imdb_quest/main/Data/imdb_test.csv',
                          index_col=0) # load the test df 
        lenght_test = len(df_test) # get the lenght of the test df 
        lenght_comp = len(imdb_df) # get the length of the scraped df 
        self.assertEqual(lenght_test, lenght_comp) # compare the two   

# define function to test the output of the analysis function against a pre-defined test df (stored on Github)
# the aim of this tes t is to make sure that all the values used in the anlaysis are captured 
    def test_analysis(self):
        df_test = pd.read_csv('https://raw.githubusercontent.com/steveJ34/imdb_quest/main/Data/imdb_analysis_test.csv',
                          index_col=0) # load the test df 
        col_count_test = df_test.shape[1] # get the number of columns for test data 
        col_count_comp = df.shape[1] # get the number of columns for output df of the analysis function
        self.assertEqual(col_count_test, col_count_comp) # compare the two 
    

        
        
unittest.main(argv=[''], verbosity=2, exit=False)      

test_analysis (__main__.TestDataFrame) ... ok
test_scrape (__main__.TestDataFrame) ... ok

----------------------------------------------------------------------
Ran 2 tests in 1.621s

OK


<unittest.main.TestProgram at 0x1236588b0>