# Get box office data from Box Office Mojo

In [4]:
# Imports
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

## _Fetch the weekend box office chart for the desired week_

In [5]:
def get_site(week, year, timeout=30):
    """
    Get the page source of the BOM weekend chart for the given week and year.

    Parameters
    ----------
    week : int or str
        Weekend number, substituted into the ``wknd`` query parameter.
    year : int or str
        Year, substituted into the ``yr`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without it a stalled connection would block forever.

    Returns
    -------
    bytes
        The raw response body (``Response.content``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never handed to the parser as if it were chart data.
    """
    url = 'http://www.boxofficemojo.com/weekend/chart/?yr={}&wknd={}&p=.htm'.format(year, week)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


## _Parse the source code and clean data_

In [6]:
def parse_source(page_source, max_rows=105):
    """
    Extract the chart table from the page source into a DataFrame of strings.

    The first ``<table cellspacing="1">`` in the document is used. Its first
    ``<tr>`` is skipped (treated as the header row) and the text of each
    following row's ``<td>`` cells is mapped positionally onto the output
    columns.

    Parameters
    ----------
    page_source : str or bytes
        HTML of the chart page.
    max_rows : int or None, optional
        Maximum number of rows to read after the header row. ``None`` reads
        every row. The default of 105 matches the previously hard-coded slice.

    Returns
    -------
    pandas.DataFrame
        One row per chart entry, columns in the order of ``output_columns``.
        All values are the unconverted cell text.

    Raises
    ------
    IndexError
        If the page contains no ``<table cellspacing="1">``.
    """
    page_soup = bs(page_source, "lxml")
    tables = page_soup.find_all('table', attrs={'cellspacing': '1'})
    if not tables:
        raise IndexError("no <table cellspacing='1'> found in page source")
    table = tables[0]

    output_columns = ['TW', 'LW', 'Title', 'Studio', 'Weekend_Gross/$', '%_Change',
                      'Theater_Count', 'Theater_Change', 'Average/$', 'Total_Gross/$',
                      'Budget', 'num_week']
    n_columns = len(output_columns)

    # Index 0 is the header row, so the data rows start at index 1.
    stop = None if max_rows is None else 1 + max_rows

    records = []
    for row in table.find_all('tr')[1:stop]:
        cells = row.find_all('td')
        # A row with fewer cells than columns cannot be aligned to the column
        # list; appending it would leave the columns with unequal lengths.
        if len(cells) < n_columns:
            continue
        records.append([cell.text for cell in cells[:n_columns]])

    return pd.DataFrame(records, columns=output_columns)

def df_format(main_df):
    """
    Convert the scraped text columns to numeric dtypes and index the table by 'TW'.

    The input frame is not modified; a new DataFrame is returned.

    Conversions performed:

    * 'Weekend_Gross/$', 'Average/$', 'Total_Gross/$': ',' and '$' removed, cast to int64.
    * '%_Change': '%' and ',' removed, a bare '-' becomes 0, cast to float64.
    * 'Theater_Count': ',' removed, cast to int64.
    * 'Theater_Change': ',' removed, a bare '-' becomes 0, cast to int64.
    * 'num_week', 'TW': cast to int64.
    * 'Budget' is dropped and 'TW' becomes the index.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Frame of string columns as produced by ``parse_source``.

    Returns
    -------
    pandas.DataFrame
        The converted frame indexed by 'TW'. Columns not listed above
        (e.g. 'LW', 'Title', 'Studio') are passed through unchanged.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a value cannot be parsed as a number after cleaning.
    """
    def _strip_chars(series, chars):
        # regex=False: '$' must be removed as a literal character, not
        # interpreted as the end-of-string anchor.
        for char in chars:
            series = series.str.replace(char, '', regex=False)
        return series

    # Build the converted columns separately and attach them with assign().
    # Writing through `df.loc[:, col] = ...` keeps the existing object dtype
    # on recent pandas versions, so the numeric cast would be lost.
    converted = {}

    for column in ('Weekend_Gross/$', 'Average/$', 'Total_Gross/$'):
        converted[column] = _strip_chars(main_df[column], ',$').astype('int64')

    # Series.replace matches whole values only, so '-58.8' is untouched while
    # a lone '-' placeholder is turned into zero.
    converted['%_Change'] = (
        _strip_chars(main_df['%_Change'], '%,').replace('-', '0').astype('float64')
    )
    converted['Theater_Count'] = _strip_chars(main_df['Theater_Count'], ',').astype('int64')
    converted['Theater_Change'] = (
        _strip_chars(main_df['Theater_Change'], ',').replace('-', '0').astype('int64')
    )

    for column in ('num_week', 'TW'):
        converted[column] = main_df[column].astype('int64')

    return main_df.drop(columns='Budget').assign(**converted).set_index('TW')


def get_sum(df):
    """
    Print the summed weekend gross, then the ten largest theater counts.

    Reads the 'Weekend_Gross/$' and 'Theater_Count' columns of ``df``;
    nothing is returned.
    """
    weekend_total = df['Weekend_Gross/$'].sum()
    print('total gross box office: ', weekend_total, '$')

    # Sort descending and keep the first ten entries.
    ranked = df['Theater_Count'].sort_values(ascending=False)
    print(ranked.iloc[:10])

### _Get the table_

In [10]:
# Download the chart page for week 26 of 2018, then parse and type-convert it.
new = get_site(26, 2018)
main_df = df_format(parse_source(new))
main_df

Unnamed: 0_level_0,LW,Title,Studio,Weekend_Gross/$,%_Change,Theater_Count,Theater_Change,Average/$,Total_Gross/$,num_week
TW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,Jurassic World: Fallen Kingdom,Uni.,60912195,-58.8,4485,10,13581,265699530,2
2,2,Incredibles 2,BV,46417761,-42.2,4410,0,10526,440601275,3
3,N,Sicario: Day of the Soldado,Sony,19007566,0.0,3055,0,6222,19007566,1
4,N,Uncle Drew,LG/S,15242781,0.0,2742,0,5559,15242781,1
5,3,Ocean's 8,WB,8332661,-27.8,3426,-230,2432,115004842,4
6,4,Tag,WB (NL),5880731,-28.7,3176,-206,1852,41133539,3
7,5,Deadpool 2,Fox,3574451,-32.2,2094,-326,1707,310474309,7
8,N,Sanju,FIP,2723349,0.0,356,0,7650,2723658,1
9,6,Solo: A Star Wars Story,BV,2687670,-40.4,1654,-684,1625,207673785,6
10,10,Won't You Be My Neighbor?,Focus,2421975,33.0,654,306,3703,7619057,4


In [12]:
# Print the total weekend gross and the ten largest theater counts
get_sum(main_df)

total gross box office:  179505719 $
TW
1     4485
2     4410
5     3426
6     3176
3     3055
4     2742
7     2094
9     1654
11    1424
13    1157
Name: Theater_Count, dtype: int64
