# Merging the data scraped from the-numbers and box office mojo
## The numbers has the budget data, and box office mojo has more thorough box office data

In [1]:
import pandas as pd

In [2]:
nums = pd.read_csv("../data/raw/the-numbers.csv")
bom = pd.read_csv("../data/raw/box-office-mojo.csv")

In [51]:
nums.head(10)

Unnamed: 0,release,title,budget,domestic,worldwide,year_released
0,2009-12-18,Avatar,425000000,760507625,2776345279,2009
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011
2,2019-04-26,Avengers: Endgame,400000000,848768907,2772749754,2019
3,2019-06-07,Dark Phoenix,350000000,64782713,245510828,2019
4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015
5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017
6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,2015
7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,2018
8,2007-05-24,Pirates of the Caribbean: At World's End,300000000,309420425,963420425,2007
9,2017-11-17,Justice League,300000000,229024295,655945209,2017


In [4]:
nums[nums['title'].str.contains('â\x80\x99', na=False)].head()

Unnamed: 0,release,title,budget,domestic,worldwide
8,2007-05-24,Pirates of the Caribbean: At Worldâs End,300000000,309420425,963420425
28,2006-07-07,Pirates of the Caribbean: Dead Manâs Chest,225000000,423315812,1066215812
72,2009-11-06,Disneyâs A Christmas Carol,190000000,137855863,315709697
263,2001-11-16,Harry Potter and the Sorcererâs Stone,125000000,317871467,975047606
280,2016-12-21,Assassinâs Creed,125000000,54647948,240759682


In [5]:
#replacing errors with correct apostrophes
nums['title'] = nums['title'].str.replace('â\x80\x99', '\'')

In [6]:
bom[bom['movie'].str.contains('\'', na=False)].head()

Unnamed: 0.1,Unnamed: 0,rank,movie,year_released,american_box_office,international_box_office,total_box_office
173,751,174,Madagascar 3: Europe's Most Wanted,2012,216391482.0,530529789.0,746921271.0
180,821,181,Doctor Seuss' The Lorax,2012,214030500.0,136946253.0,350976753.0
241,432,242,Ocean's Eleven,2001,183417150.0,267311379.0,450728529.0
267,692,268,There's Something About Mary,1998,176484651.0,193400000.0,369884651.0
318,203,319,A Bug's Life,1998,162798565.0,200296754.0,363095319.0


In [39]:
nums['release'] = pd.to_datetime(nums['release'])

In [43]:
nums['year_released'] = nums['release'].map(lambda x: x.year)

In [45]:
nums.head()

Unnamed: 0,release,title,budget,domestic,worldwide,year_released
0,2009-12-18,Avatar,425000000,760507625,2776345279,2009
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011
2,2019-04-26,Avengers: Endgame,400000000,848768907,2772749754,2019
3,2019-06-07,Dark Phoenix,350000000,64782713,245510828,2019
4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015


In [13]:
len(bom['movie'].unique())

14638

In [14]:
len(bom)

15000

In [35]:
bom[bom.duplicated(subset=['movie', 'year_released'], keep=False)].sort_values('movie')

Unnamed: 0.1,Unnamed: 0,rank,movie,year_released,american_box_office,international_box_office,total_box_office
14223,25142,14224,Home,2009,15922.0,,15922.0
14254,56142,14255,Home,2009,15433.0,44777735.0,44793168.0


In [54]:
bom.columns

Index(['Unnamed: 0', 'rank', 'title', 'year_released', 'american_box_office',
       'international_box_office', 'total_box_office'],
      dtype='object')

In [None]:
bom.drop([['Unnamed: 0', 'rank'], axis=1)

## Merging... let's see how many rows we still have

In [46]:
bom.rename(columns={'movie':'title'}, inplace=True)

In [47]:
bom.head()

Unnamed: 0.1,Unnamed: 0,rank,title,year_released,american_box_office,international_box_office,total_box_office
0,2,1,Star Wars Ep. VII: The Force Awakens,2015,936662225.0,1116649000.0,2053311000.0
1,3,2,Avengers: Endgame,2019,848768907.0,1923981000.0,2772750000.0
2,4,3,Avatar,2009,760507625.0,2015838000.0,2776345000.0
3,5,4,Black Panther,2018,700059566.0,648198700.0,1348258000.0
4,6,5,Avengers: Infinity War,2018,678815482.0,1369319000.0,2048134000.0


In [48]:
df = pd.merge(nums, bom, on=['title', 'year_released'])

In [49]:
len(df)

4994

In [61]:
df.head()

Unnamed: 0.1,release,title,budget,domestic,worldwide,year_released,Unnamed: 0,rank,american_box_office,international_box_office,total_box_office
0,2009-12-18,Avatar,425000000,760507625,2776345279,2009,4,3,760507625.0,2015838000.0,2776345000.0
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011,371,136,241063875.0,804600000.0,1045664000.0
2,2019-04-26,Avengers: Endgame,400000000,848768907,2772749754,2019,3,2,848768907.0,1923981000.0,2772750000.0
3,2019-06-07,Dark Phoenix,350000000,64782713,245510828,2019,6912,1268,64782713.0,180728100.0,245510800.0
4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015,18,17,459005868.0,944008100.0,1403014000.0


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4994 entries, 0 to 4993
Data columns (total 11 columns):
release                     4994 non-null datetime64[ns]
title                       4994 non-null object
budget                      4994 non-null int64
domestic                    4994 non-null int64
worldwide                   4994 non-null int64
year_released               4994 non-null int64
Unnamed: 0                  4994 non-null int64
rank                        4994 non-null int64
american_box_office         4994 non-null float64
international_box_office    3837 non-null float64
total_box_office            4994 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(6), object(1)
memory usage: 468.2+ KB


In [62]:
df[(df['international_box_office'].isnull()) & (df['worldwide'].notnull()) & (df['worldwide'] != df['domestic'])]

Unnamed: 0.1,release,title,budget,domestic,worldwide,year_released,Unnamed: 0,rank,american_box_office,international_box_office,total_box_office
3236,2009-04-23,Home,500000,15433,44793168,2009,25142,14224,15922.0,,15922.0


In [65]:
df[(df['international_box_office'].isnull())]

Unnamed: 0.1,release,title,budget,domestic,worldwide,year_released,Unnamed: 0,rank,american_box_office,international_box_office,total_box_office
482,2000-11-22,102 Dalmatians,85000000,66941559,66941559,2000,1412,1213,66941559.0,,66941559.0
492,1998-11-13,Meet Joe Black,85000000,44650003,44650003,1998,1919,1918,44650003.0,,44650003.0
559,1998-12-25,Mighty Joe Young,80000000,50632037,50632037,1998,9516,1694,50632037.0,,50632037.0
614,1999-07-23,Inspector Gadget,75000000,97387965,97387965,1999,647,763,97387965.0,,97387965.0
630,1999-12-17,Anna and the King,75000000,39251128,39251128,1999,6223,2161,39251128.0,,39251128.0
638,1998-10-23,Soldier,75000000,14623082,14623082,1998,4442,4243,14623082.0,,14623082.0
640,2001-02-23,Monkeybone,75000000,5409517,5409517,2001,3760,6036,5409517.0,,5409517.0
688,1989-08-09,The Abyss,70000000,54243125,54243125,1989,5315,1552,54243125.0,,54243125.0
704,1998-01-16,Hard Rain,70000000,19870567,19870567,1998,9535,3594,19870567.0,,19870567.0
709,2002-09-20,Ballistic: Ecks vs. Sever,70000000,14294842,14294842,2002,10042,4299,14294842.0,,14294842.0
