# Ben's Cleaning 

## movies_gross

In [1]:
import pandas as pd

In [2]:
# Assigning the movie gross table to variable
df_movie_gross = pd.read_csv('dataframe_id_bom_movie_gross_gz.csv')

In [3]:
df_movie_gross.head() #checking the layout and stats


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [4]:
# dropping 'studio' column due to irrelevance
df_movie_gross.drop(axis=1, columns='studio', inplace=True)
df_movie_gross

Unnamed: 0,title,domestic_gross,foreign_gross,year
0,Toy Story 3,415000000.0,652000000,2010
1,Alice in Wonderland (2010),334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300000,2010
3,Inception,292600000.0,535700000,2010
4,Shrek Forever After,238700000.0,513900000,2010
...,...,...,...,...
3382,The Quake,6200.0,,2018
3383,Edward II (2018 re-release),4800.0,,2018
3384,El Pacto,2500.0,,2018
3385,The Swan,2400.0,,2018


In [5]:
# planning on dropping foreign gross to prevent future NaN's after join
# will reinsert after join

In [6]:
df_movie_gross.isna().any()

title             False
domestic_gross     True
foreign_gross      True
year              False
dtype: bool

In [7]:
#finding NaNs
print(df_movie_gross.isna().sum())

title                0
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64


In [8]:
# checking for why NaN's present (did these movies not get foreign/domestic 
#releases?)
df = df_movie_gross
is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

print(rows_with_NaN)

                                       title  domestic_gross foreign_gross  \
222                                  Flipped       1800000.0           NaN   
230               It's a Wonderful Afterlife             NaN       1300000   
254   The Polar Express (IMAX re-issue 2010)        673000.0           NaN   
267                           Tiny Furniture        392000.0           NaN   
269            Grease (Sing-a-Long re-issue)        366000.0           NaN   
...                                      ...             ...           ...   
3382                               The Quake          6200.0           NaN   
3383             Edward II (2018 re-release)          4800.0           NaN   
3384                                El Pacto          2500.0           NaN   
3385                                The Swan          2400.0           NaN   
3386                       An Actor Prepares          1700.0           NaN   

      year  
222   2010  
230   2010  
254   2010  
267   2010 

In [9]:
# Assuming that NaN's represent lack of releases in those markets, replacing
# NaN's with 0's
df_movie_gross['domestic_gross'].fillna(0, inplace=True)
# Checking results
print(df_movie_gross.isna().sum())

title                0
domestic_gross       0
foreign_gross     1350
year                 0
dtype: int64


In [10]:
# dropping foreign_gross columns to prevent future NaN's after join.  
# will return foreign_gross after join 
df_movie_gross.drop(axis=1, columns='foreign_gross', inplace=True)
# checking changes
df_movie_gross.head()


Unnamed: 0,title,domestic_gross,year
0,Toy Story 3,415000000.0,2010
1,Alice in Wonderland (2010),334200000.0,2010
2,Harry Potter and the Deathly Hallows Part 1,296000000.0,2010
3,Inception,292600000.0,2010
4,Shrek Forever After,238700000.0,2010


In [11]:
# Convert domestic gross to int
df_movie_gross['domestic_gross'] = df_movie_gross.domestic_gross.astype(int)
df_movie_gross.head()

Unnamed: 0,title,domestic_gross,year
0,Toy Story 3,415000000,2010
1,Alice in Wonderland (2010),334200000,2010
2,Harry Potter and the Deathly Hallows Part 1,296000000,2010
3,Inception,292600000,2010
4,Shrek Forever After,238700000,2010


In [12]:
# checking for duplicate titles
df_movie_gross['title'].value_counts()


Bluebeard                                           2
Skin Trade                                          1
The American                                        1
Confidential Assignment                             1
Pirates of the Caribbean: Dead Men Tell No Tales    1
                                                   ..
The Decoy Bride                                     1
Terkel in Trouble                                   1
The Daughter                                        1
The Accountant                                      1
No One Killed Jessica                               1
Name: title, Length: 3386, dtype: int64

In [13]:
df_movie_gross.loc[df_movie_gross['title'] == 'Bluebeard']

Unnamed: 0,title,domestic_gross,year
317,Bluebeard,33500,2010
3045,Bluebeard,43100,2017


In [14]:
# Movies came out in different years, leaving them alone

In [15]:
# Checking for other missing data placeholders '?', 'Unknown', 'Unnamed'

for col in df_movie_gross:
    print('?: {}'.format(df_movie_gross.loc[
                                    df_movie_gross[col] == "?"].sum()))
    print('=========================')
    print('Unknown: {}'.format(df_movie_gross.loc[
                                    df_movie_gross[col] == "?"].sum()))
    print('=========================')
    print('Unnamed: {}'.format(df_movie_gross.loc[
                                    df_movie_gross[col] == "?"].sum()))
    print('=========================')
    print('New Column')

?: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
Unknown: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
Unnamed: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
New Column
?: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
Unknown: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
Unnamed: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
New Column
?: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
Unknown: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
Unnamed: title             0.0
domestic_gross    0.0
year              0.0
dtype: float64
New Column


  result = method(y)


In [16]:
df_movie_gross.head()

Unnamed: 0,title,domestic_gross,year
0,Toy Story 3,415000000,2010
1,Alice in Wonderland (2010),334200000,2010
2,Harry Potter and the Deathly Hallows Part 1,296000000,2010
3,Inception,292600000,2010
4,Shrek Forever After,238700000,2010


In [17]:
df_movie_gross.to_csv('bom_movie_gross_clean')

## Cleaning Budget

In [18]:
#importing budget
df_budget = pd.read_csv('dataframe_id_tn_movie_budgets_gz.csv')
#checking head, info, and shape
df_budget.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [19]:
df_budget.nunique()

id                    100
release_date         2418
movie                5698
production_budget     509
domestic_gross       5164
worldwide_gross      5356
dtype: int64

In [20]:
df_dup = df_budget[df_budget['movie'].duplicated()]
df_dup.groupby('release_date')
df_dup

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
273,74,"May 19, 1998",Godzilla,"$125,000,000","$136,314,294","$376,000,000"
408,9,"Nov 21, 2018",Robin Hood,"$99,000,000","$30,824,628","$84,747,441"
484,85,"Jul 8, 2005",Fantastic Four,"$87,500,000","$154,696,080","$333,132,750"
543,44,"May 7, 1999",The Mummy,"$80,000,000","$155,385,488","$416,385,488"
707,8,"Jun 13, 1997",Hercules,"$70,000,000","$99,112,101","$250,700,000"
...,...,...,...,...,...,...
5668,69,"Nov 16, 1942",Cat People,"$134,000","$4,000,000","$8,000,000"
5676,77,"Oct 1, 1968",Night of the Living Dead,"$114,000","$12,087,064","$30,087,064"
5677,78,"Feb 8, 1915",The Birth of a Nation,"$110,000","$10,000,000","$11,000,000"
5699,100,"Aug 30, 1972",The Last House on the Left,"$87,000","$3,100,000","$3,100,000"


In [21]:
# Changing amount values to integers 
def convert_amt_to_int(df, col):
    df[col] = df[col].str.replace("$", "").str.replace(",", "").astype('int64')
    return df

In [22]:
# changing the data type 
money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']

for col in money_cols:
    df_budget = convert_amt_to_int(df_budget, col)

In [23]:
# check to see if it worked 
df_budget.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [24]:
# dropping irrelevant columns
df_budget.drop(axis=1, columns=['id'], inplace=True)
df_budget.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null int64
domestic_gross       5782 non-null int64
worldwide_gross      5782 non-null int64
dtypes: int64(3), object(2)
memory usage: 226.0+ KB


In [25]:
#changing date to datetime format
df_budget['release_date'] = df_budget['release_date'].str.replace(",", "")


In [26]:
#checking
df_budget['release_date']

0       Dec 18 2009
1       May 20 2011
2        Jun 7 2019
3        May 1 2015
4       Dec 15 2017
           ...     
5777    Dec 31 2018
5778     Apr 2 1999
5779    Jul 13 2005
5780    Sep 29 2015
5781     Aug 5 2005
Name: release_date, Length: 5782, dtype: object

In [27]:
df_budget.head() 

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,Dec 18 2009,Avatar,425000000,760507625,2776345279
1,May 20 2011,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,Jun 7 2019,Dark Phoenix,350000000,42762350,149762350
3,May 1 2015,Avengers: Age of Ultron,330600000,459005868,1403013963
4,Dec 15 2017,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [28]:
import string
df_budget['release_date'] = df_budget['release_date'].str.rstrip(string.digits)
df_budget.head() 

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,Dec 18,Avatar,425000000,760507625,2776345279
1,May 20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,Jun 7,Dark Phoenix,350000000,42762350,149762350
3,May 1,Avengers: Age of Ultron,330600000,459005868,1403013963
4,Dec 15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [29]:
#checking for missing values
df_budget.isna().sum()

release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [30]:
print(df_budget.nunique())
df_budget.shape

release_date          365
movie                5698
production_budget     509
domestic_gross       5164
worldwide_gross      5356
dtype: int64


(5782, 5)

In [31]:
df_budget.head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,Dec 18,Avatar,425000000,760507625,2776345279
1,May 20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,Jun 7,Dark Phoenix,350000000,42762350,149762350
3,May 1,Avengers: Age of Ultron,330600000,459005868,1403013963
4,Dec 15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [32]:
df_budget.to_csv('tn_movie_budgets_clean.csv')

## Joining Budget and Gross tables

In [33]:
# looking at column names
df_movie_gross.head()

Unnamed: 0,title,domestic_gross,year
0,Toy Story 3,415000000,2010
1,Alice in Wonderland (2010),334200000,2010
2,Harry Potter and the Deathly Hallows Part 1,296000000,2010
3,Inception,292600000,2010
4,Shrek Forever After,238700000,2010


In [34]:
df_budget.head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,Dec 18,Avatar,425000000,760507625,2776345279
1,May 20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,Jun 7,Dark Phoenix,350000000,42762350,149762350
3,May 1,Avengers: Age of Ultron,330600000,459005868,1403013963
4,Dec 15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [35]:
# Need to change df_budget['movie'] to ['title']
df_budget.rename(columns={'movie' : 'title'}, inplace=True)
df_budget.columns

Index(['release_date', 'title', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [36]:
# Want to join tables based on title so that we add to the sample and include
# worldwide_gross
df_profits = df_budget.set_index('title').join(df_movie_gross.set_index(
                'title'), how='outer', rsuffix='_gr')

In [37]:
df_profits.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,domestic_gross_gr,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,,
'71,,,,,1300000.0,2015.0
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,,
"1,000 Times Good Night",,,,,53900.0,2014.0
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,72100000.0,2016.0


### Cleaning Joined Table

In [38]:
# dropping

In [39]:
df_profits.isna().sum()

release_date         2149
production_budget    2149
domestic_gross       2149
worldwide_gross      2149
domestic_gross_gr    4535
year                 4535
dtype: int64

In [40]:
import numpy as np

In [41]:
# fixing gross value NaN's

def fix_nan(row):
    """
    Takes a row in a with 2 columns which contain comparable values.
    If one column is missing a value, the function will replace it with
    the value of the other column.
    Needs to be used in conjunction with: 
    
    df.apply(lamba x : fix_nan(x), axis=1)
    """
    if np.isnan(row['domestic_gross']):
        return row['domestic_gross_gr']
    else:
        return row['domestic_gross']

df_profits['domestic_gross']=df_profits.apply(lambda row : fix_nan(row),axis=1)
df_profits.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,domestic_gross_gr,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,,
'71,,,1300000.0,,1300000.0,2015.0
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,,
"1,000 Times Good Night",,,53900.0,,53900.0,2014.0
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,72100000.0,2016.0


In [42]:
df_profits.isna().sum()

release_date         2149
production_budget    2149
domestic_gross          0
worldwide_gross      2149
domestic_gross_gr    4535
year                 4535
dtype: int64

In [43]:
df_profits['domestic_gross_gr']=df_profits.apply(lambda row : fix_nan(row),axis=1)

In [44]:
df_profits.isna().sum()

release_date         2149
production_budget    2149
domestic_gross          0
worldwide_gross      2149
domestic_gross_gr       0
year                 4535
dtype: int64

In [45]:
# removing NaN's from release date
df_profits.dropna(axis=0, subset=['release_date'], inplace=False)

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,domestic_gross_gr,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,0.0,
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,32425665.0,
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,72082999.0,2016.0
10 Days in a Madhouse,Nov 11,12000000.0,14616.0,14616.0,14616.0,
10 Things I Hate About You,Mar 31,13000000.0,38177966.0,60413950.0,38177966.0,
...,...,...,...,...,...,...
mother!,Sep 15,30000000.0,17800004.0,42531076.0,17800004.0,2017.0
xXx,Aug 9,70000000.0,141930000.0,267200000.0,141930000.0,
xXx: Return of Xander Cage,Jan 20,85000000.0,44898413.0,345033359.0,44898413.0,
Ã l\'intÃ©rieur,Apr 15,3000000.0,0.0,895932.0,0.0,


In [46]:
#resetting index
df_profits.reset_index()

Unnamed: 0,title,release_date,production_budget,domestic_gross,worldwide_gross,domestic_gross_gr,year
0,#Horror,Nov 20,1500000.0,0.0,0.0,0.0,
1,'71,,,1300000.0,,1300000.0,2015.0
2,(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,32425665.0,
3,"1,000 Times Good Night",,,53900.0,,53900.0,2014.0
4,10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,72082999.0,2016.0
...,...,...,...,...,...,...,...
7926,xXx,Aug 9,70000000.0,141930000.0,267200000.0,141930000.0,
7927,xXx: Return of Xander Cage,Jan 20,85000000.0,44898413.0,345033359.0,44898413.0,
7928,xXx: The Return of Xander Cage,,,44900000.0,,44900000.0,2017.0
7929,Ã l\'intÃ©rieur,Apr 15,3000000.0,0.0,895932.0,0.0,


In [47]:
# need to address discrepancy between values in domestic gross and 
# domestic_gross_gr
df_profits['domestic_gross'] = (df_profits['domestic_gross'] + 
                                df_profits['domestic_gross_gr'])/2
df_profits.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,domestic_gross_gr,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,0.0,
'71,,,1300000.0,,1300000.0,2015.0
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,32425665.0,
"1,000 Times Good Night",,,53900.0,,53900.0,2014.0
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,72082999.0,2016.0


In [48]:
# now that we've addressed the discrepancy, dropping domestic_gross_gr
df_profits.drop(axis=1, columns=['domestic_gross_gr'],
                             inplace=True)
df_profits

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,
'71,,,1300000.0,,2015.0
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,
"1,000 Times Good Night",,,53900.0,,2014.0
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,2016.0
...,...,...,...,...,...
xXx,Aug 9,70000000.0,141930000.0,267200000.0,
xXx: Return of Xander Cage,Jan 20,85000000.0,44898413.0,345033359.0,
xXx: The Return of Xander Cage,,,44900000.0,,2017.0
Ã l\'intÃ©rieur,Apr 15,3000000.0,0.0,895932.0,


In [49]:
# dropping NaN values from worldwide 
df_profits.dropna(axis=0, subset=['worldwide_gross'], inplace=True)

In [50]:
df_profits.drop(axis=1, columns='year', inplace=True)

In [51]:
df_profits.isna().sum()

release_date         0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [52]:
# create a profit column
df_profits['net_prof'] = round(df_profits['worldwide_gross'] - 
                         df_profits['production_budget'], 5)
df_profits.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,net_prof
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,-1500000.0
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,26939060.0
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,103286422.0
10 Days in a Madhouse,Nov 11,12000000.0,14616.0,14616.0,-11985384.0
10 Things I Hate About You,Mar 31,13000000.0,38177966.0,60413950.0,47413950.0


In [53]:
# finding the profit margin (%
df_profits['net_margin_%'] = round((df_profits['net_prof']/
                                    df_profits['worldwide_gross'])*100, 2)
df_profits.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,net_prof,net_margin_%
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#Horror,Nov 20,1500000.0,0.0,0.0,-1500000.0,-inf
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,26939060.0,78.22
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,103286422.0,95.38
10 Days in a Madhouse,Nov 11,12000000.0,14616.0,14616.0,-11985384.0,-82001.81
10 Things I Hate About You,Mar 31,13000000.0,38177966.0,60413950.0,47413950.0,78.48


In [54]:
df_profits.reset_index()

Unnamed: 0,title,release_date,production_budget,domestic_gross,worldwide_gross,net_prof,net_margin_%
0,#Horror,Nov 20,1500000.0,0.0,0.0,-1500000.0,-inf
1,(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,26939060.0,78.22
2,10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,103286422.0,95.38
3,10 Days in a Madhouse,Nov 11,12000000.0,14616.0,14616.0,-11985384.0,-82001.81
4,10 Things I Hate About You,Mar 31,13000000.0,38177966.0,60413950.0,47413950.0,78.48
...,...,...,...,...,...,...,...
5777,mother!,Sep 15,30000000.0,17800004.0,42531076.0,12531076.0,29.46
5778,xXx,Aug 9,70000000.0,141930000.0,267200000.0,197200000.0,73.80
5779,xXx: Return of Xander Cage,Jan 20,85000000.0,44898413.0,345033359.0,260033359.0,75.36
5780,Ã l\'intÃ©rieur,Apr 15,3000000.0,0.0,895932.0,-2104068.0,-234.85


In [55]:
df_profits['net_margin_%'].value_counts()

-inf         367
 85.35         5
 80.00         5
 76.66         5
 89.05         5
            ... 
 32.21         1
-227.70        1
-1000.55       1
-28108.74      1
-27.50         1
Name: net_margin_%, Length: 4672, dtype: int64

In [56]:
# finding out the type of '-inf']
df_profits['net_margin_%'].apply(type)

title
#Horror                       <class 'float'>
(500) Days of Summer          <class 'float'>
10 Cloverfield Lane           <class 'float'>
10 Days in a Madhouse         <class 'float'>
10 Things I Hate About You    <class 'float'>
                                   ...       
mother!                       <class 'float'>
xXx                           <class 'float'>
xXx: Return of Xander Cage    <class 'float'>
Ã l\'intÃ©rieur               <class 'float'>
é·æ±ä¸è (CJ7)            <class 'float'>
Name: net_margin_%, Length: 5782, dtype: object

In [57]:
# deleting negative values
df_profits = df_profits[df_profits['net_margin_%'] > 0]
df_profits.head()

Unnamed: 0_level_0,release_date,production_budget,domestic_gross,worldwide_gross,net_prof,net_margin_%
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,26939060.0,78.22
10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,103286422.0,95.38
10 Things I Hate About You,Mar 31,13000000.0,38177966.0,60413950.0,47413950.0,78.48
"10,000 B.C.",Mar 7,105000000.0,94784201.0,269065678.0,164065678.0,60.98
12 Monkeys,Dec 27,29000000.0,57141459.0,168841459.0,139841459.0,82.82


In [58]:
df_profits.to_csv('joined_gross_budget_cleaned.csv')

In [59]:
df_profits = df_profits.reset_index()
df_profits

Unnamed: 0,title,release_date,production_budget,domestic_gross,worldwide_gross,net_prof,net_margin_%
0,(500) Days of Summer,Jul 17,7500000.0,32425665.0,34439060.0,26939060.0,78.22
1,10 Cloverfield Lane,Mar 11,5000000.0,72082999.0,108286422.0,103286422.0,95.38
2,10 Things I Hate About You,Mar 31,13000000.0,38177966.0,60413950.0,47413950.0,78.48
3,"10,000 B.C.",Mar 7,105000000.0,94784201.0,269065678.0,164065678.0,60.98
4,12 Monkeys,Dec 27,29000000.0,57141459.0,168841459.0,139841459.0,82.82
...,...,...,...,...,...,...,...
3652,crazy/beautiful,Jun 29,13000000.0,16929123.0,19929123.0,6929123.0,34.77
3653,mother!,Sep 15,30000000.0,17800004.0,42531076.0,12531076.0,29.46
3654,xXx,Aug 9,70000000.0,141930000.0,267200000.0,197200000.0,73.80
3655,xXx: Return of Xander Cage,Jan 20,85000000.0,44898413.0,345033359.0,260033359.0,75.36


In [139]:
df_profits['title'].value_counts()

Halloween                      3
King Kong                      3
Aladdin                        2
Planet of the Apes             2
The Texas Chainsaw Massacre    2
                              ..
Elizabeth: The Golden Age      1
Hairspray                      1
Untraceable                    1
Following                      1
Family Plot                    1
Name: title, Length: 3611, dtype: int64

In [140]:
# Checking to see if repeats are separate releases
df_profits.loc[df_profits['title'] == 'King Kong']

Unnamed: 0,title,release_date,production_budget,domestic_gross,worldwide_gross,net_prof,net_margin_%
1493,King Kong,Dec 14,207000000.0,218080025.0,550517357.0,343517357.0,62.4
1494,King Kong,Dec 17,23000000.0,52614445.0,90614445.0,67614445.0,74.62
1495,King Kong,Apr 7,672000.0,10000000.0,10000650.0,9328650.0,93.28


In [141]:
df_profits.loc[df_profits['title'] == 'The Texas Chainsaw Massacre']

Unnamed: 0,title,release_date,production_budget,domestic_gross,worldwide_gross,net_prof,net_margin_%
3246,The Texas Chainsaw Massacre,Oct 17,9000000.0,80571655.0,107967319.0,98967319.0,91.66
3247,The Texas Chainsaw Massacre,Oct 18,140000.0,26572439.0,26572439.0,26432439.0,99.47


In [143]:
df_profits.duplicated().sum()

0

## Names Cleaning

In [60]:
# Correlates with title.crew
df_names = pd.read_csv('dataframe_id_imdb_name_basics_gz.csv')

df_names.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [61]:
# use split funtion
df_names

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"
...,...,...,...,...,...,...
606643,nm9990381,Susan Grobes,,,actress,
606644,nm9990690,Joo Yeon So,,,actress,"tt9090932,tt8737130"
606645,nm9991320,Madeline Smith,,,actress,"tt8734436,tt9615610"
606646,nm9991786,Michelle Modigliani,,,producer,


In [62]:
# drop irrelevent columns
df_names.drop(axis=1, columns= ['birth_year', 'death_year'], 
                         inplace=True)


In [63]:
df_names.head()

Unnamed: 0,nconst,primary_name,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [64]:
#checking for NaN's, duplicates, and placeholders
df_names.isna().sum()

nconst                    0
primary_name              0
primary_profession    51340
known_for_titles      30204
dtype: int64

In [65]:
# dropping NaN's since sample size is so huge
df_names.dropna(axis=0, how='any', inplace=True)


In [66]:
# Checking
df_names.isna().sum()

nconst                0
primary_name          0
primary_profession    0
known_for_titles      0
dtype: int64

In [67]:
# checking for duplicates
print(df_names.duplicated().sum())
df_names.shape

0


(535137, 4)

In [68]:
df_names.nunique()

nconst                535137
primary_name          511885
primary_profession      8623
known_for_titles      466005
dtype: int64

In [69]:
# expect to see fewer unique values in profession and titles
#checking to see repeated values in primary_name
df_dup = df_names[df_names['nconst'].duplicated()]

In [70]:
df_names.primary_name.value_counts()

Michael Johnson     14
James Brown         14
David Brown         14
Michael Brown       13
Jeff Johnson        11
                    ..
Kirstin McKenzie     1
Christian Toon       1
Bless May            1
Spiros Grammenos     1
Forrest Griffin      1
Name: primary_name, Length: 511885, dtype: int64

In [71]:
# Though names are common, the fact that 3 people have 14 iterations of their
# names raises red flags. Dropping duplicate names
df_names.drop_duplicates(subset='primary_name', inplace=True)

In [72]:
df_names.shape

(511885, 4)

In [73]:
df_names.head()

Unnamed: 0,nconst,primary_name,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [74]:
df_names.to_csv('Names.csv')

# Ayesha's Data Cleaning for title_akas


In [75]:
df_title_akas = pd.read_csv('dataframe_id_imdb_title_akas_gz.csv')
df = df_title_akas

In [76]:
df.head()


Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0


In [77]:
# Grouping data in catagories (title id joins with df_names )
df.groupby('title')
df

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0
...,...,...,...,...,...,...,...,...
331698,tt9827784,2,Sayonara kuchibiru,,,original,,1.0
331699,tt9827784,3,Farewell Song,XWW,en,imdbDisplay,,0.0
331700,tt9880178,1,La atención,,,original,,1.0
331701,tt9880178,2,La atención,ES,,,,0.0


In [78]:
df.groupby('title')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001623263C828>

In [79]:
df.groupby('language')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001623263CC18>

In [80]:
# dropping attribute column
df = df.drop('attributes', axis = 1)
df.isna().sum()

title_id                  0
ordering                  0
title                     0
region                53293
language             289988
types                163256
is_original_title        25
dtype: int64

# Ayesha's Data Cleaning for movies

In [81]:
df_movies = pd.read_csv('dataframe_id_tmdb_movies_gz.csv')
df = df_movies

In [82]:
print(df_movies.head())
print(df_movies.info())
print(df_movies.shape)

   Unnamed: 0            genre_ids     id original_language  \
0           0      [12, 14, 10751]  12444                en   
1           1  [14, 12, 16, 10751]  10191                en   
2           2        [12, 28, 878]  10138                en   
3           3      [16, 35, 10751]    862                en   
4           4        [28, 878, 12]  27205                en   

                                 original_title  popularity release_date  \
0  Harry Potter and the Deathly Hallows: Part 1      33.533   2010-11-19   
1                      How to Train Your Dragon      28.734   2010-03-26   
2                                    Iron Man 2      28.515   2010-05-07   
3                                     Toy Story      28.005   1995-11-22   
4                                     Inception      27.920   2010-07-16   

                                          title  vote_average  vote_count  
0  Harry Potter and the Deathly Hallows: Part 1           7.7       10788  
1           

In [83]:
# not letting me groupby: genre_ids, original_title, popularity, release_date, 
# (genre id's used with df_title basics['genres'])
df.groupby('title').median() 

Unnamed: 0_level_0,Unnamed: 0,id,popularity,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"""BLESSED""",26355.0,564096.0,0.600,7.0,1.0
"""Legitimate Rape"" Pharmaceutical Ad",7815.0,283543.0,0.600,2.3,2.0
"""cherry"" - Supreme",12812.0,263765.0,0.893,10.0,1.0
#1 Cheerleader Camp,577.0,41371.0,3.277,3.6,34.0
#ALLMYMOVIES,16037.0,368247.0,0.840,9.0,4.0
...,...,...,...,...,...
纽约客@上海,5961.0,126186.0,2.416,6.0,12.0
마음의 소리 스페셜 1-효! 크러쉬,26292.0,602570.0,0.600,8.0,1.0
번개맨의 비밀,26267.0,591378.0,0.600,9.0,1.0
유병재: B의 농담,26106.0,542691.0,0.708,6.5,2.0



# Ayesha's Data Cleaning title_basics

In [84]:
df_title_basics = pd.read_csv('dataframe_id_imdb_title_basics_gz.csv')

In [85]:
print(df_title_basics.head())
print(df_title_basics.info())
print(df_title_basics.shape)

      tconst                    primary_title              original_title  \
0  tt0063540                        Sunghursh                   Sunghursh   
1  tt0066787  One Day Before the Rainy Season             Ashad Ka Ek Din   
2  tt0069049       The Other Side of the Wind  The Other Side of the Wind   
3  tt0069204                  Sabse Bada Sukh             Sabse Bada Sukh   
4  tt0100275         The Wandering Soap Opera       La Telenovela Errante   

   start_year  runtime_minutes                genres  
0        2013            175.0    Action,Crime,Drama  
1        2019            114.0       Biography,Drama  
2        2018            122.0                 Drama  
3        2018              NaN          Comedy,Drama  
4        2017             80.0  Comedy,Drama,Fantasy  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_titl

In [86]:
# tconts correlates with title_basics, title_crew, title_principals

df = df_title_basics


In [87]:
df.groupby('genres')
df_movies.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [88]:
#genres



# Alex's Cleaning

## Title Basics cleaning

In [89]:
import pandas as pd

In [90]:
pd.read_csv('dataframe_id_bom_movie_gross_gz.csv')

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [91]:
#importing dataframe (AT dataset 1)
title_basics = pd.read_csv("dataframe_id_imdb_title_basics_gz.csv")
title_basics.head()
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             140736 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [92]:
#checking for missing values
title_basics.isna().sum()


tconst                 0
primary_title          0
original_title        21
start_year             0
runtime_minutes    31739
genres              5408
dtype: int64

In [93]:
#replacing missing runtime_minutes values with mean of existing runtime minutes
title_basics["runtime_minutes"] = title_basics['runtime_minutes'].fillna(title_basics['runtime_minutes'].mean())

In [94]:
title_basics.isna().sum()

tconst                0
primary_title         0
original_title       21
start_year            0
runtime_minutes       0
genres             5408
dtype: int64

In [95]:

#checking to see if any start year is suspicious
title_basics['start_year'].unique()

array([2013, 2019, 2018, 2017, 2012, 2010, 2011, 2015, 2021, 2016, 2014,
       2020, 2022, 2023, 2024, 2026, 2025, 2115, 2027], dtype=int64)

In [96]:
title_basics["original_title"].unique()

array(['Sunghursh', 'Ashad Ka Ek Din', 'The Other Side of the Wind', ...,
       'Dankyavar Danka', '6 Gunn', 'Chico Albuquerque - Revelações'],
      dtype=object)

In [97]:
title_basics["primary_title"].unique()

array(['Sunghursh', 'One Day Before the Rainy Season',
       'The Other Side of the Wind', ..., 'Dankyavar Danka', '6 Gunn',
       'Chico Albuquerque - Revelações'], dtype=object)

In [98]:
title_basics["tconst"].unique()

array(['tt0063540', 'tt0066787', 'tt0069049', ..., 'tt9916706',
       'tt9916730', 'tt9916754'], dtype=object)

In [99]:
#checking for duplicates
duplicates = title_basics[title_basics.duplicated(subset='tconst')]
print(len(duplicates))
duplicates.tail()

0


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres


In [100]:
title_basics.isna().sum()

tconst                0
primary_title         0
original_title       21
start_year            0
runtime_minutes       0
genres             5408
dtype: int64

In [101]:
#replacing missing values for genres with string "missing"
title_basics['genres'] = title_basics['genres'].fillna(value="missing")

In [102]:
title_basics.isna().sum()

tconst              0
primary_title       0
original_title     21
start_year          0
runtime_minutes     0
genres              0
dtype: int64

In [103]:

title_basics.start_year.value_counts(normalize=True)

2017    0.119772
2016    0.118185
2018    0.115290
2015    0.111144
2014    0.106669
2013    0.100647
2012    0.094338
2011    0.088269
2010    0.081078
2019    0.057334
2020    0.006411
2021    0.000568
2022    0.000219
2023    0.000034
2024    0.000014
2027    0.000007
2026    0.000007
2025    0.000007
2115    0.000007
Name: start_year, dtype: float64

In [104]:

#removing all rows associated with starting year greater than 2020
title_basics = title_basics[title_basics.start_year < 2020]
title_basics["start_year"].max()
title_basics.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,86.187247,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [105]:
#importing dataframe (AT Dataset 2)
title_crew = pd.read_csv("dataframe_id_imdb_title_crew_gz.csv")
title_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [106]:
title_crew.isna().sum()

tconst           0
directors     5727
writers      35883
dtype: int64

In [107]:
duplicates2 = title_crew[title_crew.duplicated(subset='tconst')]
print(len(duplicates2))
duplicates2.tail()

0


Unnamed: 0,tconst,directors,writers


In [108]:
title_crew['tconst'].unique()

array(['tt0285252', 'tt0438973', 'tt0462036', ..., 'tt9001494',
       'tt9004986', 'tt9010172'], dtype=object)

In [109]:
title_crew['directors'].unique()

array(['nm0899854', nan, 'nm1940585', ..., 'nm10122357', 'nm6711477',
       'nm10123242,nm10123248'], dtype=object)

In [110]:
title_crew['writers'].unique()

array(['nm0899854', 'nm0175726,nm1802864', 'nm1940585', ..., 'nm6711477',
       'nm4993825', 'nm8352242'], dtype=object)

In [111]:
title_crew.directors.value_counts(normalize=True)

nm3266654              0.000442
nm5592581              0.000342
nm2682776              0.000342
nm3583561              0.000328
nm0183659              0.000313
                         ...   
nm2959179              0.000007
nm3690857,nm3530062    0.000007
nm3790611              0.000007
nm0656333,nm2948475    0.000007
nm9321682              0.000007
Name: directors, Length: 98525, dtype: float64

In [112]:
title_crew.writers.value_counts(normalize=True)

nm0000636              0.000726
nm2682776              0.000435
nm3266654              0.000417
nm3583561              0.000363
nm0772905              0.000308
                         ...   
nm0098545              0.000009
nm0700554              0.000009
nm7211435,nm7211466    0.000009
nm2063122,nm3547655    0.000009
nm7943840              0.000009
Name: writers, Length: 91920, dtype: float64

In [113]:
title_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
tconst       146144 non-null object
directors    140417 non-null object
writers      110261 non-null object
dtypes: object(3)
memory usage: 3.3+ MB


In [114]:
title_crew.describe

<bound method NDFrame.describe of            tconst                      directors              writers
0       tt0285252                      nm0899854            nm0899854
1       tt0438973                            NaN  nm0175726,nm1802864
2       tt0462036                      nm1940585            nm1940585
3       tt0835418                      nm0151540  nm0310087,nm0841532
4       tt0878654  nm0089502,nm2291498,nm2292011            nm0284943
...           ...                            ...                  ...
146139  tt8999974                     nm10122357           nm10122357
146140  tt9001390                      nm6711477            nm6711477
146141  tt9001494          nm10123242,nm10123248                  NaN
146142  tt9004986                      nm4993825            nm4993825
146143  tt9010172                            NaN            nm8352242

[146144 rows x 3 columns]>

In [115]:
#replacing missing values for directors with string "missing"
title_crew['directors'] = title_crew['directors'].fillna(value="missing")

In [116]:

#replacing missing values for writers with string "missing"
title_crew['writers'] = title_crew['writers'].fillna(value="missing")

In [117]:
title_crew.isna().sum()

tconst       0
directors    0
writers      0
dtype: int64

In [118]:
#checking to see if any directors appear disproportionally indicating a placeholder value
title_crew.directors.value_counts(normalize=True)


missing                0.039187
nm3266654              0.000424
nm5592581              0.000328
nm2682776              0.000328
nm3583561              0.000315
                         ...   
nm10157983             0.000007
nm2959179              0.000007
nm3690857,nm3530062    0.000007
nm3790611              0.000007
nm9321682              0.000007
Name: directors, Length: 98526, dtype: float64

In [119]:
#checking to see if any writers appear disproportionally indicating a placeholder value
title_crew.writers.value_counts(normalize=True)

missing                                    0.245532
nm0000636                                  0.000547
nm2682776                                  0.000328
nm3266654                                  0.000315
nm3583561                                  0.000274
                                             ...   
nm7962463,nm7962464,nm7292812,nm6948617    0.000007
nm0098545                                  0.000007
nm0700554                                  0.000007
nm7211435,nm7211466                        0.000007
nm7943840                                  0.000007
Name: writers, Length: 91921, dtype: float64

In [120]:
#importing a dataframe (AT Dataframe 3)
title_principals = pd.read_csv("dataframe_id_imdb_title_principals_gz.csv")
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [121]:

title_principals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


In [122]:
#checking for missing values
title_principals.isna().sum()

tconst             0
ordering           0
nconst             0
category           0
job           850502
characters    634826
dtype: int64

In [123]:
title_principals.tconst.value_counts(normalize=True)

tt2667690    9.725867e-06
tt5533938    9.725867e-06
tt1522841    9.725867e-06
tt8316026    9.725867e-06
tt1448497    9.725867e-06
                 ...     
tt4931756    9.725867e-07
tt5659798    9.725867e-07
tt3091124    9.725867e-07
tt7029160    9.725867e-07
tt2768304    9.725867e-07
Name: tconst, Length: 143454, dtype: float64

In [124]:
title_principals.nconst.value_counts(normalize=True)

nm1930572    3.676378e-04
nm0000636    1.556139e-04
nm0000616    1.439428e-04
nm0103977    1.225459e-04
nm4394575    1.001764e-04
                 ...     
nm7663989    9.725867e-07
nm9332717    9.725867e-07
nm2982108    9.725867e-07
nm7943733    9.725867e-07
nm2987391    9.725867e-07
Name: nconst, Length: 604546, dtype: float64

In [125]:
title_principals['category'].unique()

array(['actor', 'director', 'producer', 'editor', 'actress', 'composer',
       'cinematographer', 'writer', 'self', 'production_designer',
       'archive_footage', 'archive_sound'], dtype=object)

In [126]:

title_principals['job'].unique()

array([nan, 'producer', 'story', ..., 'Translation',
       'Introduction Narration Writer', 'planning'], dtype=object)

In [127]:
#checking for duplicates
title_principals[title_principals.duplicated(keep=False)].sort_values(by='tconst').head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters


In [128]:
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [129]:

title_principals["job"].tail()

1028181         NaN
1028182         NaN
1028183         NaN
1028184      writer
1028185    producer
Name: job, dtype: object

In [130]:
title_principals['category'].unique()

array(['actor', 'director', 'producer', 'editor', 'actress', 'composer',
       'cinematographer', 'writer', 'self', 'production_designer',
       'archive_footage', 'archive_sound'], dtype=object)

In [131]:
title_principals['ordering'].unique()

array([ 1,  2,  3, 10,  4,  5,  6,  7,  8,  9], dtype=int64)

In [132]:
#dropping a job column because about 85 percent of values are missing 
title_principals = title_principals.drop('job', axis = 1)
title_principals.isna().sum()

tconst             0
ordering           0
nconst             0
category           0
characters    634826
dtype: int64

In [133]:
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,characters
0,tt0111414,1,nm0246005,actor,"[""The Man""]"
1,tt0111414,2,nm0398271,director,
2,tt0111414,3,nm3739909,producer,
3,tt0323808,10,nm0059247,editor,
4,tt0323808,1,nm3579312,actress,"[""Beth Boothby""]"


In [134]:
title_principals["characters"].unique()

array(['["The Man"]', nan, '["Beth Boothby"]', ..., '["Makar Petrovich"]',
       '["Corpsman"]', '["Herself","Regan"]'], dtype=object)

In [135]:
title_principals['characters'] = title_principals['characters'].fillna(value="missing")

In [136]:
title_principals.isna().sum()

tconst        0
ordering      0
nconst        0
category      0
characters    0
dtype: int64