# Ben's Cleaning 

In [1]:
import pandas as pd

In [2]:
# Assigning the movie gross table to variable
df_movie_gross = pd.read_csv('dataframe_id_bom_movie_gross_gz.csv')

In [3]:
df_movie_gross.head() #checking the layout and stats


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [4]:
# dropping 'studio' column due to irrelevance
df_movie_gross.drop(axis=1, columns='studio', inplace=True)

In [5]:
#finding NaNs
print(df_movie_gross.isna())
print(df_movie_gross.isna().sum())

      title  domestic_gross  foreign_gross   year
0     False           False          False  False
1     False           False          False  False
2     False           False          False  False
3     False           False          False  False
4     False           False          False  False
...     ...             ...            ...    ...
3382  False           False           True  False
3383  False           False           True  False
3384  False           False           True  False
3385  False           False           True  False
3386  False           False           True  False

[3387 rows x 4 columns]
title                0
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64


In [6]:
# missing 5 values for sutdio [dropping column anyway] 
#domestic_gross is missing 28 values might just drop those rows
#foreign_gross missing 1350 values; calculating the percentage of data missing
(df_movie_gross['foreign_gross'].isna()
                 .sum()/len(df_movie_gross['foreign_gross'])) * 100

39.85828166519043

In [7]:
# 39.85% of our foreign gross values is missing, replacing with median values
# checking for why NaN's present (did these movies not get foreign/domestic releases?)
df = df_movie_gross
is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

print(rows_with_NaN)

                                       title  domestic_gross foreign_gross  \
222                                  Flipped       1800000.0           NaN   
230               It's a Wonderful Afterlife             NaN       1300000   
254   The Polar Express (IMAX re-issue 2010)        673000.0           NaN   
267                           Tiny Furniture        392000.0           NaN   
269            Grease (Sing-a-Long re-issue)        366000.0           NaN   
...                                      ...             ...           ...   
3382                               The Quake          6200.0           NaN   
3383             Edward II (2018 re-release)          4800.0           NaN   
3384                                El Pacto          2500.0           NaN   
3385                                The Swan          2400.0           NaN   
3386                       An Actor Prepares          1700.0           NaN   

      year  
222   2010  
230   2010  
254   2010  
267   2010 

In [8]:
# Assuming that NaN's represent lack of releases in those markets, replacing NaN's with 0's
df_movie_gross['domestic_gross'].fillna(0, inplace=True)
df_movie_gross['foreign_gross'].fillna(0, inplace=True)
# Checking results
print(df_movie_gross.isna().sum())

title             0
domestic_gross    0
foreign_gross     0
year              0
dtype: int64


In [9]:
# Changing foreign_gross to int
def convert_amt_to_int(df, col):
    df[col] = df[col].str.replace("$", "").str.replace(",", "").str.replace("'","").astype('float')
    return df[col]

df_movie_gross['foreign_gross'] = convert_amt_to_int(df_movie_gross, 'foreign_gross')


In [10]:
df_movie_gross['foreign_gross']

0       652000000.0
1       691300000.0
2       664300000.0
3       535700000.0
4       513900000.0
           ...     
3382            NaN
3383            NaN
3384            NaN
3385            NaN
3386            NaN
Name: foreign_gross, Length: 3387, dtype: float64

In [11]:
# making list of all columns to change the data types


In [12]:
# importing budget csv to see if I can fill in missing data by merging

df_budget = pd.read_csv('dataframe_id_tn_movie_budgets_gz.csv')
#checking head, info, and shape


In [13]:
# dropping irrelevant columns
df_budget.drop(axis=1, columns=['id'], inplace=True)
df_budget.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 5 columns):
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: object(5)
memory usage: 226.0+ KB


In [14]:
#checking for missing values
df_budget.isna().sum()

release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [15]:
# no missing values, all relevant columns need to be converted into integers

# Ayesha's Data Cleaning for title_akas


In [16]:
df_title_akas = pd.read_csv('dataframe_id_imdb_title_akas_gz.csv')
df = df_title_akas

In [17]:
df.head()


Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0


In [18]:
# Grouping data in catagories (title id joins with df_names )
df.groupby('title')
df

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0
...,...,...,...,...,...,...,...,...
331698,tt9827784,2,Sayonara kuchibiru,,,original,,1.0
331699,tt9827784,3,Farewell Song,XWW,en,imdbDisplay,,0.0
331700,tt9880178,1,La atención,,,original,,1.0
331701,tt9880178,2,La atención,ES,,,,0.0


In [19]:
# Data not relevant 
df.groupby('title')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1182adf28>

In [20]:
df.groupby('language').mean()

Unnamed: 0_level_0,ordering,is_original_title
language,Unnamed: 1_level_1,Unnamed: 2_level_1
af,2.400000,0.0
ar,5.500000,0.0
az,19.000000,0.0
bg,9.748961,0.0
bn,2.393443,0.0
...,...,...
xh,2.000000,0.0
yi,3.500000,0.0
yue,6.489510,0.0
zh,1.500000,0.0


In [21]:
# dropping attribute column
df = df.drop('attributes', axis = 1)
df.isna().sum()

title_id                  0
ordering                  0
title                     0
region                53293
language             289988
types                163256
is_original_title        25
dtype: int64

# Ayesha's Data Cleaning for movies

In [22]:
df_movies = pd.read_csv('dataframe_id_tmdb_movies_gz.csv')
df = df_movies

In [85]:
df_movies.head()
#print(df_movies.info())
#print(df_movies.shape)

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [24]:
# not letting me groupby: genre_ids, original_title, popularity, release_date, 
# (genre id's used with df_title basics['genres'])
df.groupby('title').median() 

Unnamed: 0_level_0,Unnamed: 0,id,popularity,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"""BLESSED""",26355.0,564096.0,0.600,7.0,1.0
"""Legitimate Rape"" Pharmaceutical Ad",7815.0,283543.0,0.600,2.3,2.0
"""cherry"" - Supreme",12812.0,263765.0,0.893,10.0,1.0
#1 Cheerleader Camp,577.0,41371.0,3.277,3.6,34.0
#ALLMYMOVIES,16037.0,368247.0,0.840,9.0,4.0
...,...,...,...,...,...
纽约客@上海,5961.0,126186.0,2.416,6.0,12.0
마음의 소리 스페셜 1-효! 크러쉬,26292.0,602570.0,0.600,8.0,1.0
번개맨의 비밀,26267.0,591378.0,0.600,9.0,1.0
유병재: B의 농담,26106.0,542691.0,0.708,6.5,2.0



# Ayesha's Data Cleaning title_basics

In [25]:
df_title_basics = pd.read_csv('dataframe_id_imdb_title_basics_gz.csv')

In [26]:
print(df_title_basics.head())
print(df_title_basics.info())
print(df_title_basics.shape)

      tconst                    primary_title              original_title  \
0  tt0063540                        Sunghursh                   Sunghursh   
1  tt0066787  One Day Before the Rainy Season             Ashad Ka Ek Din   
2  tt0069049       The Other Side of the Wind  The Other Side of the Wind   
3  tt0069204                  Sabse Bada Sukh             Sabse Bada Sukh   
4  tt0100275         The Wandering Soap Opera       La Telenovela Errante   

   start_year  runtime_minutes                genres  
0        2013            175.0    Action,Crime,Drama  
1        2019            114.0       Biography,Drama  
2        2018            122.0                 Drama  
3        2018              NaN          Comedy,Drama  
4        2017             80.0  Comedy,Drama,Fantasy  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_titl

In [27]:
# tconts correlates with title_basics, title_crew, title_principals

df = df_title_basics


In [28]:
df.groupby('genres')
df_movies.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [29]:
#genres



# Alex's Cleaning

In [30]:
import pandas as pd

In [31]:
pd.read_csv('dataframe_id_bom_movie_gross_gz.csv')

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [32]:
#importing dataframe (AT dataset 1)
title_basics = pd.read_csv("dataframe_id_imdb_title_basics_gz.csv")
title_basics.head()
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             140736 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [33]:
#checking for missing values
title_basics.isna().sum()


tconst                 0
primary_title          0
original_title        21
start_year             0
runtime_minutes    31739
genres              5408
dtype: int64

In [34]:
#replacing missing runtime_minutes values with mean of existing runtime minutes
title_basics["runtime_minutes"] = title_basics['runtime_minutes'].fillna(title_basics['runtime_minutes'].mean())

In [35]:
title_basics.isna().sum()

tconst                0
primary_title         0
original_title       21
start_year            0
runtime_minutes       0
genres             5408
dtype: int64

In [36]:

#checking to see if any start year is suspicious
title_basics['start_year'].unique()

array([2013, 2019, 2018, 2017, 2012, 2010, 2011, 2015, 2021, 2016, 2014,
       2020, 2022, 2023, 2024, 2026, 2025, 2115, 2027])

In [37]:
title_basics["original_title"].unique()

array(['Sunghursh', 'Ashad Ka Ek Din', 'The Other Side of the Wind', ...,
       'Dankyavar Danka', '6 Gunn', 'Chico Albuquerque - Revelações'],
      dtype=object)

In [38]:
title_basics["primary_title"].unique()

array(['Sunghursh', 'One Day Before the Rainy Season',
       'The Other Side of the Wind', ..., 'Dankyavar Danka', '6 Gunn',
       'Chico Albuquerque - Revelações'], dtype=object)

In [39]:
title_basics["tconst"].unique()

array(['tt0063540', 'tt0066787', 'tt0069049', ..., 'tt9916706',
       'tt9916730', 'tt9916754'], dtype=object)

In [40]:
#checking for duplicates
duplicates = title_basics[title_basics.duplicated(subset='tconst')]
print(len(duplicates))
duplicates.tail()

0


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres


In [41]:
title_basics.isna().sum()

tconst                0
primary_title         0
original_title       21
start_year            0
runtime_minutes       0
genres             5408
dtype: int64

In [42]:
#replacing missing values for genres with string "missing"
title_basics['genres'] = title_basics['genres'].fillna(value="missing")

In [43]:
title_basics.isna().sum()

tconst              0
primary_title       0
original_title     21
start_year          0
runtime_minutes     0
genres              0
dtype: int64

In [44]:

title_basics.start_year.value_counts(normalize=True)

2017    0.119772
2016    0.118185
2018    0.115290
2015    0.111144
2014    0.106669
2013    0.100647
2012    0.094338
2011    0.088269
2010    0.081078
2019    0.057334
2020    0.006411
2021    0.000568
2022    0.000219
2023    0.000034
2024    0.000014
2027    0.000007
2026    0.000007
2025    0.000007
2115    0.000007
Name: start_year, dtype: float64

In [45]:

#removing all rows associated with starting year greater than 2020
title_basics = title_basics[title_basics.start_year < 2020]
title_basics["start_year"].max()
title_basics.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,86.187247,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [87]:
# Splitting genres and keeping separated by commas to look cleaner (Ayesha)
title_basics['genres'] = title_basics['genres'].apply(lambda x: x.split(",") if x else x)
title_basics.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"[Action, Crime, Drama]"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"[Biography, Drama]"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,[Drama]
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,86.187247,"[Comedy, Drama]"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"[Comedy, Drama, Fantasy]"


In [46]:
#importing dataframe (AT Dataset 2)
title_crew = pd.read_csv("dataframe_id_imdb_title_crew_gz.csv")
title_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [47]:
title_crew.isna().sum()

tconst           0
directors     5727
writers      35883
dtype: int64

In [48]:
duplicates2 = title_crew[title_crew.duplicated(subset='tconst')]
print(len(duplicates2))
duplicates2.tail()

0


Unnamed: 0,tconst,directors,writers


In [49]:
title_crew['tconst'].unique()

array(['tt0285252', 'tt0438973', 'tt0462036', ..., 'tt9001494',
       'tt9004986', 'tt9010172'], dtype=object)

In [50]:
title_crew['directors'].unique()

array(['nm0899854', nan, 'nm1940585', ..., 'nm10122357', 'nm6711477',
       'nm10123242,nm10123248'], dtype=object)

In [51]:
title_crew['writers'].unique()

array(['nm0899854', 'nm0175726,nm1802864', 'nm1940585', ..., 'nm6711477',
       'nm4993825', 'nm8352242'], dtype=object)

In [52]:
title_crew.directors.value_counts(normalize=True)

nm3266654              0.000442
nm5592581              0.000342
nm2682776              0.000342
nm3583561              0.000328
nm0183659              0.000313
                         ...   
nm8181644,nm4707876    0.000007
nm0371955              0.000007
nm1363530,nm3867074    0.000007
nm0139584,nm0915930    0.000007
nm3365641              0.000007
Name: directors, Length: 98525, dtype: float64

In [53]:
title_crew.writers.value_counts(normalize=True)

nm0000636    0.000726
nm2682776    0.000435
nm3266654    0.000417
nm3583561    0.000363
nm0772905    0.000308
               ...   
nm7106378    0.000009
nm9247789    0.000009
nm4442994    0.000009
nm3563781    0.000009
nm5453829    0.000009
Name: writers, Length: 91920, dtype: float64

In [54]:
title_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
tconst       146144 non-null object
directors    140417 non-null object
writers      110261 non-null object
dtypes: object(3)
memory usage: 3.3+ MB


In [55]:
title_crew.describe

<bound method NDFrame.describe of            tconst                      directors              writers
0       tt0285252                      nm0899854            nm0899854
1       tt0438973                            NaN  nm0175726,nm1802864
2       tt0462036                      nm1940585            nm1940585
3       tt0835418                      nm0151540  nm0310087,nm0841532
4       tt0878654  nm0089502,nm2291498,nm2292011            nm0284943
...           ...                            ...                  ...
146139  tt8999974                     nm10122357           nm10122357
146140  tt9001390                      nm6711477            nm6711477
146141  tt9001494          nm10123242,nm10123248                  NaN
146142  tt9004986                      nm4993825            nm4993825
146143  tt9010172                            NaN            nm8352242

[146144 rows x 3 columns]>

In [56]:
#replacing missing values for directors with string "missing"
title_crew['directors'] = title_crew['directors'].fillna(value="missing")

In [57]:

#replacing missing values for writers with string "missing"
title_crew['writers'] = title_crew['writers'].fillna(value="missing")

In [58]:
title_crew.isna().sum()

tconst       0
directors    0
writers      0
dtype: int64

In [59]:
#checking to see if any directors appear disproportionally indicating a placeholder value
title_crew.directors.value_counts(normalize=True)


missing                0.039187
nm3266654              0.000424
nm5592581              0.000328
nm2682776              0.000328
nm3583561              0.000315
                         ...   
nm8181644,nm4707876    0.000007
nm0371955              0.000007
nm1363530,nm3867074    0.000007
nm0139584,nm0915930    0.000007
nm3365641              0.000007
Name: directors, Length: 98526, dtype: float64

In [60]:
#checking to see if any writers appear disproportionally indicating a placeholder value
title_crew.writers.value_counts(normalize=True)

missing      0.245532
nm0000636    0.000547
nm2682776    0.000328
nm3266654    0.000315
nm3583561    0.000274
               ...   
nm7106378    0.000007
nm9247789    0.000007
nm4442994    0.000007
nm3563781    0.000007
nm5453829    0.000007
Name: writers, Length: 91921, dtype: float64

In [61]:
#importing a dataframe (AT Dataframe 3)
title_principals = pd.read_csv("dataframe_id_imdb_title_principals_gz.csv")
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [62]:

title_principals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


In [63]:
#checking for missing values
title_principals.isna().sum()

tconst             0
ordering           0
nconst             0
category           0
job           850502
characters    634826
dtype: int64

In [64]:
title_principals.tconst.value_counts(normalize=True)

tt6096500     9.725867e-06
tt1503777     9.725867e-06
tt8262802     9.725867e-06
tt3350790     9.725867e-06
tt2377398     9.725867e-06
                  ...     
tt8045892     9.725867e-07
tt2953276     9.725867e-07
tt7258450     9.725867e-07
tt10429390    9.725867e-07
tt2271423     9.725867e-07
Name: tconst, Length: 143454, dtype: float64

In [65]:
title_principals.nconst.value_counts(normalize=True)

nm1930572    3.676378e-04
nm0000636    1.556139e-04
nm0000616    1.439428e-04
nm0103977    1.225459e-04
nm4394575    1.001764e-04
                 ...     
nm5571265    9.725867e-07
nm0335504    9.725867e-07
nm5285572    9.725867e-07
nm3366061    9.725867e-07
nm8020604    9.725867e-07
Name: nconst, Length: 604546, dtype: float64

In [66]:
title_principals['category'].unique()

array(['actor', 'director', 'producer', 'editor', 'actress', 'composer',
       'cinematographer', 'writer', 'self', 'production_designer',
       'archive_footage', 'archive_sound'], dtype=object)

In [67]:

title_principals['job'].unique()

array([nan, 'producer', 'story', ..., 'Translation',
       'Introduction Narration Writer', 'planning'], dtype=object)

In [68]:
#checking for duplicates
title_principals[title_principals.duplicated(keep=False)].sort_values(by='tconst').head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters


In [69]:
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [70]:

title_principals["job"].tail()

1028181         NaN
1028182         NaN
1028183         NaN
1028184      writer
1028185    producer
Name: job, dtype: object

In [71]:
title_principals['category'].unique()

array(['actor', 'director', 'producer', 'editor', 'actress', 'composer',
       'cinematographer', 'writer', 'self', 'production_designer',
       'archive_footage', 'archive_sound'], dtype=object)

In [72]:
title_principals['ordering'].unique()

array([ 1,  2,  3, 10,  4,  5,  6,  7,  8,  9])

In [73]:
#dropping a job column because about 85 percent of values are missing 
title_principals = title_principals.drop('job', axis = 1)
title_principals.isna().sum()

tconst             0
ordering           0
nconst             0
category           0
characters    634826
dtype: int64

In [74]:
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,characters
0,tt0111414,1,nm0246005,actor,"[""The Man""]"
1,tt0111414,2,nm0398271,director,
2,tt0111414,3,nm3739909,producer,
3,tt0323808,10,nm0059247,editor,
4,tt0323808,1,nm3579312,actress,"[""Beth Boothby""]"


In [75]:
title_principals["characters"].unique()

array(['["The Man"]', nan, '["Beth Boothby"]', ..., '["Makar Petrovich"]',
       '["Corpsman"]', '["Herself","Regan"]'], dtype=object)

In [76]:
title_principals['characters'] = title_principals['characters'].fillna(value="missing")

In [77]:
title_principals.isna().sum()

tconst        0
ordering      0
nconst        0
category      0
characters    0
dtype: int64

# Ben's Cleaning relevant to Alex's project

In [78]:
# Correlates with title.crew
df_names = pd.read_csv('dataframe_id_imdb_name_basics_gz.csv')

df_names.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [79]:
# use split funtion

# Ayesha cleaning title_ratings

In [80]:
df_title_ratings = pd.read_csv('dataframe_id_imdb_title_ratings_gz.csv')

In [81]:
df_title_ratings.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [82]:
# Data not relevent, don't use table 

In [109]:
# Doing a join between the df_movies and title_basics tables to combine genre with genre ID's
joined_df = title_basics.join(df_movies, how='outer', lsuffix='_left', rsuffix='_right')
joined_df

Unnamed: 0.1,tconst,primary_title,original_title_left,start_year,runtime_minutes,genres,Unnamed: 0,genre_ids,id,original_language,original_title_right,popularity,release_date,title,vote_average,vote_count
0,tt0063540,Sunghursh,Sunghursh,2013.0,175.000000,"[Action, Crime, Drama]",0.0,"[12, 14, 10751]",12444.0,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788.0
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019.0,114.000000,"[Biography, Drama]",1.0,"[14, 12, 16, 10751]",10191.0,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610.0
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018.0,122.000000,[Drama],2.0,"[12, 28, 878]",10138.0,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368.0
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018.0,86.187247,"[Comedy, Drama]",3.0,"[16, 35, 10751]",862.0,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174.0
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017.0,80.000000,"[Comedy, Drama, Fantasy]",4.0,"[28, 878, 12]",27205.0,en,Inception,27.920,2010-07-16,Inception,8.3,22186.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019.0,123.000000,[Drama],,,,,,,,,,
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015.0,86.187247,[Documentary],,,,,,,,,,
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013.0,86.187247,[Comedy],,,,,,,,,,
146142,tt9916730,6 Gunn,6 Gunn,2017.0,116.000000,[missing],,,,,,,,,,
