### Metadata Merge

This notebook merges the current metadata collected from Playback resources with the metadata produced by `2_parse_links`, so that each movie has all the necessary details. It keeps only the columns needed for a high-level overview of each movie and drops the rest.

In [1]:
import pandas as pd

In [3]:
# Directory holding the character-metadata stage; also used later as the output location.
METADATA_DIR = "../data/6_character_metadata"

# Read IMDb ids as strings so values such as '0065126' keep their leading zeros.
df = pd.read_csv(
    f"{METADATA_DIR}/movies_with_ids.csv",
    dtype={"imdb_id": str},
)

In [4]:
# Quick sanity check of the loaded frame: (rows, columns) followed by the column index.
print(df.shape, df.columns, sep="\n")

(3570, 15)
Index(['file', 'title', 'entityType', 'runtimeSeconds', 'synopsis',
       'rating_count', 'rating', 'en_type', 'en_url', 'sdh_sub_lang', 'url',
       'dir', 'imdb_id', 'match_error', 'cast_num'],
      dtype='object')


In [5]:
# Columns kept for the high-level overview; everything else in `df` is left behind.
id_columns = ['title', 'imdb_id', 'file', 'dir', 'synopsis']
df_ids = df.loc[:, id_columns]

In [6]:
# Display the reduced frame (title, imdb_id, file, dir, synopsis) for a visual check.
df_ids

Unnamed: 0,title,imdb_id,file,dir,synopsis
0,My Fault,21909764,0_My_Fault,com,"Noah must leave her town, boyfriend and friend..."
1,On The Trail of UFOS: Dark Sky,14928972,1000_On_The_Trail_of_UFOS_Dark_Sky,com,On the Trail of UFOs: Dark Sky traces decades ...
2,Student Of The Year,2172071,1001_Student_Of_The_Year,com,"Introducing Alia Bhatt (Sharanya Singhania), S..."
3,"The Badge, The Bible and Bigfoot",11208026,1005_The_Badge_The_Bible_and_Bigfoot,com,"In a small coastal town Bigfoot is sighted, an..."
4,Sharknado 5: Global Swarming,6298780,1009_Sharknado_5_Global_Swarming,com,"With much of North America lying in ruins, the..."
...,...,...,...,...,...
3565,It Never Sleeps,,2580_It_Never_Sleeps,com,
3566,True Grit (1969),0065126,5659_True_Grit_1969,com,
3567,Bharaate,8811292,1398_Bharaate,in2010s,"When Jagan meets Radha, it is love at first si..."
3568,Katha Sangama,7315232,1839_Katha_Sangama,in2010s,Kathasangama is an anthology movie comprising ...


In [7]:
# Root of the parsed-link metadata; each subdirectory holds one `clean_meta_en_prime.csv`.
PARSE_LINKS_DIR = "../data/2_metadata"

# Load the four per-subdirectory tables in a fixed order and bind them to their names.
df_com, df_before2010, df_in2010, df_after2020 = (
    pd.read_csv(f"{PARSE_LINKS_DIR}/{subdir}/clean_meta_en_prime.csv")
    for subdir in ("com", "before2010", "in2010s", "after2020")
)

In [8]:
# Inspect which columns the `com` lookup table provides before merging on `file`.
df_com.columns

Index(['title', 'link', 'tags', 'year', 'clean_title', 'file', 'short_url',
       'clean_short_url'],
      dtype='object')

In [16]:
# Lookup tables to consult, in priority order: a value found in an earlier table
# is kept and later tables only fill rows that are still missing.
dfs = [df_com, df_before2010, df_in2010, df_after2020]

# Build a new frame instead of aliasing `df_ids`: assigning the placeholder
# columns on an alias would mutate `df_ids` itself (and it is a slice of `df`).
df_temp = df_ids.assign(year=None, link=None)

for _df in dfs:
    # Keep one row per `file` on the right-hand side; a left join against
    # duplicated keys would multiply the matching rows of `df_temp`.
    lookup = _df[['file', 'year', 'link']].drop_duplicates(subset='file')
    df_ids_merged_f = df_temp.merge(lookup, how='left', on='file', suffixes=('', '_x'))

    # Prefer the value already present; fall back to this lookup's value (`*_x`).
    for col in ('year', 'link'):
        df_ids_merged_f[col] = df_ids_merged_f[col].combine_first(df_ids_merged_f[f'{col}_x'])

    df_temp = df_ids_merged_f.drop(['year_x', 'link_x'], axis=1)

    # Progress check: number of rows still missing `year` / `link` after this lookup.
    print(df_temp['year'].isnull().sum(), df_temp['link'].isnull().sum())

# The enrichment must be row-preserving: exactly one output row per input row.
assert len(df_temp) == len(df_ids), "merge changed the number of rows"

df_temp

686 686
554 554
295 295
5 5


Unnamed: 0,title,imdb_id,file,dir,synopsis,year,link
0,My Fault,21909764,0_My_Fault,com,"Noah must leave her town, boyfriend and friend...",2023.0,/My-Fault-Nicole-Wallace/dp/B0B683GB78/ref=sr_...
1,On The Trail of UFOS: Dark Sky,14928972,1000_On_The_Trail_of_UFOS_Dark_Sky,com,On the Trail of UFOs: Dark Sky traces decades ...,2021.0,/Trail-UFOS-Dark-Sky/dp/B09BKF2WGQ/ref=sr_1_24...
2,Student Of The Year,2172071,1001_Student_Of_The_Year,com,"Introducing Alia Bhatt (Sharanya Singhania), S...",2012.0,/Student-Year-Sidharth-Malhotra/dp/B0BZTD87WK/...
3,"The Badge, The Bible and Bigfoot",11208026,1005_The_Badge_The_Bible_and_Bigfoot,com,"In a small coastal town Bigfoot is sighted, an...",2019.0,/Badge-Bible-Bigfoot-Ashley-Wright/dp/B09JMYV8...
4,Sharknado 5: Global Swarming,6298780,1009_Sharknado_5_Global_Swarming,com,"With much of North America lying in ruins, the...",2017.0,/Sharknado-Global-Swarming-Ian-Ziering/dp/B07M...
...,...,...,...,...,...,...,...
3565,It Never Sleeps,,2580_It_Never_Sleeps,com,,2021.0,/Never-Sleeps-Laura-Swift/dp/B096HXDD4J/ref=sr...
3566,True Grit (1969),0065126,5659_True_Grit_1969,com,,1969.0,/True-Grit-John-Wayne/dp/B000ID1VYS/ref=sr_1_9...
3567,Bharaate,8811292,1398_Bharaate,in2010s,"When Jagan meets Radha, it is love at first si...",2019.0,/Bharaate-4K-UHD-Srii-Murali/dp/B0824LQBZT/ref...
3568,Katha Sangama,7315232,1839_Katha_Sangama,in2010s,Kathasangama is an anthology movie comprising ...,2019.0,/Katha-Sangama-Rishab-Shetty/dp/B083365FWG/ref...


In [19]:
# Number of rows in the merged frame that have no IMDb id.
df_temp['imdb_id'].isna().sum()

441

In [20]:
# Write the merged metadata next to the input file, without the positional index.
validation_csv_path = f"{METADATA_DIR}/metadata_for_validation.csv"
df_temp.to_csv(validation_csv_path, index=False)