### Metadata Merge

This notebook merges the current metadata collected from Playback resources with the metadata collected from `2_parse_links` to get all necessary details. It removes all columns that are not necessary for a high-level overview of the movie.

In [1]:
import pandas as pd

In [4]:
# Directory that holds the movie table read here and the merged output written at the end.
METADATA_DIR = "../data/6_character_metadata"

# Read `movie_id` as a string so pandas does not coerce the ids to numbers.
df = pd.read_csv(
    f"{METADATA_DIR}/movies_with_ids.csv",
    dtype={"movie_id": str},
)

In [5]:
# Quick sanity check of the loaded table: dimensions first, then column names.
for overview in (df.shape, df.columns):
    print(overview)

(3567, 15)
Index(['file', 'title', 'entityType', 'runtimeSeconds', 'synopsis',
       'rating_count', 'rating', 'en_type', 'en_url', 'sdh_sub_lang', 'url',
       'dir', 'movie_id', 'match_error', 'cast_num'],
      dtype='object')


In [6]:
# Keep only the columns carried into the merged metadata table.
df_ids = df.loc[
    :,
    ["title", "movie_id", "file", "dir", "synopsis"],
]

In [7]:
# Show the reduced table (pandas truncates the rendering of long frames).
df_ids

Unnamed: 0,title,movie_id,file,dir,synopsis
0,My Fault,21909764,0_My_Fault,com,"Noah must leave her town, boyfriend and friend..."
1,On The Trail of UFOS: Dark Sky,14928972,1000_On_The_Trail_of_UFOS_Dark_Sky,com,On the Trail of UFOs: Dark Sky traces decades ...
2,Student Of The Year,2172071,1001_Student_Of_The_Year,com,"Introducing Alia Bhatt (Sharanya Singhania), S..."
3,"The Badge, The Bible and Bigfoot",11208026,1005_The_Badge_The_Bible_and_Bigfoot,com,"In a small coastal town Bigfoot is sighted, an..."
4,Sharknado 5: Global Swarming,6298780,1009_Sharknado_5_Global_Swarming,com,"With much of North America lying in ruins, the..."
...,...,...,...,...,...
3562,Lucifer (Tamil),,1299_Lucifer_Tamil,in2010s,
3563,A Place at the Table,1736049,1598_A_Place_at_the_Table,in2010s,
3564,Go Straight Take Left,8956392,2411_Go_Straight_Take_Left,in2010s,
3565,It Never Sleeps,,2580_It_Never_Sleeps,com,


In [8]:
# Root of the parse-links output; each sub-directory holds one
# `clean_meta_en_prime.csv` table.
PARSE_LINKS_DIR = "../data/2_metadata"

# Load the four tables in a fixed order and bind each to its own name.
df_com, df_before2010, df_in2010, df_after2020 = (
    pd.read_csv(f"{PARSE_LINKS_DIR}/{subdir}/clean_meta_en_prime.csv")
    for subdir in ("com", "before2010", "in2010s", "after2020")
)

In [10]:
# Inspect the columns of one parse-links table; `fname`, `year` and `link`
# are the ones used by the merge below.
df_com.columns

Index(['title', 'link', 'tags', 'year', 'clean_title', 'fname', 'short_url',
       'clean_short_url'],
      dtype='object')

In [12]:
# Fill in `year` and `link` for every movie by matching `file` against the
# `fname` column of each parse-links table, in the order listed in `dfs`.
dfs = [df_com, df_before2010, df_in2010, df_after2020]

# Work on an explicit copy: `df_temp = df_ids` would only alias the frame, so
# adding the placeholder columns would silently modify `df_ids` as well.
df_temp = df_ids.copy()
df_temp[['year', 'link']] = None

for _df in dfs:
    # Left join keeps every movie row. The incoming columns clash with the
    # placeholders, so they arrive suffixed as `year_x` / `link_x`.
    # NOTE(review): the join key is `file` only (not `dir`); a non-unique
    # `fname` inside a table would duplicate movie rows — verify.
    df_ids_merged_f = df_temp.merge(_df[['fname', 'year', 'link']], how='left', left_on='file', right_on='fname', suffixes=('', '_x')).drop('fname', axis=1)
    # Keep values found by an earlier table; only fill what is still missing.
    df_ids_merged_f['year'] = df_ids_merged_f['year'].combine_first(df_ids_merged_f['year_x'])
    df_ids_merged_f['link'] = df_ids_merged_f['link'].combine_first(df_ids_merged_f['link_x'])
    df_temp = df_ids_merged_f.drop(['year_x', 'link_x'], axis=1)
    # Progress: number of movies still lacking a year / a link after this table.
    print(df_temp['year'].isnull().sum(), df_temp['link'].isnull().sum())

df_temp

683 683
551 551
295 295
5 5


Unnamed: 0,title,movie_id,file,dir,synopsis,year,link
0,My Fault,21909764,0_My_Fault,com,"Noah must leave her town, boyfriend and friend...",2023.0,/My-Fault-Nicole-Wallace/dp/B0B683GB78/ref=sr_...
1,On The Trail of UFOS: Dark Sky,14928972,1000_On_The_Trail_of_UFOS_Dark_Sky,com,On the Trail of UFOs: Dark Sky traces decades ...,2021.0,/Trail-UFOS-Dark-Sky/dp/B09BKF2WGQ/ref=sr_1_24...
2,Student Of The Year,2172071,1001_Student_Of_The_Year,com,"Introducing Alia Bhatt (Sharanya Singhania), S...",2012.0,/Student-Year-Sidharth-Malhotra/dp/B0BZTD87WK/...
3,"The Badge, The Bible and Bigfoot",11208026,1005_The_Badge_The_Bible_and_Bigfoot,com,"In a small coastal town Bigfoot is sighted, an...",2019.0,/Badge-Bible-Bigfoot-Ashley-Wright/dp/B09JMYV8...
4,Sharknado 5: Global Swarming,6298780,1009_Sharknado_5_Global_Swarming,com,"With much of North America lying in ruins, the...",2017.0,/Sharknado-Global-Swarming-Ian-Ziering/dp/B07M...
...,...,...,...,...,...,...,...
3562,Lucifer (Tamil),,1299_Lucifer_Tamil,in2010s,,2019.0,/Lucifer-Tamil-Mohanlal/dp/B08KWR2Q2G/ref=sr_1...
3563,A Place at the Table,1736049,1598_A_Place_at_the_Table,in2010s,,2013.0,/Place-at-Table-Jeff-Bridges/dp/B00BN506KU/ref...
3564,Go Straight Take Left,8956392,2411_Go_Straight_Take_Left,in2010s,,2018.0,/Straight-Take-Left-Naveen-Richard/dp/B07HDC11...
3565,It Never Sleeps,,2580_It_Never_Sleeps,com,,2021.0,/Never-Sleeps-Laura-Swift/dp/B096HXDD4J/ref=sr...


In [13]:
# Number of movies in the merged table that have no `movie_id`.
df_temp["movie_id"].isna().sum()

441

In [14]:
# Persist the merged table next to the input metadata, without the row index.
df_temp.to_csv(
    f"{METADATA_DIR}/metadata_for_validation.csv",
    index=False,
)