In [1]:
import urllib
import urllib.parse

import numpy as np
import pandas as pd
from unidecode import unidecode

In [4]:
# Load the raw metadata CSV. The file has no header row and is decoded as
# Windows-1252 (cp1252).
raw_csv_path = "./metadata/com/meta_en_prime.csv"
df = pd.read_csv(raw_csv_path, header=None, encoding='cp1252')

In [5]:
# Inspect the last rows; because the file was read with header=None the
# columns are still the positional labels 0, 1 and 2 at this point.
df.tail()

Unnamed: 0,0,1,2
6379,Cabin 28,/Cabin-28-Lee-Bane/dp/B079WDP9CP/ref=sr_1_1580...,2017|NR|CC
6380,The Perfect Victim,/Perfect-Victim-Elizabeth-Rohrbaugh/dp/B07749H...,2013|CC
6381,EVANGELION:3.33 YOU CAN (NOT) REDO.,/EVANGELION-3-33-YOU-CAN-REDO/dp/B08SVTV9DQ/re...,2013|CC
6382,God's Miracles,/Gods-Miracles-Bryant-Gumbel/dp/B0763TBQXY/ref...,1982|CC
6383,Philomena,/Philomena-Judi-Dench/dp/B00IIVK3ZU/ref=sr_1_1...,2013|PG-13|CC


In [6]:
# The file carries no header, so give the three positional columns names.
column_names = ['title', 'link', 'tags']
df.columns = column_names

In [9]:
# Number of rows and columns loaded from the raw file.
df.shape

(6384, 3)

### First, we'll transliterate each title to ASCII, keep only alphanumeric characters, and replace spaces with underscores

In [10]:
# Step 1: transliterate each title to plain ASCII (accented letters lose their accents).
ascii_titles = df['title'].map(unidecode)
# Step 2: drop every character that is not a letter, a digit or a space.
alnum_titles = ascii_titles.str.replace("[^0-9a-zA-Z ]", "", regex=True)
# Step 3: turn the remaining spaces into underscores.
df['clean_title'] = alnum_titles.str.replace(" ", "_")

In [11]:
# Spot-check one row whose title contains an accented character.
df.loc[435]

title                                             Los sonámbulos
link           /Los-son%C3%A1mbulos-%C3%89rica-Rivas/dp/B0918...
tags                                                      2019|R
clean_title                                       Los_sonambulos
Name: 435, dtype: object

We'll also prefix each cleaned title with its row index so that the resulting names are unique even when titles repeat.

In [12]:
# Build "<row number>_<clean title>" so that repeated titles still get
# distinct file names.
row_numbers = pd.Series(np.arange(len(df)), dtype='str')
df['fname'] = row_numbers + "_" + df['clean_title']

In [13]:
# Check the new clean_title and fname columns on the first rows.
df.head()

Unnamed: 0,title,link,tags,clean_title,fname
0,My Fault,/My-Fault-Nicole-Wallace/dp/B0B683GB78/ref=sr_...,2023|CC,My_Fault,0_My_Fault
1,Jurassic World Dominion,/Jurassic-World-Dominion-Chris-Pratt/dp/B0B5NN...,2022|PG-13|CC,Jurassic_World_Dominion,1_Jurassic_World_Dominion
2,Top Gun: Maverick,/Top-Gun-Maverick-Tom-Cruise/dp/B0B214XZB4/ref...,2022|PG-13|CC,Top_Gun_Maverick,2_Top_Gun_Maverick
3,Creed III,/Creed-III-Michael-B-Jordan/dp/B0B8TKJCRQ/ref=...,2023|PG-13|CC,Creed_III,3_Creed_III
4,Sonic the Hedgehog 2,/Sonic-Hedgehog-2-James-Marsden/dp/B09ZRVBZG7/...,2022|PG|CC,Sonic_the_Hedgehog_2,4_Sonic_the_Hedgehog_2


In [14]:
# Rows whose title already appeared in an earlier row (first occurrences are
# not listed).
repeated_title_mask = df.duplicated(subset=['title'])
df[repeated_title_mask]

Unnamed: 0,title,link,tags,clean_title,fname
867,Blackbird,/Blackbird-Susan-Sarandon/dp/B09P1MH4V7/ref=sr...,2020|R|CC,Blackbird,867_Blackbird
1250,Global Meltdown,/Global-Meltdown-Michael-Pare/dp/B0B66791WP/re...,2018|TV-MA|CC,Global_Meltdown,1250_Global_Meltdown
1413,The Lost World,/Lost-World-John-Rhys-Davies/dp/B077YZHZYD/ref...,1992|CC,The_Lost_World,1413_The_Lost_World
1582,Payback,/Payback-Matt-Levett/dp/B08TLP11CP/ref=sr_1_30...,2021|CC,Payback,1582_Payback
2009,His Girl Friday,/His-Girl-Friday-Cary-Grant/dp/B099D8SF1F/ref=...,1940|TV-14|CC,His_Girl_Friday,2009_His_Girl_Friday
...,...,...,...,...,...
6146,Goodbye World,/Goodbye-World-Adrian-Grenier/dp/B0755CXJ95/re...,2014|TV-14|CC,Goodbye_World,6146_Goodbye_World
6198,One Body Too Many,/One-Body-Many-Jack-Haley/dp/B0B8X5WR6X/ref=sr...,2023|CC,One_Body_Too_Many,6198_One_Body_Too_Many
6216,Jigsaw,/Jigsaw-4K-UHD-Hannah-Anderson/dp/B07WHL775C/r...,2017|R|CC,Jigsaw,6216_Jigsaw
6282,Abandoned,/Abandoned-Dominic-Purcell/dp/B076279G5R/ref=s...,2016|CC,Abandoned,6282_Abandoned


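Several entries can share the same title. I checked the tags as well, but they are not a reliable way to tell entries apart: the same movie can be listed with slightly different tags. However, if the `short_url` (the first path segment of the link, extracted below) also matches, the entry is likely a duplicate.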

In [15]:
# Example: every listing that carries the title "Abandoned".
is_abandoned = df['title'].eq("Abandoned")
df.loc[is_abandoned]

Unnamed: 0,title,link,tags,clean_title,fname
954,Abandoned,/Abandoned-Brittany-Murphy/dp/B0040J2VQ8/ref=s...,2010|PG-13|CC,Abandoned,954_Abandoned
2798,Abandoned,/Abandoned-Dominic-Purcell/dp/B01D97X1DS/ref=s...,2016|TV-14|CC,Abandoned,2798_Abandoned
6282,Abandoned,/Abandoned-Dominic-Purcell/dp/B076279G5R/ref=s...,2016|CC,Abandoned,6282_Abandoned


In [16]:
# Keep only the first "/.../" segment of the link, i.e. the slug that
# precedes "/dp/". expand=False makes extract return a Series directly.
df['short_url'] = df['link'].str.extract("(/[^/]*/)", expand=False)

Some entries share the same `short_url` but are different movies, so the `short_url` alone is not enough. To validate a duplicate, we check the title together with the `short_url`.

In [17]:
# Example of two different titles that share one short_url.
shares_slug = df["short_url"].eq("/Stalked-My-Doctor-Eric-Roberts/")
df.loc[shares_slug]

Unnamed: 0,title,link,tags,clean_title,fname,short_url
109,Stalked By My Doctor: The Return,/Stalked-My-Doctor-Eric-Roberts/dp/B01LWIKCNZ/...,2016|CC,Stalked_By_My_Doctor_The_Return,109_Stalked_By_My_Doctor_The_Return,/Stalked-My-Doctor-Eric-Roberts/
5835,Stalked By My Doctor,/Stalked-My-Doctor-Eric-Roberts/dp/B089Y8G8SN/...,2015|TV-14|CC,Stalked_By_My_Doctor,5835_Stalked_By_My_Doctor,/Stalked-My-Doctor-Eric-Roberts/


In [18]:
# Rows that repeat an earlier (title, short_url) pair; the first occurrence of
# each pair is not part of `dups`.
repeated_pair_mask = df.duplicated(subset=['title', 'short_url'])
dups = df[repeated_pair_mask]

In [19]:
# How many rows were flagged as repeats of an earlier (title, short_url) pair.
dups.shape

(32, 6)

Let's check some of them.

In [66]:
# Manual review helper, left disabled: for each flagged duplicate it prints
# every row of `df` with the same short_url and then waits for Enter.
# `input()` blocks, so enabling it would stall a non-interactive "Run All".
# for url in dups['short_url']:
#     print(df.loc[df['short_url'] == url])
#     input()

Some of the cases found:
- *The Little Princess* is listed with two different dates, but the newer entry is just a remastered version.
- *Virginia* is listed as two entries, but they are the same movie.

There were many such cases, so matching on both `short_url` and title seems to be a good heuristic for removing duplicates.

In [20]:
# Keep the first row of every (title, short_url) pair and drop later repeats.
df_clean = df.drop_duplicates(subset=['title', 'short_url'])

In [21]:
# Compare row counts before and after dropping the duplicate pairs.
df.shape, df_clean.shape

((6384, 6), (6352, 6))

We removed 32 duplicates.

However, some titles still appear more than once, so let's inspect the remaining entries with duplicate titles.

In [22]:
# Titles that still occur more than once after the (title, short_url) pass.
still_repeated_mask = df_clean.duplicated(subset=['title'])
df_clean[still_repeated_mask]

Unnamed: 0,title,link,tags,clean_title,fname,short_url
1250,Global Meltdown,/Global-Meltdown-Michael-Pare/dp/B0B66791WP/re...,2018|TV-MA|CC,Global_Meltdown,1250_Global_Meltdown,/Global-Meltdown-Michael-Pare/
1413,The Lost World,/Lost-World-John-Rhys-Davies/dp/B077YZHZYD/ref...,1992|CC,The_Lost_World,1413_The_Lost_World,/Lost-World-John-Rhys-Davies/
1582,Payback,/Payback-Matt-Levett/dp/B08TLP11CP/ref=sr_1_30...,2021|CC,Payback,1582_Payback,/Payback-Matt-Levett/
2056,Awakened,/Awakened-JulianneMichelle/dp/B018JNSEXO/ref=s...,2014|NR|CC,Awakened,2056_Awakened,/Awakened-JulianneMichelle/
2330,The Dark,/Dark-Maria-Bello/dp/B06XSGLDC4/ref=sr_1_363?q...,2006|R|CC,The_Dark,2330_The_Dark,/Dark-Maria-Bello/
...,...,...,...,...,...,...
6087,Charade,/Charade-Audrey-Hepburn/dp/B07G1DSL2J/ref=sr_1...,1963|CC,Charade,6087_Charade,/Charade-Audrey-Hepburn/
6089,A Mother's Revenge,/Mothers-Revenge-Jamie-Luner/dp/B01NBAZM9G/ref...,2016|CC,A_Mothers_Revenge,6089_A_Mothers_Revenge,/Mothers-Revenge-Jamie-Luner/
6198,One Body Too Many,/One-Body-Many-Jack-Haley/dp/B0B8X5WR6X/ref=sr...,2023|CC,One_Body_Too_Many,6198_One_Body_Too_Many,/One-Body-Many-Jack-Haley/
6216,Jigsaw,/Jigsaw-4K-UHD-Hannah-Anderson/dp/B07WHL775C/r...,2017|R|CC,Jigsaw,6216_Jigsaw,/Jigsaw-4K-UHD-Hannah-Anderson/


In [23]:
# Example: the two remaining "Global Meltdown" rows, whose short_urls differ
# only in how the accented character is written.
is_global_meltdown = df_clean['title'].eq("Global Meltdown")
df_clean.loc[is_global_meltdown]

Unnamed: 0,title,link,tags,clean_title,fname,short_url
940,Global Meltdown,/Global-Meltdown-Michael-Par%C3%A9/dp/B07JFTCZ...,2018|TV-MA|CC,Global_Meltdown,940_Global_Meltdown,/Global-Meltdown-Michael-Par%C3%A9/
1250,Global Meltdown,/Global-Meltdown-Michael-Pare/dp/B0B66791WP/re...,2018|TV-MA|CC,Global_Meltdown,1250_Global_Meltdown,/Global-Meltdown-Michael-Pare/


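In cases like the one above, the two links differ only in how an accented character is written (`Par%C3%A9` vs. `Pare`). To catch them, we need to percent-decode (unquote) the `short_url` and then transliterate it to ASCII with `unidecode`.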

In [24]:
# Normalize the short URL so that different spellings of the same slug compare
# equal: percent-escapes are decoded first ("Par%C3%A9" -> "Paré") and the
# result is then transliterated to ASCII ("Paré" -> "Pare").
#
# `assign` returns a new DataFrame, so `df_clean` itself is left untouched.
#
# NOTE: `urllib.parse` is a submodule and must be imported explicitly
# (`import urllib.parse`); a bare `import urllib` only works when some other
# library happens to have loaded the submodule already.
df_clean_copy = df_clean.assign(
    clean_short_url=df_clean['short_url'].map(
        lambda slug: unidecode(urllib.parse.unquote(slug))
    )
)

In [25]:
# Check the new clean_short_url column on the first rows.
df_clean_copy.head()

Unnamed: 0,title,link,tags,clean_title,fname,short_url,clean_short_url
0,My Fault,/My-Fault-Nicole-Wallace/dp/B0B683GB78/ref=sr_...,2023|CC,My_Fault,0_My_Fault,/My-Fault-Nicole-Wallace/,/My-Fault-Nicole-Wallace/
1,Jurassic World Dominion,/Jurassic-World-Dominion-Chris-Pratt/dp/B0B5NN...,2022|PG-13|CC,Jurassic_World_Dominion,1_Jurassic_World_Dominion,/Jurassic-World-Dominion-Chris-Pratt/,/Jurassic-World-Dominion-Chris-Pratt/
2,Top Gun: Maverick,/Top-Gun-Maverick-Tom-Cruise/dp/B0B214XZB4/ref...,2022|PG-13|CC,Top_Gun_Maverick,2_Top_Gun_Maverick,/Top-Gun-Maverick-Tom-Cruise/,/Top-Gun-Maverick-Tom-Cruise/
3,Creed III,/Creed-III-Michael-B-Jordan/dp/B0B8TKJCRQ/ref=...,2023|PG-13|CC,Creed_III,3_Creed_III,/Creed-III-Michael-B-Jordan/,/Creed-III-Michael-B-Jordan/
4,Sonic the Hedgehog 2,/Sonic-Hedgehog-2-James-Marsden/dp/B09ZRVBZG7/...,2022|PG|CC,Sonic_the_Hedgehog_2,4_Sonic_the_Hedgehog_2,/Sonic-Hedgehog-2-James-Marsden/,/Sonic-Hedgehog-2-James-Marsden/


Let's check for duplicates again, this time using the title together with the normalized `clean_short_url`.

In [26]:
# Rows repeating an earlier (title, clean_short_url) pair once the URL has
# been normalized.
normalized_repeat_mask = df_clean_copy.duplicated(subset=['title', 'clean_short_url'])
df_clean_copy[normalized_repeat_mask]

Unnamed: 0,title,link,tags,clean_title,fname,short_url,clean_short_url
1250,Global Meltdown,/Global-Meltdown-Michael-Pare/dp/B0B66791WP/re...,2018|TV-MA|CC,Global_Meltdown,1250_Global_Meltdown,/Global-Meltdown-Michael-Pare/,/Global-Meltdown-Michael-Pare/


There was only one such entry, so we'll just remove it.

In [27]:
# Keep the first row of every (title, clean_short_url) pair.
df_clean_copy = df_clean_copy.drop_duplicates(subset=['title', 'clean_short_url'])

In [28]:
# Final row and column count after both de-duplication passes.
df_clean_copy.shape

(6351, 7)

In [29]:
# Save the de-duplicated metadata; the DataFrame index is not written out.
clean_csv_path = "./metadata/com/clean_meta_en_prime.csv"
df_clean_copy.to_csv(clean_csv_path, index=False)