In [31]:
import os
import re
from collections import Counter

In [32]:

# CLEANING FUNCTION 

def clean_movie(name):
    """Normalize a raw movie title so that variants of it compare equal.

    Steps, in order:
      1. trim surrounding whitespace and lower-case the text;
      2. drop every character that is neither a word character nor whitespace
         (stars, commas, brackets, ...);
      3. drop standalone four-digit numbers, treated as release years
         such as "(2025)" or "1999";
      4. collapse every run of whitespace into a single space.

    If step 3 would erase the whole title (e.g. the movie "1917"), the
    numbers are kept so the entry is not silently lost.

    Parameters:
        name (str): one raw line / title.

    Returns:
        str: the normalized title; "" when nothing usable remains.
    """
    name = name.strip().lower()

    # remove stars, commas, extra symbols
    name = re.sub(r"[^\w\s]", "", name)

    # remove years like (2025) or 1999 -- the brackets are already gone here
    without_years = re.sub(r"\b\d{4}\b", "", name)

    # keep titles that consist only of year-like numbers ("1917", "2012")
    if without_years.strip():
        name = without_years

    # removing a year from the middle of a title leaves a double space;
    # split/join collapses those and trims both ends in one pass
    return " ".join(name.split())


# READ ONE FILE

def read_file(path):
    """Return the cleaned movie titles stored in a text file, one per line.

    Each line is passed through clean_movie(); lines that clean down to an
    empty string are left out. Order and repeated titles are preserved.

    Parameters:
        path (str): location of a UTF-8 encoded text file.

    Returns:
        list[str]: cleaned titles in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        cleaned_titles = [clean_movie(raw_line) for raw_line in handle]

    # discard lines that had nothing left after cleaning
    return [title for title in cleaned_titles if title]

In [33]:

# FOLDERS

watched_folder = "watched-movies"
unwatched_folder = "unwatched-movies"

# Keep only regular files, in sorted order: os.listdir() also returns
# sub-directories (such as Jupyter's .ipynb_checkpoints) and makes no
# promise about ordering, so the raw listing can break open() later on
# and reshuffle the printed reports between runs.
watched_files = sorted(
    entry for entry in os.listdir(watched_folder)
    if os.path.isfile(os.path.join(watched_folder, entry))
)
unwatched_files = sorted(
    entry for entry in os.listdir(unwatched_folder)
    if os.path.isfile(os.path.join(unwatched_folder, entry))
)

In [34]:


# DUPLICATES INSIDE EACH FILE
# For every list file in both folders, print the titles that occur more
# than once after cleaning.

print("\n=========== DUPLICATES INSIDE FILES ===========")

for folder in [watched_folder, unwatched_folder]:
    # sorted() keeps the report order stable; os.listdir() order is arbitrary
    for file in sorted(os.listdir(folder)):

        path = os.path.join(folder, file)

        # skip sub-directories (e.g. .ipynb_checkpoints): open() would fail on them
        if not os.path.isfile(path):
            continue

        movies = read_file(path)

        # Counter keeps first-seen order, so duplicates are listed in file order
        counts = Counter(movies)
        duplicates = [m for m, c in counts.items() if c > 1]

        if duplicates:
            print(f"\n{folder}/{file}")
            for d in duplicates:
                print("  ", d.title())



watched-movies/english.txt
   The Lion King
   The Mummy

watched-movies/hindi.txt
   Bade Miyan Chote Miyan


In [35]:
# COMMON BETWEEN MATCHING FILES
# For each file name present in both folders, print the titles that appear
# in the watched list and in the unwatched list at the same time.

print("\n=========== WATCHED vs UNWATCHED COMMON ===========")

# sorted() keeps the report order stable regardless of how the listing was built
for file in sorted(watched_files):

    if file in unwatched_files:

        watched_path = os.path.join(watched_folder, file)
        unwatched_path = os.path.join(unwatched_folder, file)

        # a name shared by both folders can still be a directory
        # (e.g. .ipynb_checkpoints); only regular files can be read
        if not (os.path.isfile(watched_path) and os.path.isfile(unwatched_path)):
            continue

        watched_movies = set(read_file(watched_path))
        unwatched_movies = set(read_file(unwatched_path))

        common = watched_movies.intersection(unwatched_movies)

        if common:
            print(f"\n{file}")
            for movie in sorted(common):
                print("  ", movie.title())


