In [4]:
import os
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

# Absolute path of the folder that receives the downloaded PDFs;
# create it up front so Chrome has somewhere to write.
download_dir = os.path.abspath("pdf")
os.makedirs(download_dir, exist_ok=True)

# Chrome preferences that make PDF links download silently
download_prefs = {
    "download.default_directory": download_dir,  # where files are saved
    "download.prompt_for_download": False,  # no "Save as" dialog
    "plugins.always_open_pdf_externally": True,  # download PDFs instead of using the built-in viewer
    "profile.default_content_settings.popups": 0,
}
chrome_options = Options()
chrome_options.add_experimental_option("prefs", download_prefs)

# Start the browser session used by the rest of this cell
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

def download_pdf(link, title):
    """
    Opens `link` in the shared browser and triggers a download of the first PDF it links to.

    Every <a> element on the loaded page is inspected in document order. The first
    one whose URL path ends in ".pdf" is opened in a new tab, which starts the
    download. The comparison ignores case and any query string or fragment, so
    ".../report.PDF" and ".../report.pdf?v=2" both count.

    Args:
        link: URL of the page to scan.
        title: Sanitised title for the page. Currently unused; kept so existing
            call sites keep working.

    Returns:
        None. Progress and failures are reported with print(); exceptions are
        caught and logged rather than propagated (best-effort scraping).
    """
    try:
        driver.get(link)
        time.sleep(3)  # Wait for the page to load (adjust if necessary)

        for anchor in driver.find_elements(By.TAG_NAME, "a"):
            href = anchor.get_attribute("href")
            if not href:
                continue

            # Look only at the path part of the URL: drop "#fragment" and "?query"
            # before testing the extension, and ignore letter case.
            url_path = href.split("#", 1)[0].split("?", 1)[0]
            if url_path.lower().endswith(".pdf"):
                print(f"Downloading: {href}")

                # Open the link in a new tab to trigger download
                driver.execute_script("window.open(arguments[0], '_blank');", href)
                time.sleep(5)  # Wait for download to start
                return

        print(f"No PDF found on page: {link}")

    except Exception as e:
        print(f"Error downloading PDF from {link}: {e}")

# Read links from CSV and process them.
# The browser is closed in `finally` so an unexpected error (missing CSV,
# malformed row, ...) cannot leave a Chrome process running.
try:
    # newline="" lets the csv module handle line endings inside quoted fields
    with open("pwc_india_insights_details.csv", "r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Short rows yield None for missing fields; treat them as empty
            link = (row.get("Link") or "").strip()
            if not link:
                continue  # nothing to visit for this row
            title = (row.get("Title") or "").replace(" ", "_").replace("/", "-")  # Sanitize filename
            download_pdf(link, title)
finally:
    # Close the browser after processing all links (or after a failure)
    driver.quit()

print(f"PDFs downloaded to {download_dir}")


No PDF found on page: https://www.pwc.in/ghost-templates/the-mutual-funds-route-to-viksit-bharat-2047.html
No PDF found on page: https://www.pwc.in/ghost-templates/financial-health-transcending-from-access-to-impact.html
No PDF found on page: https://www.pwc.in/ghost-templates/retail-reinvention-paradigm.html
No PDF found on page: https://www.pwc.in/ghost-templates/pwc-indias-financial-services-fs-risk-symposium-ministry-finance-keynote-session-february-2025.html
No PDF found on page: https://www.pwc.in/ghost-templates/india-spends-deep-dive-consumer-spending-behaviour.html
No PDF found on page: https://www.pwc.in/ghost-templates/future-quality-manufacturing.html
No PDF found on page: https://www.pwc.in/ghost-templates/powering-automation-with-agents.html
No PDF found on page: https://www.pwc.in/ghost-templates/deals-at-a-glance-annual-review-2024.html
PDFs downloaded to x:\benori_assignment\pwc_india\PwC_PDFs


In [None]:
import os
import re
import pandas as pd

def clean_filename(s):
    """
    Returns `s` with surrounding whitespace trimmed and the characters
    \\ / * ? : " < > | deleted, so the result can be used in a filename.
    """
    trimmed = s.strip()
    illegal_chars = r'[\\/*?:"<>|]'
    return re.sub(illegal_chars, "", trimmed)

def get_csv_link_base(link):
    """
    Returns the lower-cased final path segment of `link`, without a trailing
    '.html' extension when one is present.
    """
    suffix = '.html'
    without_ext = link[:-len(suffix)] if link.endswith(suffix) else link
    # Everything after the last '/' (the whole string when there is none)
    last_segment = without_ext.rsplit('/', 1)[-1]
    return last_segment.lower()

def tokenize(name):
    """
    Splits the name by '-' and returns the set of non-empty tokens in lowercase.

    Empty pieces (from an empty name, or from leading, trailing or doubled
    hyphens) are dropped. Without this, an empty name would yield {''}, so the
    result would never be empty and '' would count as a shared token.
    """
    return {token for token in name.lower().split('-') if token}

def is_similar(name1, name2, threshold=0.5):
    """
    Reports whether two hyphen-separated names share enough tokens.

    Both names are tokenized, and the score is:
        (number of shared tokens) / (size of the larger token set)

    Returns True when the score is at least `threshold`. Returns False when
    either token set is empty.
    """
    first, second = tokenize(name1), tokenize(name2)

    if first and second:
        shared_count = len(first & second)
        larger_size = max(len(first), len(second))
        return shared_count / larger_size >= threshold

    return False

def rename_pdf_files(csv_file, download_dir, threshold=0.5):
    """
    Renames downloaded PDFs to "PWC_DATE_TITLE.pdf" using metadata from a CSV file.

    The CSV is read with pandas and must provide 'Link', 'Date' and 'Title'
    columns. Rows where any of the three is missing or blank are ignored.
    For each remaining row the last URL segment (without ".html") is the match key.

    Each "*.pdf" file in `download_dir` is compared, by its lower-cased name
    without the extension, against every match key using token overlap (see
    `is_similar`). The entry with the highest score at or above `threshold` wins.

      - Match found: the file becomes "PWC_<date>_<title>.pdf", where <date> is
        the first whitespace-separated part of the Date with "/" replaced by "-",
        and <title> has spaces replaced by "_" and illegal characters removed.
      - No match: "PWC_" is prepended to the original filename.

    Files whose name already starts with "PWC_" are left alone, so running the
    function again does not produce "PWC_PWC_..." names. A file is also left
    alone when its target name already exists, so one PDF never overwrites another.

    Args:
        csv_file: Path to the CSV file containing Date, Title and Link.
        download_dir: Directory holding the downloaded PDFs.
        threshold: Minimum token-overlap score (0..1) required for a match.

    Returns:
        None. Each rename, skip or failure is reported with print().
    """
    df = pd.read_csv(csv_file)

    csv_entries = []
    for _, row in df.iterrows():
        link = row.get('Link')
        date = row.get('Date')
        title = row.get('Title')
        # Blank CSV cells arrive as NaN, which is truthy, so a plain
        # `if link and date and title` would let them through.
        if any(pd.isna(value) or not str(value).strip() for value in (link, date, title)):
            continue
        csv_entries.append({
            'link_base': get_csv_link_base(str(link).strip()),
            'date': str(date),
            'title': str(title)
        })

    # sorted() gives a deterministic processing order; the listing is taken
    # once, so renames made inside the loop do not affect the iteration.
    for filename in sorted(os.listdir(download_dir)):
        if not filename.lower().endswith('.pdf'):
            continue
        if filename.startswith('PWC_'):
            print(f"Skipped '{filename}': already prefixed with 'PWC_'")
            continue

        pdf_path = os.path.join(download_dir, filename)
        pdf_base = filename[:-4].lower()  # Remove ".pdf" and lower for matching
        tokens_pdf = tokenize(pdf_base)  # same for every entry, so compute once

        matched_entry = None
        best_similarity = 0.0
        for entry in csv_entries:
            csv_link_base = entry['link_base']
            if not is_similar(pdf_base, csv_link_base, threshold):
                continue
            tokens_csv = tokenize(csv_link_base)
            similarity = len(tokens_pdf & tokens_csv) / max(len(tokens_pdf), len(tokens_csv))
            # Strictly greater: on a tie the earlier CSV row is kept
            if similarity > best_similarity:
                best_similarity = similarity
                matched_entry = entry

        if matched_entry:
            date_part = matched_entry['date'].split()[0]
            date_part = clean_filename(date_part.replace("/", "-"))  # Replace "/" with "-" if needed
            title_clean = clean_filename(matched_entry['title'].replace(" ", "_"))
            new_filename = f"PWC_{date_part}_{title_clean}.pdf"
        else:
            new_filename = f"PWC_{filename}"

        new_path = os.path.join(download_dir, new_filename)
        # os.rename silently replaces an existing file on POSIX, so check first
        if os.path.exists(new_path):
            print(f"Skipped '{filename}': target '{new_filename}' already exists")
            continue
        try:
            os.rename(pdf_path, new_path)
            print(f"Renamed '{filename}' to '{new_filename}'")
        except OSError as e:
            print(f"Error renaming '{filename}': {e}")

# Inputs for the renaming pass:
#   csv_file     - CSV file containing Date, Title, Link
#   download_dir - directory where PDFs were downloaded
csv_file = "pwc_india_insights_details.csv"
download_dir = os.path.abspath("pdf")

rename_pdf_files(csv_file=csv_file, download_dir=download_dir, threshold=0.5)


Renamed 'deals-at-a-glance-annual-review-2024-v1.pdf' to 'PWC_14-02-25_Deals_at_a_glance_Annual_review_2024.pdf'
Renamed 'financial-health-transcending-from-access-to-impact.pdf' to 'PWC_04-03-25_Financial_health_Transcending_from_access_to_impact.pdf'
Renamed 'future-quality-manufacturing.pdf' to 'PWC_21-02-25_The_future_of_quality_in_manufacturing.pdf'
Renamed 'how-india-spends-a-deep-dive-into-consumers-pending-behaviour.pdf' to 'PWC_24-02-25_How_India_spends_A_deep_dive_into_consumer_spending_behaviour.pdf'
Renamed 'powering-automation-with-agents.pdf' to 'PWC_18-02-25_Powering_automation_with_agents.pdf'
Renamed 'pwc-indias-financial-services-fs-risk-symposium-ministry-finance-keynote-session-february-2025.pdf' to 'PWC_25-02-25_PwC_India's_Financial_Services_(FS)_Risk_Symposium_Ministry_of_Finance_keynote_session_-_February_2025.pdf'
Renamed 'retail-reinvention-paradigm-v1.pdf' to 'PWC_27-02-25_The_retail_reinvention_paradigm.pdf'
Renamed 'the-mutual-funds-route-to-viksit-bharat-2