In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import os
import re
import logging
import warnings
from tqdm import tqdm

# Stop records logged under the 'scrapy' logger from propagating to ancestor loggers.
scrapy_logger = logging.getLogger('scrapy')
scrapy_logger.propagate = False
# Ignore Scrapy's deprecation warnings.
warnings.filterwarnings(action='ignore', category=scrapy.exceptions.ScrapyDeprecationWarning)
# Folder that is scanned for PDF files below.
pdf_folder_path = r'C:\Users\Mateusz\Documents\epu_arch'

def extract(filename, length=9):
    """Return the leading ``length`` characters of ``filename``.

    Args:
        filename: String (typically a file name) to take the prefix from.
        length: Number of leading characters to keep. Defaults to 9, the
            prefix length this function previously had hard-coded.

    Returns:
        The first ``length`` characters of ``filename``; the whole string when
        it is shorter than ``length``.

    Raises:
        ValueError: If ``length`` is negative (a negative slice bound would
            silently trim characters from the end instead of taking a prefix).
    """
    if length < 0:
        raise ValueError(f'length must be non-negative, got {length}')
    return filename[:length]

# Names of the PDF files in the folder (extension compared case-insensitively).
pdf_files = []
for entry in os.listdir(pdf_folder_path):
    if entry.lower().endswith('.pdf'):
        pdf_files.append(entry)

# Distinct, non-empty prefixes produced by extract() for those file names.
unique_numbers = {prefix for prefix in map(extract, pdf_files) if prefix}

# Sorted list of the prefixes; this is what the spider iterates over.
sorted_unique_numbers = sorted(unique_numbers)

class LinkSpider(scrapy.Spider):
    """Spider that requests one search URL per number and yields link texts.

    For every entry of the module-level ``sorted_unique_numbers`` a request is
    sent to the search endpoint; each response yields one ``{'text': ...}``
    item per ``a::text`` node found in it.
    """

    name = 'link_spider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # URLs of the responses that parse() has already processed.
        self.visited_urls = set()

    def start_requests(self):
        """Yield one request per number, showing a tqdm progress bar."""
        base_url = 'http://192.168.10.7:7000/synology?q='
        for number in tqdm(sorted_unique_numbers):
            yield scrapy.Request(url=f'{base_url}{number}', callback=self.parse)

    def parse(self, response):
        """Yield ``{'text': <link text>}`` for every ``a::text`` node of a new response.

        A response whose URL was already processed produces no items.
        """
        seen = self.visited_urls
        if response.url in seen:
            return
        seen.add(response.url)
        for anchor_text in response.css('a::text').getall():
            yield {'text': anchor_text}

# Run the spider; scraped items are exported as JSON lines to links.jsonl.
# NOTE(review): Scrapy's local-file feed storage appears to append by default,
# so a links.jsonl left over from an earlier run would be extended rather than
# replaced - confirm whether the file should be removed before crawling.
process = CrawlerProcess(settings={
    'FEED_FORMAT': 'jsonlines',
    'FEED_URI': 'links.jsonl',
    'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter'
})
process.crawl(LinkSpider)
process.start()

# Convert the JSON-lines feed into a single JSON array file.
# The encoding is stated explicitly so decoding does not depend on the
# machine's locale, and blank lines are skipped instead of crashing json.loads.
results = []
with open('links.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            results.append(json.loads(line))
with open(r'C:\Users\Mateusz\Documents\epu_arch\arca.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

In [None]:
import pandas as pd

# Load the scraped link texts and keep each distinct text once.
jp = r"C:\Users\Mateusz\Documents\epu_arch\arca.json"
links_raw = pd.read_json(jp)
df = links_raw.loc[:, ['text']].drop_duplicates()
df.head()

In [None]:
from tqdm import tqdm

# PDF file names without their extension. The extension is compared
# case-insensitively, matching the listing used to build the crawl queries,
# so files named "*.PDF" are not silently left out of the matching step.
pdf_files = [os.path.splitext(f)[0] for f in os.listdir(r"C:\Users\Mateusz\Documents\epu_arch") if f.lower().endswith('.pdf')]

def find_match(pdf_name, df):
    """Return the rows of ``df`` whose ``'text'`` value contains ``pdf_name``.

    The comparison is case-insensitive and treats ``pdf_name`` as a literal
    substring, not a regular expression, so characters such as ``.``, ``(`` or
    ``+`` in a file name match themselves and cannot raise a regex error.
    Rows whose ``'text'`` is missing never match.

    Args:
        pdf_name: Text to look for (a PDF file name without extension).
        df: DataFrame with a string column named ``'text'``.

    Returns:
        The matching subset of ``df`` (empty when nothing matches).
    """
    contains_name = df['text'].str.contains(pdf_name, case=False, na=False, regex=False)
    return df[contains_name]

# For every PDF name that occurs in some link text, keep a reference built
# from the first four '-'-separated fields of the first matching text.
result = []
for stem in tqdm(pdf_files):
    hits = find_match(stem, df)
    if hits.empty:
        continue
    first_text = hits['text'].iloc[0]
    fields = first_text.split("-")
    result.append("-".join(fields[:4]))

In [None]:
import csv

# Write one reference per row to a ';'-delimited CSV file.
# The encoding is given explicitly (instead of relying on the locale default)
# so that it matches the 'windows-1250' encoding the file is read back with.
with open(r"C:\Users\Mateusz\Desktop\arca.csv", 'w', newline='', encoding='windows-1250') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerows([ref] for ref in result)

print('ok')

In [None]:
# Delete every PDF in the archive folder whose file name is "<ref>.pdf" for a
# reference listed in the CSV file.
archive_dir = r"C:\Users\Mateusz\Documents\epu_arch"
count = 0  # number of PDF files removed by this cell

try:
    # dtype=str keeps every reference exactly as written in the file; without
    # it numeric-looking values are parsed as numbers and "<ref>.pdf" no longer
    # matches the file name (lost leading zeros, "123.0.pdf").
    refs_df = pd.read_csv(r"C:\Users\Mateusz\Desktop\arca.csv", delimiter=';', encoding='windows-1250', header=None, dtype=str)
except pd.errors.EmptyDataError:
    # An empty CSV means there are no references, hence nothing to delete.
    refs_df = pd.DataFrame()

# List the folder once instead of once per row and per match. Names are taken
# out of the set as files are removed, so a reference that appears several
# times in the CSV is only deleted once.
remaining_pdfs = {f for f in os.listdir(archive_dir) if f.endswith('.pdf')}

for _, row in refs_df.iterrows():
    # dropna() skips empty cells, which would otherwise turn into "nan.pdf".
    for ref in row.dropna():
        path = f'{ref}.pdf'
        if path in remaining_pdfs:
            os.remove(os.path.join(archive_dir, path))
            remaining_pdfs.discard(path)
            count += 1

print('ok')