In [5]:
import pandas as pd
import json
import numpy as np
from glob import glob 
import os, requests, threading
from PIL import Image, ImageDraw, ImageFont

In [2]:
def scrape_from_json_files():
    try:
        # Find all JSON files starting with 'bca' and ending with '.json'
        json_files = glob('bca*.json')
        
        if not json_files:
            print("No JSON files found matching the pattern 'bca*.json'")
            return
        
        results = []
        
        for json_file in json_files:
            print(f"Processing file: {json_file}")
            
            try:
                with open(json_file, "r") as f:
                    json_data = json.load(f)
                
                # Assuming your data is in an 'items' array in the JSON
                items = json_data.get('items', [])
                
                for item in items:
                    details = {}
                    
                    # Basic vehicle information
                    details['Reg'] = item.get('vrm', 'na')
                    details['VIN'] = item.get('vin', 'na')
                    # details['saleCode'] = item.get('saleCode', 'na')
                    details['Auction type'] = item.get('saleLocation', 'na')
                    details['Center'] = item.get('localSaleLocation', 'na')
                    details['Hall Name'] = item.get('hallName', 'na')
                    details['Auction Name'] = item.get('saleType', 'na')
                    details['Title'] = item.get('primaryVehicleDescription', 'na')
                    details['Make'] = item.get('make', 'na')
                    details['Model'] = item.get('model', 'na')
                    details['CC'] = item.get('engineCapacity', 'na')
                    details['Start Date']  =item.get("saleDate", 'na').split("T")[0]
                    details['Start Time']  = item.get("saleDate", 'na').split("T")[1].split("Z")[0]
                    details['Derivative'] = item.get('derivative', 'na')
                    details['Body Type'] = item.get('bodyType', 'na')
                    details['Mileage Warranted'] = item.get('mileageWarranted', 'na')
                    details['Mileage'] = item.get('mileage', 'na')
                    details['Year'] = item.get('plate', 'na').split(" ")[0]
                    details['Plate'] = item.get('plate', 'na')
                    details['CAP Clean'] = item.get('capCleanPrice', 'na')
                    # details['CAP Clean'] = item.get('capClean', 'na')
                    details['Grade'] = item.get('grade', 'na')
                    details['Mechanical Grade'] = item.get('mechanicalGrade', 'na')
                    details['Date Of Grading'] = item.get('dateOfGrading', 'na')
                    
                    # Images
                    media = item.get('images', [])
                    image_urls = [entry.get('imageURI', '') for entry in media if entry.get('imageType') == 'Default']
                    details['Images'] = ", ".join(image_urls) if image_urls else 'na'
                    
                    # Extract condition report URL from reportUrls
                    report_urls = item.get('reportUrls', [])
                    condition_report = next((r['href'] for r in report_urls if r.get('rel') == 'conditionreporturl'), 'na')
                    details['Inspection Report'] = condition_report
                    other_report = next((r['href'] for r in report_urls if r.get('rel') == 'mechanicalreporturl'), 'na')
                    details['Other Report'] = other_report
                    other_report_text = next((r['rel'] for r in report_urls if r.get('rel') == 'mechanicalreporturl'), 'na')
                    details['Other Report Name'] = other_report_text

                    
                    # Other vehicle details
                    details['lot Id'] = item.get('lotId', 'na')
                    details['Lot'] = item.get('lotNumber', 'na')
                    details['Colour'] = item.get('manufacturerColour', 'na')
                    details['Non Runner'] = item.get('nonRunner', 'na')
                    details['Fuel Type'] = item.get('fuelType', 'na')
                    details['Transmission'] = item.get('transmission', 'na')
                    details['Doors'] = item.get('numberOfDoors', 'na')
                    details['Former Keepers'] = item.get('numberOfOwners', 'na')
                    details['V5 Status'] = item.get('logbookStatus', 'na')
                    details['V5'] = item.get('logBookStatusIndicator', 'na')
                    details['Service History Type'] = item.get('serviceHistoryType', 'na')
                    details['VAT Status'] = item.get('vatType', 'na')
                    details['Service History Present'] = item.get('serviceHistoryPresent', 'na')
                    details['Vehicle Type'] = item.get('vehicleType', 'na')
                    # details['vehicleTypeGroup'] = item.get('vehicleTypeGroup', 'na')
                    
                    # Equipment
                    equipment_items = item.get('equipmentItems', [])
                    details['Equipment'] = ", ".join([eq.get('name', '') for eq in equipment_items]) if equipment_items else 'na'
                    
                    # Service history
                    service_history = item.get('serviceHistory', {})
                    details['Last Service'] = service_history.get('lastService', {}).get('date', 'na')
                    details['Last service mileage'] = service_history.get('lastService', {}).get('mileage', 'na')
                    details['Service history'] = service_history.get('lastService', {}).get('type', 'na')
                    details['No of services'] = service_history.get('serviceCount', {}).get('total', 'na')
                    details['Service through main dealers'] = service_history.get('serviceCount', {}).get('mainDealer', 'na')
                    details['Service Notes'] = service_history.get('notes', 'na')
                    
                    # Get additional information
                    additional_info = item.get('additionalInformation', {})
                    emissions = additional_info.get('emissions', 'na')

                    # Get advisory information
                    advisory_info = item.get('advisoryItems') or item.get('additionalInformation', {}).get('advisoryItems', [])                     
                    name = [ad.get('name') for ad in advisory_info if ad.get('name')]                    
                    # Store both in a list
                    details['Additional information'] = [emissions, name if name else 'na']
                    
                    details['Keys'] = additional_info.get('numberOfKeys', 'na')
 
                    # DVSA mileage entries
                    dvsa_entries = additional_info.get('dvsaMileageEntries', [])
                    if dvsa_entries and isinstance(dvsa_entries, list):
                        details['DVSA mileage'] = ", ".join([f"{entry.get('mileage', '')} {entry.get('completedDate', '')}" 
                                                           for entry in dvsa_entries])
                    else:
                        details['DVSA mileage'] = 'na'
                    
                    details['MOT Expiry Date'] = additional_info.get('mot', {}).get('expiryDate', 'na')
                    
                    declarations = item.get('declarations', {})
                  
                   
                    if isinstance(declarations, str):
                        try:
                            declarations = json.loads(declarations)
                        except:
                            declarations = {}

                    if declarations and isinstance(declarations, dict):
                        true_flags = [
                            key for key, value in declarations.items()
                            if isinstance(value, bool) and value
                        ]
                        details['Declarations'] = ",".join(true_flags) if true_flags else ""
                    else:
                        details['Declarations'] = ""
                    
                    details['Major Damage'] = declarations.get('majorDamage', 'na')
                    details['Owned By Police'] = declarations.get('ownedByPolice', 'na')
                    details['Used As Taxi'] = declarations.get('usedAsTaxi', 'na')
                    details['Reregistered'] = declarations.get('reregistered', 'na')
                    details['Change Of Registration'] = declarations.get('changeOfRegistration', 'na')
                    details['Modified Vehicle'] = declarations.get('modifiedVehicle', 'na')
                    details['Salvaged Vehicle'] = declarations.get('salvagedVehicle', 'na')
                    
                    # Prices
                    prices = item.get('prices', [])
                    # if prices and len(prices) > 0:
                    #     details['CAPClean'] = prices[0].get('information', 'na')
                    if prices and len(prices) > 1:
                        details['CAP Average'] = prices[1].get('information', 'na').split(" ")[1]
                    if prices and len(prices) > 2:
                        details['CAP New'] = prices[2].get('information', 'na').split(" ")[1]
                    if prices and len(prices) > 3:
                        details['CAP Retail'] = prices[3].get('information', 'na').split(" ")[1]
                    if prices and len(prices) > 4:
                        details['CAP Below'] = prices[4].get('information', 'na').split(" ")[1]
                    
                    # HPI information
                    hpi = item.get('hpi', {}) or {}  
                    details['Vehicle base price when new'] = hpi.get('vehiclePriceWithVat', np.nan)
                    details['Total price when new'] = hpi.get('totalPrice', np.nan)
                    spec_options = hpi.get('specCheckOptions', [])
                    if spec_options:
                        desc_and_price = []
                        for opt in spec_options:
                            desc = opt.get('shortDescription', '')
                            price = opt.get('optionPrice', '')
                            if desc or price:
                                desc_and_price.append(f"{desc} {price}")
                        details['Description and prize'] = ", ".join(desc_and_price) if desc_and_price else 'na'
                    else:
                        details['Description and prize'] = 'na'

                    results.append(details)
            
            except Exception as e:
                print(f"Error processing file {json_file}: {str(e)}")
                continue
        
        if results:
            df = pd.DataFrame(results)
            output_file = "BCA_json.csv"
            df.to_csv(output_file, index=False)
            print(f"Successfully saved data from {len(json_files)} files to {output_file}")
        else:
            print("No data was extracted from the files")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Run the function
scrape_from_json_files() 

Processing file: bca.json
Successfully saved data from 1 files to BCA_json.csv


In [3]:
# def scrape(path):
#     # to avoid windows to close again and again we make use of headless
#     options = ChromeOptions()
#     options.headless=True
#     # make use of chrome for scraping
#     service = Service(ChromeDriverManager().install())
#     # create a driver using chrome
#     driver = Chrome(service=service, options=options)
#     # run the driver through url
#     driver.get(path)
#     # driver.maximize_window()

#     # =============================================================================================
#     # complete login

#     try:
#         provided_u_name = "haider1805@icloud.com"
#         # user_name_id = 
#         user_name = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '(.//input[@id="username form-control__input"])[1]')))   
#         user_name.send_keys(provided_u_name)
#     except Exception as e:
#         print("No username tab found")
        
#     # try:
#     #     next_tab = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '(.//button[@id="nextButton"])'))) 
#     #     if next_tab:
#     #         next_tab.click()
#     # except:
#     #     print("No next tab")
#     # time.sleep(1)
    
#     # get password tab
#     try:
#         provided_pass = "Muhssan7865"
#         # pass_id = 
#         password = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, './/input[@name="password"]')))
#         password.send_keys(provided_pass)
#     except Exception as e:
#         print("No password tab found") 
#      # get the login tab to enter the username and password
#     try:
#         # login_tab_css = "submit"
#         login_tab = driver.find_element(By.XPATH, './/button[@class="onelogin-main__btn btn "]')
#         driver.execute_script("arguments[0].scrollIntoView();", login_tab)
#         login_tab.click()
#     except Exception as e:
#         print("No login tab found here")
    
#     # =============================================================================================
#     # Handle cookie consent banner
#     try:
#         # cookie_main_bar = driver.find_element(By.ID, "onetrust-button-group")
#         cookie_accept = driver.find_element(By.ID, "onetrust-reject-all-handler")
#         driver.execute_script("arguments[0].scrollIntoView(true);", cookie_accept)
#         time.sleep(1)
#         cookie_accept.click()
#     except Exception as e:
#         print(f"Error handling cookie banner: {e}")
    
#     # =====================================================================================  
#     # get the number of cars found
#     try:
#         num_cars_css = "//p[@data-testid='lot-navigation-number']" # here we will have number of cars found for the current website
#         num_cars = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, num_cars_css)))
#         nums = num_cars.text
#         match = re.search(r"/(\d+)", nums) # beacuse of the date format
#         total_cars = int(match.group(1))
#     except Exception as e:
#         print("Nothing found")
    

#     results = []
#     # run till the number of cars available from total_cars above
#     for i in range(total_cars): # for sample purpose
#         details={}

#         try:
#             reg_num = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.XPATH, './/div[@class="sc-aXZVg Vrm__VrmBox-sc-19tyda2-0 bLafSa bxEIHe"]'))).text.strip()
#             if reg_num:
#                 details['Reg']  = reg_num
#             else:
#                 details['Reg'] = "na"
            
#             reg_date =  WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.XPATH, '(.//div[@class="sc-aXZVg cXHGGV"]/div/div/div/p)[3]'))).text.strip()
#             if reg_date:
#                 details['Reg_date']  = reg_date
#             else:
#                 details['Reg_date'] = "na"
#         except:
#             print("No reg and reg date")
#     # =====================================================================================

#         results.append(details)
#         try:
#             next_button = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="next-lot-navigation-button"]')))
#             if next_button:
#                 driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
#                 next_button.click()
#             # i+=1
#         except Exception as e:
#             break
        
#     df = pd.DataFrame.from_dict(results)
#     df.to_csv("reg_date.csv", index=False)
#     driver.quit()
    
# # provide link of inside car
# path = "https://www.bca.co.uk/lot/VX17%20LKG?q=&sort=LotSequence&bq=salechannel%3Ae-Auction%7Csalelocation%3ABedford%2CBrighouse%2CBridgwater%2CLeeds%2CLivingston%2CMeasham%2CNewcastle-u-Tyne%2CNottingham%2CPreston%2CBirmingham+-+Perry+Barr%2CBlackbushe%2CManchester%2CBristol%2CMossend%2CPaddock+Wood%2CCrawley%2CWolverhampton%2CGlasgow%2CEnfield%2CWalsall%7CsaleType%3AEvans+Halshaw+Budget%2CVertu+Motors+Group%2FBudget%2CMoneybarn+Budget%2Ccinch+Budget%2CBlack+Horse+Budget%2CHenson+Motor+Group%2CDuff+Morgan+Budget%2CBurton+Kia%2CPartner+Finance+Budget%2CBudget+Product%7Csaledate%3ATue+10+June+2025&searchVersion=new"
# scrape(path)

In [6]:
def add_watermark_to_image(image_path, text="Sourced from BCA"):
    try:
        image = Image.open(image_path).convert("RGBA")
        txt_layer = Image.new("RGBA", image.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(txt_layer)

        # Load font
        try:
            font = ImageFont.truetype("arial.ttf", 16)
        except:
            font = ImageFont.load_default()

        # Calculate text size and position
        margin = 10
        bbox = draw.textbbox((0, 0), text, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        x = image.width - text_width - margin
        y = image.height - text_height - margin

        # Draw semi-transparent background box
        box_width = text_width + 2 * margin
        box_height = text_height + 2 * margin
        draw.rectangle([x - margin, y - margin, x - margin + box_width, y - margin + box_height],fill=(0, 0, 0, 160))

        # Draw watermark text
        draw.text((x, y), text, font=font, fill=(255, 255, 255, 200))

        # Merge and save
        watermarked = Image.alpha_composite(image, txt_layer).convert("RGB")
        watermarked.save(image_path)
        print(f"Watermark added to {image_path}")

    except Exception as e:
        print(f"Failed to watermark {image_path}: {e}")


In [7]:
from urllib.parse import urlparse, urljoin

df = pd.read_csv("BCA_json.csv")
reg_img = df[['Reg', 'Images']]
reports = df[['Reg', "Inspection Report", "Other Report"]]

def download_images(data, main_folder="Images"):  
   
    os.makedirs(main_folder, exist_ok=True)
    

    for index, row in data.iterrows():
        reg_no = str(row["Reg"]).strip()
        if reg_no == 'nan':
            print(f"Skipping row {index} (Invalid registration number)")
            continue
        
        image_urls = row["Images"] 

        if pd.isna(image_urls) or not isinstance(image_urls, str) or image_urls.strip() == "":
            print(f"Skipping {reg_no} (No image URLs)")
            continue

        image_urls = image_urls.split(", ") 
     
        reg_folder = os.path.join(main_folder, reg_no) 
        os.makedirs(reg_folder, exist_ok=True)
        
    
        for idx, url in enumerate(image_urls):
            url = url.strip() 
            if not url.startswith(("http://", "https://")):
                url = urljoin("https://", url) 
            
 
            parsed_url = urlparse(url)


            if not parsed_url.scheme or not parsed_url.netloc:
                print(f"Invalid URL skipped: {url}") 
                continue
            
 
            try:

                response = requests.get(url, stream=True) 
                response.raise_for_status() 

          
                file_name = os.path.basename(parsed_url.path) or f"image_{idx + 1}.jpg"

          
                file_extension = file_name.split(".")[-1]
                
             
                if file_extension not in ["jpg", "jpeg", "png", "gif", "bmp", "webp"]:
                    file_name = f"image_{idx + 1}.jpg" 
                
                full_file_name = os.path.join(reg_folder, f"{reg_no}_{idx + 1}.jpg")
                
                # Save the image
                with open(full_file_name, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)

                # Add watermark
                add_watermark_to_image(full_file_name)
                
                print(f"Downloaded: {full_file_name}")
            except Exception as e:
                print(f"Failed to download {url} for {reg_no}: {e}")

# function to download the reports
def download_reports(data, main_folder="Reports"):
    # Create the main folder if it doesn't exist
    os.makedirs(main_folder, exist_ok=True)  
    # Loop through every row to get the info
    for i, row in data.iterrows():
        reg_no = row["Reg"] # Separate reg nums
        Condition_report_urls = row["Inspection Report"]
        Other_rep_link_urls = row["Other Report"]
        
        # Create a subfolder for Inspection Report 
        Condition_report_folder = os.path.join(main_folder, "Inspection Report")  # Folder for regular images
        os.makedirs(Condition_report_folder, exist_ok=True)  # Ensure the "Images" folder exists

        # Create a subfolder for Other Report
        Essential_report_folder = os.path.join(main_folder, "Other Report")  # Folder for damaged images
        os.makedirs(Essential_report_folder, exist_ok=True)  # Ensure the "Damaged" folder exists

        # Check if the Condition_report_urls is missing
        if not Condition_report_urls or pd.isna(Condition_report_urls):
            print(f"Missing Inspection Report of {reg_no}")
            # continue
        else:
        
            if not Condition_report_urls.startswith(("http://", "https://")):  # Check if the URL starts with 'http' or 'https'
                Condition_report_urls = urljoin("https://", Condition_report_urls)  # Add 'https://' if missing

            # Parse the URL
            parsed_url = urlparse(Condition_report_urls)

            # Check if the parsed URL is valid
            if not parsed_url.scheme or not parsed_url.netloc:
                print(f"Invalid URL skipped: {Condition_report_urls}")
                continue
            
            # Try downloading the image
            try:
                # Download the image
                response = requests.get(Condition_report_urls, stream=True)
                response.raise_for_status()  # Raises an exception if the HTTP request fails

                # Extract the file name from the URL
                file_name = os.path.basename(parsed_url.path) or f"Inspection Report.pdf"

                # Check for valid file extensions
                file_extension = file_name.split(".")[-1]
                if file_extension not in ["pdf", "jpeg", "png", "gif", "bmp", "webp"]:
                    file_name = f"Inspection Report.pdf"  # Assign a default name if extension is invalid
                
                # Construct the full path for the file inside the "Images" folder
                full_file_name = os.path.join(Condition_report_folder, f"{reg_no}.pdf")
                
                # Save the image
                with open(full_file_name, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                
                print(f"Downloaded: {full_file_name}")
            except Exception as e:
                print(f"Failed to download {Condition_report_urls} for {reg_no}: {e}")

        # Check if the Condition_report_urls is missing
        if not Other_rep_link_urls or pd.isna(Other_rep_link_urls):
            print(f"Missing Other Report of {reg_no}")
            # continue
        else:
            if not Other_rep_link_urls.startswith(("http://", "https://")):
                Other_rep_link_urls = urljoin("https://", Other_rep_link_urls)  # Add 'https://' if missing

            # Parse the URL
            parsed_url = urlparse(Other_rep_link_urls)

            # Check if the parsed URL is valid
            if not parsed_url.scheme or not parsed_url.netloc:
                print(f"Invalid URL skipped: {Other_rep_link_urls}")
                continue
            
            # Try downloading the image
            try:
                # Download the image
                response = requests.get(Other_rep_link_urls, stream=True)
                response.raise_for_status()  # Raises an exception if the HTTP request fails

                # Extract the file name from the URL
                file_name = os.path.basename(parsed_url.path) or f"Other Report.pdf"

                # Check for valid file extensions
                file_extension = file_name.split(".")[-1]
                if file_extension not in ["pdf", "jpeg", "png", "gif", "bmp", "webp"]:
                    file_name = f"Other Report.pdf"  # Assign a default name if extension is invalid
                
                # Construct the full path for the file inside the "Damaged" folder
                full_file_name = os.path.join(Essential_report_folder, f"{reg_no}.pdf")
                
                # Save the image
                with open(full_file_name, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                
                print(f"Downloaded: {full_file_name}")
            except Exception as e:
                print(f"Failed to download {Other_rep_link_urls} for {reg_no}: {e}")

def start_funcs():
    thread1 = threading.Thread(target=download_reports, args=(reports,))
    thread2 = threading.Thread(target=download_images, args=(reg_img,))
    # thread.daemon = True
    print("Started downloading reports")
    thread1.start()
    print("Started downloading images")
    thread2.start()

    # Wait for both threads to finish
    thread1.join()
    thread2.join()

# Run the script
if __name__ == "__main__":
    start_funcs()

Started downloading reports
Started downloading images
Watermark added to Images\LR54 SXN\LR54 SXN_1.jpg
Downloaded: Images\LR54 SXN\LR54 SXN_1.jpg
Downloaded: Reports\Inspection Report\LR54 SXN.pdf
Invalid URL skipped: https:///na
Watermark added to Images\LR54 SXN\LR54 SXN_2.jpg
Downloaded: Images\LR54 SXN\LR54 SXN_2.jpg
Downloaded: Reports\Inspection Report\MK57 XWH.pdf
Invalid URL skipped: https:///na
Watermark added to Images\LR54 SXN\LR54 SXN_3.jpg
Downloaded: Images\LR54 SXN\LR54 SXN_3.jpg
Downloaded: Reports\Inspection Report\AO53 GOE.pdf
Invalid URL skipped: https:///na
Watermark added to Images\LR54 SXN\LR54 SXN_4.jpg
Downloaded: Images\LR54 SXN\LR54 SXN_4.jpg
Watermark added to Images\LR54 SXN\LR54 SXN_5.jpg
Downloaded: Images\LR54 SXN\LR54 SXN_5.jpg
Downloaded: Reports\Inspection Report\BJ07 KGP.pdf
Invalid URL skipped: https:///na
Watermark added to Images\LR54 SXN\LR54 SXN_6.jpg
Downloaded: Images\LR54 SXN\LR54 SXN_6.jpg
Downloaded: Reports\Inspection Report\DK08 NUA.pdf
