In [None]:
import os
import csv
from bs4 import BeautifulSoup
import re
from datetime import datetime
import calendar

def _split_title(title):
    """Break a listing title into positional components.

    The title is split on whitespace and read by position: a leading
    four-digit token is the year, the two tokens after it are make and
    model, the last token is the colour (only when the title has more than
    three tokens) and anything between model and colour is the variant.

    Args:
        title: The listing title text (may be empty).

    Returns:
        dict with keys "Year", "Make", "Model", "Variant", "Colour"; any
        component that cannot be determined is an empty string.
    """
    fields = {"Year": "", "Make": "", "Model": "", "Variant": "", "Colour": ""}
    parts = title.split()
    if not parts:
        return fields

    if re.match(r"^\d{4}$", parts[0]):
        fields["Year"] = parts[0]
        # Guard the positional lookups: short titles such as "2015" or
        # "2015 FORD" must yield blanks rather than raise IndexError.
        if len(parts) > 1:
            fields["Make"] = parts[1]
        if len(parts) > 2:
            fields["Model"] = parts[2]

    # Colour and variant are taken by position even when no year was found.
    if len(parts) > 3:
        fields["Colour"] = parts[-1]
    if len(parts) > 4:
        fields["Variant"] = " ".join(parts[3:-1]).strip()
    return fields


def _mot_expiry_from_subtitle(subtitle_text):
    """Return the MOT expiry as DD/MM/YYYY (last day of the month), or "".

    Looks for "MOT TILL <month> <yyyy>" (case-insensitive) in the subtitle.
    The month may be abbreviated or spelled out in full.
    """
    match = re.search(r"MOT TILL ([A-Za-z]+)\s+(\d{4})", subtitle_text, re.IGNORECASE)
    if not match:
        return ""

    month_name = match.group(1)
    mot_year = int(match.group(2))

    month_number = 0
    # Try the abbreviated form first ("Feb"), then the full name ("February").
    for month_format in ("%b", "%B"):
        try:
            month_number = datetime.strptime(month_name, month_format).month
            break
        except ValueError:
            continue
    if month_number == 0:
        return ""

    last_day = calendar.monthrange(mot_year, month_number)[1]
    return f"{last_day:02d}/{month_number:02d}/{mot_year}"


def _split_date_time(raw):
    """Convert "MM/DD/YYYY HH:MM:SS" into ("DD/MM/YYYY", "HH:MM").

    Returns ("", "") when the value is missing (None) or not in that format.
    """
    try:
        parsed = datetime.strptime(raw, "%m/%d/%Y %H:%M:%S")
    except (TypeError, ValueError):
        return "", ""
    return parsed.strftime("%d/%m/%Y"), parsed.strftime("%H:%M")


def extract_data(soup):
    """Extract the vehicle details of one auction lot page.

    Args:
        soup: Parsed lot page (a BeautifulSoup document).

    Returns:
        dict of strings with the keys "Title", "Year", "Make", "Model",
        "Variant", "Colour", "MOT Expiry Date", "Transmission",
        "Start Date", "Start Time", "End Date", "End Time", "Mileage", "CC",
        "Fuel Type", "V5", "Reg" and "Images". Every key is always present;
        values that cannot be found on the page are empty strings.
    """
    data = {
        "Title": "", "Year": "", "Make": "", "Model": "", "Variant": "",
        "Colour": "", "MOT Expiry Date": "", "Transmission": "",
        "Start Date": "", "Start Time": "", "End Date": "", "End Time": "",
        "Mileage": "", "CC": "", "Fuel Type": "", "V5": "",
        "Reg": "", "Images": "",
    }

    # --- Title and the fields derived from it ---------------------------
    title = soup.select_one("h1.detail__title span")
    data["Title"] = title.get_text(strip=True) if title is not None else ""
    data.update(_split_title(data["Title"]))

    # --- Subtitle: MOT expiry and transmission --------------------------
    subtitle = soup.find("h2", class_="detail__subtitle")
    subtitle_text = subtitle.get_text(strip=True) if subtitle is not None else ""

    data["MOT Expiry Date"] = _mot_expiry_from_subtitle(subtitle_text)

    trans_match = re.search(r"(MANUAL|AUTOMATIC|SEMI-AUTO|AUTO)", subtitle_text, re.IGNORECASE)
    data["Transmission"] = trans_match.group(1).upper() if trans_match else ""

    # --- Auction start / end timestamps ---------------------------------
    start_span = soup.find("span", class_="awe-rt-startingDTTM")
    end_span = soup.find("span", class_="awe-rt-endingDTTM")
    start_raw = start_span.get("data-initial-dttm") if start_span is not None else ""
    end_raw = end_span.get("data-initial-dttm") if end_span is not None else ""

    data["Start Date"], data["Start Time"] = _split_date_time(start_raw)
    data["End Date"], data["End Time"] = _split_date_time(end_raw)

    # --- Free-text description paragraphs -------------------------------
    desc_div = soup.find("div", class_="detail__sectionBody description")
    if desc_div is not None:
        for paragraph in desc_div.find_all("p"):
            text = paragraph.get_text(strip=True).upper()

            # Each paragraph is classified once; the first matching rule wins.
            if "MILES" in text:
                miles = re.search(r"([\d,]+)\s*MILES", text)
                if miles:
                    data["Mileage"] = miles.group(1).replace(",", "")
                continue

            engine = re.match(r"(\d+\.?\d*)\s*(PETROL|DIESEL|HYBRID|ELECTRIC)", text)
            if engine:
                # "CC" holds the number written before the fuel word, as-is.
                data["CC"] = engine.group(1)
                data["Fuel Type"] = engine.group(2)
            elif text in ["AUTOMATIC", "MANUAL", "SEMI-AUTO"]:
                # A description value overrides the one found in the subtitle.
                data["Transmission"] = text
            elif "V5" in text:
                data["V5"] = text

    # --- Registration (VRN custom field) --------------------------------
    # Scan every custom-field unit: the VRN is not guaranteed to be the
    # first one on the page.
    for unit in soup.find_all("div", class_="detail__cfUnit"):
        name_div = unit.find("div", class_="detail__cfName")
        if name_div is None or name_div.get_text(strip=True) != "VRN":
            continue
        value_div = unit.find("div", class_="detail__cfValue")
        if value_div is not None:
            data["Reg"] = value_div.get_text(strip=True).replace(" ", "")
        break

    # --- Image thumbnails ------------------------------------------------
    base_url = "https://auctions.redcorn.co.uk/"
    images_div = soup.find("div", class_="detail__sectionBody detail__imageThumbnails")
    image_urls = []
    if images_div is not None:
        for a_tag in images_div.find_all("a", href=True):
            href = a_tag['href'].strip()
            if not href:
                continue
            # Site-relative links are made absolute against the auction host.
            if not href.lower().startswith("http"):
                href = base_url + href.lstrip("/")
            image_urls.append(href)

    data["Images"] = ",".join(image_urls)

    return data


def parse_folder(folder="html", output_csv="reconData.csv"):
    """Parse every saved lot page in ``folder`` and write the results to a CSV.

    Each ``*.html`` file is parsed with ``extract_data``; the file name
    without its extension is stored as the record's "Lot".

    Args:
        folder: Directory containing the saved lot pages.
        output_csv: Path of the CSV file to write.

    Returns:
        None. The CSV is written and a summary line is printed.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps the CSV
    # row order stable between runs.
    for file_name in sorted(os.listdir(folder)):
        if not file_name.endswith(".html"):
            continue
        with open(os.path.join(folder, file_name), "r", encoding="utf-8") as f:
            markup = f.read()
        record = extract_data(BeautifulSoup(markup, "html.parser"))
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip ".html" from the middle of a name.
        record["Lot"] = os.path.splitext(file_name)[0]
        records.append(record)

    column_order = [
        "Lot", "Title", "Year", "Make", "Model", "Variant", "Colour",
        "Transmission", "MOT Expiry Date",
        "Start Date", "Start Time", "End Date", "End Time",
        "Mileage", "CC", "Fuel Type", "V5", "Reg", "Images"
    ]

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=column_order)
        writer.writeheader()
        writer.writerows(records)

    print(f"‚úÖ Parsed {len(records)} HTML files. CSV saved as {output_csv}")


# Build reconData.csv from the saved lot pages in ./html when this cell/script is run directly.
if __name__ == "__main__":
    parse_folder("html")


‚úÖ Parsed 6 HTML files. CSV saved as reconData.csv


In [9]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Credentials are read from the environment so they are never stored in the
# notebook. NOTE(review): the account details that were previously committed
# here should be treated as exposed and rotated.
email = os.environ.get("TOTALCARCHECK_EMAIL")
password = os.environ.get("TOTALCARCHECK_PASSWORD")
if not email or not password:
    raise RuntimeError(
        "Set the TOTALCARCHECK_EMAIL and TOTALCARCHECK_PASSWORD environment "
        "variables before running this cell."
    )


# Registrations to look up; rows without a Reg are skipped.
df = pd.read_csv("reconData.csv")
reg_list = df["Reg"].dropna().astype(str).tolist()


# One HTML file per registration is written into this folder.
save_folder = "carcheckhtml"
os.makedirs(save_folder, exist_ok=True)


driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.maximize_window()

    driver.get("https://totalcarcheck.co.uk/Account/Login")

    # Login is best-effort: a failure is reported and the lookups still run.
    try:
        wait = WebDriverWait(driver, 10)

        wait.until(EC.presence_of_element_located((By.ID, "UserName"))).send_keys(email)
        wait.until(EC.presence_of_element_located((By.ID, "Password"))).send_keys(password)

        driver.find_element(By.XPATH, "//input[@type='submit' and @value='Log in']").click()
        # Printed once the form has been submitted; the resulting page is not inspected.
        print("‚úî Logged in successfully!")
    except Exception as e:
        print("‚ùå Login failed:", e)

    # Fixed pause to let the post-login navigation finish.
    time.sleep(3)

    for reg in reg_list:
        print(f"üîé Checking: {reg}")

        url = f"https://totalcarcheck.co.uk/FreeCheck?regno={reg}"
        driver.get(url)

        # Fixed pause so the result page can render before it is captured.
        time.sleep(4)

        html = driver.page_source

        file_path = os.path.join(save_folder, f"{reg}.html")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html)

        print(f"üì© Saved HTML: {file_path}")
finally:
    # Always close the browser, even when a lookup raises part-way through.
    driver.quit()
print("üéâ All record HTML saved successfully!")


‚úî Logged in successfully!
üîé Checking: AK20BEO
üì© Saved HTML: carcheckhtml\AK20BEO.html
üîé Checking: EU11HWP
üì© Saved HTML: carcheckhtml\EU11HWP.html
üîé Checking: KR59WKX
üì© Saved HTML: carcheckhtml\KR59WKX.html
üîé Checking: SL18CGZ
üì© Saved HTML: carcheckhtml\SL18CGZ.html
üîé Checking: GK64YDR
üì© Saved HTML: carcheckhtml\GK64YDR.html
üîé Checking: KN65BTE
üì© Saved HTML: carcheckhtml\KN65BTE.html
üéâ All record HTML saved successfully!


In [10]:
from bs4 import BeautifulSoup
import os, re
import pandas as pd

def T_scrap_by_html_to_csv(folder="carcheckhtml", output_csv="totalcarcheck.csv"):
    """Extract Reg, Body Style and Euro Status from saved check pages into a CSV.

    Args:
        folder: Directory holding the saved ``*.html`` result pages.
        output_csv: Path of the CSV file to write.

    Returns:
        None. Prints a message and returns early when ``folder`` is missing
        or empty; otherwise writes ``output_csv`` with the columns
        "Reg", "Body Style" and "Euro Status" (the header is written even
        when no page yielded a row).
    """
    if not os.path.exists(folder):
        print(f"Folder '{folder}' does not exist!")
        return

    files = sorted(os.listdir(folder))
    if not files:
        print("No HTML files found in folder.")
        return

    def get_value(soup, label):
        """Return the text of the table cell that follows the ``label`` cell, or ""."""
        # `string=` replaces the deprecated `text=` keyword of find().
        tag = soup.find("span", string=re.compile(label, re.I))
        if tag is None:
            return ""
        label_cell = tag.find_parent("td")
        if label_cell is None:
            # The label exists but is not inside a table cell.
            return ""
        value_cell = label_cell.find_next_sibling("td")
        return value_cell.get_text(strip=True) if value_cell is not None else ""

    columns = ["Reg", "Body Style", "Euro Status"]
    all_data = []

    for file in files:
        if not file.endswith(".html"):
            continue

        path = os.path.join(folder, file)
        with open(path, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        reg_span = soup.find("span", id="regPlateFreeCheck")
        if reg_span is not None:
            reg_text = reg_span.get_text(strip=True)
        else:
            # Page has no plate element (e.g. an error page): fall back to
            # the file name stem, assuming files are named "<reg>.html".
            reg_text = os.path.splitext(file)[0]

        all_data.append({
            "Reg": reg_text,
            "Body Style": get_value(soup, "Body Style"),
            "Euro Status": get_value(soup, "Euro Status"),
        })

    # Passing the columns explicitly keeps the header row even when no HTML
    # file was found, so the CSV can always be read back.
    df = pd.DataFrame(all_data, columns=columns)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"üöó Completed! Saved '{output_csv}'")

# Run with the defaults: read ./carcheckhtml and write totalcarcheck.csv.
T_scrap_by_html_to_csv()


  tag = soup.find("span", text=re.compile(label, re.I))


üöó Completed! Saved 'totalcarcheck.csv'


In [11]:
import pandas as pd


# Lot data scraped from the auction pages, and the per-registration lookup results.
df1 = pd.read_csv("reconData.csv")
df2 = pd.read_csv("totalcarcheck.csv")


# Normalise the join key on both sides (upper case, no spaces) so that
# "ab12 cde" and "AB12CDE" match. regex=False: the space is a literal.
df1['Reg'] = df1['Reg'].str.upper().str.replace(" ", "", regex=False)
df2['Reg'] = df2['Reg'].str.upper().str.replace(" ", "", regex=False)


# Left join: every lot is kept, lookup columns are blank where no match exists.
merged = pd.merge(df1, df2, on='Reg', how='left')

merged.to_csv("reconData_merged.csv", index=False, encoding='utf-8')

# Report the file that was actually written.
print("‚úî reconData_merged.csv created successfully!")


‚úî final_agnew.csv created successfully!


In [12]:
import os
import csv
from bs4 import BeautifulSoup

def extract_bidding_data(soup):
    """Extract the bid history and reserve status from one bidding page.

    Args:
        soup: Parsed bidding page (a BeautifulSoup document).

    Returns:
        dict with:
          "Bidding History": non-empty second-column cell texts of the
              ``table-bidHistory`` table, joined with ";" in page order.
          "No of Bids": number of collected bids (int).
          "Last Bid": the first collected bid, or "" when there are none
              (assumes the page lists the most recent bid first — TODO confirm).
          "Bidding Status": text of the span nested inside the
              ``reserve-not-met`` span, or "" when absent.
    """
    data = {}

    bids = []
    table = soup.find("table", class_="table-bidHistory")
    if table is not None:
        # Fall back to the table itself when it has no <tbody> element.
        body = table.find("tbody")
        row_source = body if body is not None else table
        for row in row_source.find_all("tr"):
            cells = row.find_all("td")
            # Skip header/placeholder rows that lack a second cell instead
            # of failing with IndexError.
            if len(cells) < 2:
                continue
            bid_text = cells[1].get_text(strip=True)
            if bid_text:
                bids.append(bid_text)

    data["Bidding History"] = ";".join(bids)
    data["No of Bids"] = len(bids)
    data["Last Bid"] = bids[0] if bids else ""

    reserve_span = soup.find("span", class_="reserve-not-met")
    status_span = reserve_span.find("span") if reserve_span is not None else None
    data["Bidding Status"] = status_span.get_text(strip=True) if status_span is not None else ""

    return data


def parse_bidding_folder(folder="bidding", output_csv="bidding.csv"):
    """Parse every saved bidding page in ``folder`` and write a summary CSV.

    Each ``*.html`` file is parsed with ``extract_bidding_data``; the file
    name without its extension is stored as the record's "Lot".

    Args:
        folder: Directory containing the saved bidding pages.
        output_csv: Path of the CSV file to write.

    Returns:
        None. The CSV is written and a confirmation line is printed.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps the CSV
    # row order stable between runs.
    for file_name in sorted(os.listdir(folder)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            markup = f.read()
        record = extract_bidding_data(BeautifulSoup(markup, "html.parser"))
        # splitext removes only the trailing extension, unlike str.replace.
        record["Lot"] = os.path.splitext(file_name)[0]
        records.append(record)

    fieldnames = ["Lot", "Bidding History", "No of Bids", "Last Bid", "Bidding Status"]
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)

    print(f"‚úî CSV Generated: {output_csv}")



# Run with the default folder: read ./bidding and write bidding.csv.
parse_bidding_folder()


‚úî CSV Generated: bidding.csv


In [13]:
import pandas as pd

# Load both inputs and normalise the "Lot" join key the same way on each:
# cast to text, upper-case, and drop spaces.
df1, df2 = (
    pd.read_csv(csv_path).assign(
        Lot=lambda frame: frame["Lot"].astype(str).str.upper().str.replace(" ", "", regex=False)
    )
    for csv_path in ("reconData_merged.csv", "bidding.csv")
)

# Left join so that every lot is kept even when it has no bidding record.
merged = df1.merge(df2, on="Lot", how="left")

merged.to_csv("final_refcorn.csv", index=False, encoding="utf-8")

print("‚úî final_refcorn.csv created successfully!")


‚úî final_refcorn.csv created successfully!


In [14]:
import os
import requests
import pandas as pd
from urllib.parse import urlparse, urljoin
from PIL import Image, ImageDraw, ImageFont


# Load the merged lot data that the image download works from.
df = pd.read_csv("final_refcorn.csv")


# Only the registration and its comma-separated image URL list are needed below.
reg_img = df[["Reg", "Images"]]

# Base against which download_images resolves any URL that does not start with "http".
BASE_URL = "https://auctions.redcorn.co.uk/"


def add_watermark_to_image(image_path, text="Sourced from Redcorn Salvage Auctions"):
    """Stamp a caption in the bottom-right corner of an image, in place.

    The caption is drawn in near-opaque white on a semi-transparent black
    rectangle, and the result is saved back to ``image_path`` as RGB.

    Args:
        image_path: Path of the image file to read and overwrite.
        text: Caption to draw.

    Returns:
        None. Failures are reported with a printed message rather than raised.
    """
    try:
        # The context manager releases the file handle before the same path
        # is overwritten below; convert() returns an independent copy.
        with Image.open(image_path) as source:
            image = source.convert("RGBA")

        txt_layer = Image.new("RGBA", image.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(txt_layer)

        try:
            font = ImageFont.truetype("arial.ttf", 20)
        except (OSError, ImportError):
            # Arial (or FreeType support) is unavailable: use Pillow's built-in font.
            font = ImageFont.load_default()

        margin = 10
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        tw, th = right - left, bottom - top

        x = image.width - tw - margin
        y = image.height - th - margin

        draw.rectangle(
            [x - 5, y - 5, x + tw + 5, y + th + 5],
            fill=(0, 0, 0, 160)
        )
        # textbbox is measured relative to the drawing origin and usually
        # starts at a non-zero offset; subtracting it puts the glyphs exactly
        # inside the rectangle instead of shifted down/right.
        draw.text((x - left, y - top), text, fill=(255, 255, 255, 220), font=font)

        final = Image.alpha_composite(image, txt_layer).convert("RGB")
        final.save(image_path)

        print(f"Watermarked: {image_path}")

    except Exception as e:
        print(f"Watermark failed: {e}")


def download_images(data, main_folder="Images"):
    """Download and watermark every image listed for each row of ``data``.

    Images are saved as ``<main_folder>/<reg>/<reg>_<n>.jpg`` (n starts at 1
    and follows the order of the URL list) and then passed to
    ``add_watermark_to_image``.

    Args:
        data: DataFrame with a "Reg" column and an "Images" column holding
            a comma-separated list of URLs.
        main_folder: Root directory for the downloads; created if missing.

    Returns:
        None. Per-image failures are printed and do not stop the run.
    """
    os.makedirs(main_folder, exist_ok=True)

    for idx, row in data.iterrows():
        raw_reg = row["Reg"]
        if pd.isna(raw_reg) or not str(raw_reg).strip():
            # A missing Reg would otherwise become the literal "nan" (or an
            # empty name), so all such rows would share one folder and
            # overwrite each other's files. Use a per-row placeholder.
            reg = f"unknown_{idx}"
        else:
            reg = str(raw_reg).strip()

        if pd.isna(row["Images"]) or not str(row["Images"]).strip():
            print(f"‚ùå No images for {reg}")
            continue

        image_urls = [u.strip() for u in str(row["Images"]).split(",") if u.strip()]

        reg_folder = os.path.join(main_folder, reg)
        os.makedirs(reg_folder, exist_ok=True)

        print(f"\nüìå Downloading images for: {reg}")

        for i, url in enumerate(image_urls, start=1):

            # Resolve site-relative links against the auction host.
            if not url.startswith("http"):
                url = urljoin(BASE_URL, url)

            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                print(f"‚ùå Invalid URL skipped: {url}")
                continue

            save_path = os.path.join(reg_folder, f"{reg}_{i}.jpg")

            try:
                response = requests.get(url, timeout=20)
                response.raise_for_status()

                with open(save_path, "wb") as f:
                    f.write(response.content)

                add_watermark_to_image(save_path)

                print(f"‚úî Downloaded: {save_path}")

            except Exception as e:
                # Best-effort: report the failure and carry on with the next image.
                print(f"‚ùå Failed: {url} | Error: {e}")


# Download and watermark the images for every row when this cell/script is run directly.
if __name__ == "__main__":
    download_images(reg_img)



üìå Downloading images for: AK20BEO
Watermarked: Images\AK20BEO\AK20BEO_1.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_1.jpg
Watermarked: Images\AK20BEO\AK20BEO_2.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_2.jpg
Watermarked: Images\AK20BEO\AK20BEO_3.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_3.jpg
Watermarked: Images\AK20BEO\AK20BEO_4.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_4.jpg
Watermarked: Images\AK20BEO\AK20BEO_5.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_5.jpg
Watermarked: Images\AK20BEO\AK20BEO_6.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_6.jpg
Watermarked: Images\AK20BEO\AK20BEO_7.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_7.jpg
Watermarked: Images\AK20BEO\AK20BEO_8.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_8.jpg
Watermarked: Images\AK20BEO\AK20BEO_9.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_9.jpg
Watermarked: Images\AK20BEO\AK20BEO_10.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_10.jpg
Watermarked: Images\AK20BEO\AK20BEO_11.jpg
‚úî Downloaded: Images\AK20BEO\AK20BEO_11.jpg
W