In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
# Scrape configuration: the metadata stamped on every scraped row, the paged
# listing URL (the page index is substituted into `p={}`), and how many pages
# to request.
system_name, location = "DeCrescent 178kW PV", "United States"
pages_to_scrape = 5
base_url = "https://pvoutput.org/list.jsp?p={}&id=32413&sid=29714&gs=0&v=0&o=date&d=desc"

In [3]:
# Configure a headless Chrome session that uses the locally installed
# "Chrome for Testing" binary and its matching chromedriver.
options = Options()
options.binary_location = r"C:\Program Files\chrome for testing\chrome-win64\chrome.exe"

# Command-line switches passed to Chrome, applied in this order.
for chrome_flag in (
    "--headless=new",
    "--disable-gpu",
    "--window-size=1920,1080",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

service = Service(executable_path=r"C:\Program Files\chrome for testing\chrome-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

In [4]:
# Scrape `pages_to_scrape` listing pages and collect one list per table row.
# Every kept row is exactly: the first nine table cells, then `system_name`
# and `location` -- eleven values, matching the eleven-column header used
# when the DataFrame is built later.
all_data = []
table_cell_count = 9  # table cells kept per row, before the two metadata values

try:
    for page in range(pages_to_scrape):
        url = base_url.format(page)
        print(f"Scraping page {page + 1} from: {url}")

        try:
            driver.get(url)
            time.sleep(6)  # pause before reading the page source
            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find("table", id="tbl_main")

            if not table:
                print(f"No table found on page {page + 1}. Retrying once...")
                time.sleep(5)
                driver.get(url)
                # Pause after the reload too, exactly like the first attempt;
                # previously the retry parsed the page source immediately.
                time.sleep(6)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                table = soup.find("table", id="tbl_main")

            if table:
                # Skip the first <tr> (treated as the header row).
                for row in table.find_all("tr")[1:]:
                    cols = [td.get_text(strip=True).replace(',', '') for td in row.find_all("td")]
                    if len(cols) >= table_cell_count:
                        # Trim any extra cells so every row has the same width;
                        # a wider row would make the 11-column DataFrame
                        # construction fail.
                        all_data.append(cols[:table_cell_count] + [system_name, location])
            else:
                print(f"Failed again: No table found on page {page + 1}")
        except Exception as e:
            # Best effort: report the failure and continue with the next page.
            print(f"Error scraping page {page + 1}: {e}")
finally:
    # Always close the browser, even if the loop is interrupted.
    driver.quit()

📄 Scraping page 1 from: https://pvoutput.org/list.jsp?p=0&id=32413&sid=29714&gs=0&v=0&o=date&d=desc
📄 Scraping page 2 from: https://pvoutput.org/list.jsp?p=1&id=32413&sid=29714&gs=0&v=0&o=date&d=desc
📄 Scraping page 3 from: https://pvoutput.org/list.jsp?p=2&id=32413&sid=29714&gs=0&v=0&o=date&d=desc
📄 Scraping page 4 from: https://pvoutput.org/list.jsp?p=3&id=32413&sid=29714&gs=0&v=0&o=date&d=desc
📄 Scraping page 5 from: https://pvoutput.org/list.jsp?p=4&id=32413&sid=29714&gs=0&v=0&o=date&d=desc


In [5]:
# Column names for the scraped rows: the nine table cells in the order they
# are read, followed by the two metadata values appended to each row.
header = [
    "Date",
    "Generated",
    "Efficiency",
    "Exported",
    "Peak Power",
    "Peak Time",
    "Conditions",
    "Temperature",
    "Comments",
] + ["System Name", "Location"]

In [6]:
# Turn the scraped rows into a table, save every column to CSV, and show
# the row count plus the first few rows.
df = pd.DataFrame(data=all_data, columns=header)
df.to_csv(path_or_buf="daily_DeCrescent_1yr.csv", index=False)
print(f"Saved {len(df)} rows to daily_DeCrescent_1yr.csv")
preview = df.head()
print(preview)

✅ Saved 200 rows to daily_DeCrescent_1yr.csv
       Date   Generated   Efficiency Exported Peak Power Peak Time  \
0  28/03/25  427.154kWh  3.797kWh/kW        -  100.276kW   12:25PM   
1  27/03/25  743.860kWh  6.612kWh/kW        -  100.278kW    1:05PM   
2  26/03/25  565.422kWh  5.026kWh/kW        -  104.259kW   11:40AM   
3  25/03/25  242.491kWh  2.155kWh/kW        -  100.356kW   10:50AM   
4  24/03/25   83.653kWh  0.744kWh/kW        -   48.898kW    4:40PM   

      Conditions Temperature                Comments          System Name  \
0  Mostly Cloudy           -  Updated 00:16 UTC-4:00  DeCrescent 178kW PV   
1           Fine           -  Updated 00:42 UTC-4:00  DeCrescent 178kW PV   
2           Fine           -  Updated 00:18 UTC-4:00  DeCrescent 178kW PV   
3         Cloudy           -  Updated 00:13 UTC-4:00  DeCrescent 178kW PV   
4        Showers           -  Updated 00:29 UTC-4:00  DeCrescent 178kW PV   

        Location  
0  United States  
1  United States  
2  United Stat

In [7]:
# Load the five additional page CSVs (pages 6-10) from the working directory,
# in page order, one DataFrame per file.
page6, page7, page8, page9, page10 = map(
    pd.read_csv,
    (
        "DeCrescent_page6.csv",
        "DeCrescent_page7.csv",
        "DeCrescent_page8.csv",
        "DeCrescent_page9.csv",
        "DeCrescent_page10.csv",
    ),
)

In [8]:
# Rebuild the table from the scraped rows and discard the columns listed
# below, leaving Date, Generated, Efficiency, System Name and Location.
dropped_columns = ["Exported", "Temperature", "Comments", "Peak Power", "Peak Time", "Conditions"]
df = pd.DataFrame(all_data, columns=header).drop(columns=dropped_columns)
print(df.head())

       Date   Generated   Efficiency          System Name       Location
0  28/03/25  427.154kWh  3.797kWh/kW  DeCrescent 178kW PV  United States
1  27/03/25  743.860kWh  6.612kWh/kW  DeCrescent 178kW PV  United States
2  26/03/25  565.422kWh  5.026kWh/kW  DeCrescent 178kW PV  United States
3  25/03/25  242.491kWh  2.155kWh/kW  DeCrescent 178kW PV  United States
4  24/03/25   83.653kWh  0.744kWh/kW  DeCrescent 178kW PV  United States


In [37]:
# Tag the page-10 rows with the same system metadata as the scraped rows.
# The values come from the config cell (`system_name`, `location`) rather
# than repeated literals, so the two sources cannot drift apart.
# NOTE(review): only page10 is tagged here -- confirm that the page6-page9
# CSVs already contain "System Name" and "Location" columns.
page10["System Name"] = system_name
page10["Location"] = location

In [39]:
# Stack the scraped rows and the five page CSVs into one table with a fresh
# 0..n-1 index; columns are matched by name.
# NOTE(review): the displayed result has an "Unnamed: 2" column, which
# suggests one of the page CSVs has a header-less column -- confirm whether
# it holds data. The shown rows also mix Date and Efficiency formats between
# the scraped rows and the CSV rows -- confirm before analysis.
frames = [df, page6, page7, page8, page9, page10]
master_df = pd.concat(frames, ignore_index=True)

In [11]:
# Display the combined frame as this cell's output.
master_df

Unnamed: 0,Date,Generated,Efficiency,System Name,Location,Unnamed: 2
0,28/03/25,427.154kWh,3.797kWh/kW,DeCrescent 178kW PV,United States,
1,27/03/25,743.860kWh,6.612kWh/kW,DeCrescent 178kW PV,United States,
2,26/03/25,565.422kWh,5.026kWh/kW,DeCrescent 178kW PV,United States,
3,25/03/25,242.491kWh,2.155kWh/kW,DeCrescent 178kW PV,United States,
4,24/03/25,83.653kWh,0.744kWh/kW,DeCrescent 178kW PV,United States,
...,...,...,...,...,...,...
372,3/10/2024,209.171kWh,1.859298,DeCrescent 178kW PV,United States,
373,3/9/2024,198.313kWh,1.762782,DeCrescent 178kW PV,United States,
374,3/8/2024,701.856kWh,6.23872,DeCrescent 178kW PV,United States,
375,3/7/2024,282.632kWh,2.512284,DeCrescent 178kW PV,United States,


In [45]:
# Add a constant system-size column to every row. 112.5 is approximately
# Generated / Efficiency in the displayed rows (e.g. 427.154 / 3.797).
# NOTE(review): the system name says 178kW -- confirm 112.5 is the intended size.
master_df = master_df.assign(**{"System Size (kW)": 112.5})

In [47]:
# Write the combined table to disk without the row index.
master_df.to_csv(path_or_buf="daily_DeCrescent_master.csv", index=False)