In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import time

# Read the input workbook; its 'Link' column is iterated by the scraping loop below
file_path = "data_final.xlsx"
df = pd.read_excel(io=file_path)

def initialize_driver(
    executable_path="/home/dragon/Git/pythonclass/webscrapping school/geckodriver",
    headless=True,
):
    """Create a new Firefox WebDriver session.

    Args:
        executable_path: Filesystem path of the geckodriver binary. Defaults to
            the location this notebook was originally written against; pass a
            different path when running on another machine.
        headless: When True (the default) Firefox is started with
            ``--headless`` so no browser window is opened.

    Returns:
        The ``webdriver.Firefox`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    if headless:
        options.add_argument("--headless")
    # NOTE(review): these two look like Chromium switches; confirm that
    # Firefox/geckodriver actually honours them before relying on them.
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service(executable_path=executable_path)
    return webdriver.Firefox(service=service, options=options)

# Parallel accumulators: position k of each list describes the same output row.
authors = []
affiliations = []
links = []

# Fixed pauses (seconds) and the selector used to find the author buttons.
PAGE_LOAD_WAIT_SECONDS = 5
SIDE_PANEL_WAIT_SECONDS = 3
AUTHOR_BUTTON_SELECTOR = ".button-link.button-link-secondary.button-link-underline"

# Visit every URL in the 'Link' column, using a fresh browser session per URL.
for i, link in enumerate(df['Link']):
    # Start from None so the finally-clause can tell whether a session was
    # actually created; otherwise a failure inside initialize_driver() would
    # hit an undefined name or re-quit the previous iteration's driver.
    driver = None
    # Rows for this link only; they are committed to the shared lists after
    # the link has been fully handled, so an interrupt mid-link cannot leave
    # a half-scraped link (or lists of unequal length) behind.
    link_rows = []
    try:
        driver = initialize_driver()
        print(f"Opening link {i + 1}")  # Print the current link number
        driver.get(link)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)  # Let the page load fully

        # Author buttons live inside the element with id="author-group"
        author_group_div = driver.find_element(By.ID, "author-group")
        buttons = author_group_div.find_elements(By.CSS_SELECTOR, AUTHOR_BUTTON_SELECTOR)

        for button in buttons:
            author_text = button.text  # The button label is the author name
            button.click()
            time.sleep(SIDE_PANEL_WAIT_SECONDS)  # Allow the side panel to open

            # The affiliation is read from the side panel opened by the click
            side_panel_content = driver.find_element(By.CLASS_NAME, "side-panel-content")
            affiliation_div = side_panel_content.find_element(By.CLASS_NAME, "affiliation")
            link_rows.append((link, author_text, affiliation_div.text))

    except Exception as e:
        print(f"Error processing link {i + 1}: {e}")
        # Keep whatever was collected before the failure and add a marker row
        link_rows.append((link, "N/A", "N/A"))

    finally:
        # Terminate the WebDriver after processing the link, if one was started
        if driver is not None:
            driver.quit()

    # Reached only when the link finished (successfully or with a handled error)
    links.extend(row[0] for row in link_rows)
    authors.extend(row[1] for row in link_rows)
    affiliations.extend(row[2] for row in link_rows)

# Destination workbook for the scraped rows
output_file_path = "collected_data.xlsx"

# Assemble the three parallel lists into one table, one column per list
output_df = pd.DataFrame(dict(Link=links, Author=authors, Affiliation=affiliations))

# Write the table without the pandas index column
output_df.to_excel(output_file_path, index=False)

print(f"Data collected and saved to {output_file_path}")


Opening link 1
Opening link 2
Opening link 3
Opening link 4
Opening link 5
Opening link 6
Opening link 7
Opening link 8
Opening link 9
Opening link 10
Opening link 11
Opening link 12
Opening link 13
Opening link 14
Opening link 15
Opening link 16
Opening link 17
Opening link 18
Opening link 19
Opening link 20
Opening link 21
Opening link 22
Opening link 23
Opening link 24
Opening link 25
Opening link 26
Opening link 27
Opening link 28
Opening link 29
Opening link 30
Opening link 31
Opening link 32
Opening link 33
Opening link 34
Opening link 35
Opening link 36
Opening link 37
Opening link 38
Opening link 39
Opening link 40
Opening link 41
Opening link 42
Opening link 43
Opening link 44
Opening link 45
Opening link 46
Opening link 47
Opening link 48
Opening link 49
Opening link 50
Opening link 51
Opening link 52
Opening link 53
Opening link 54
Opening link 55
Opening link 56
Opening link 57
Opening link 58
Opening link 59
Opening link 60
Opening link 61
Opening link 62
Opening link 63
O

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8501abda60>>
Traceback (most recent call last):
  File "/home/dragon/.local/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 

In [3]:

# Destination workbook for the scraped rows
output_file_path = "collected_data.xlsx"

# Assemble the three parallel lists into one table, one column per list
output_df = pd.DataFrame(dict(Link=links, Author=authors, Affiliation=affiliations))

# Write the table without the pandas index column
output_df.to_excel(output_file_path, index=False)

print(f"Data collected and saved to {output_file_path}")


Data collected and saved to collected_data.xlsx


In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import time

# Input workbook: its 'Link' column lists the pages to scrape
file_path = "data_final.xlsx"
df = pd.read_excel(file_path)

# Results workbook from earlier runs; used to skip links that already have rows
collected_file_path = "collected_data.xlsx"
try:
    collected_df = pd.read_excel(collected_file_path)
except FileNotFoundError:
    # No earlier results: start with an empty table and nothing to skip
    collected_df = pd.DataFrame(columns=['Link', 'Author', 'Affiliation'])
    existing_links = set()
else:
    # A set gives constant-time membership checks in the scraping loop
    existing_links = {*collected_df['Link']}

def initialize_driver(
    executable_path="/home/dragon/Git/pythonclass/webscrapping school/geckodriver",
    headless=True,
):
    """Create a new Firefox WebDriver session.

    Args:
        executable_path: Filesystem path of the geckodriver binary. Defaults to
            the location this notebook was originally written against; pass a
            different path when running on another machine.
        headless: When True (the default) Firefox is started with
            ``--headless`` so no browser window is opened.

    Returns:
        The ``webdriver.Firefox`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    if headless:
        options.add_argument("--headless")
    # NOTE(review): these two look like Chromium switches; confirm that
    # Firefox/geckodriver actually honours them before relying on them.
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service(executable_path=executable_path)
    return webdriver.Firefox(service=service, options=options)

# Column order of every row written to the results workbook.
RESULT_COLUMNS = ['Link', 'Author', 'Affiliation']

# Fixed pauses (seconds) and the selector used to find the author buttons.
PAGE_LOAD_WAIT_SECONDS = 5
SIDE_PANEL_WAIT_SECONDS = 3
AUTHOR_BUTTON_SELECTOR = ".button-link.button-link-secondary.button-link-underline"

# Visit every URL in the 'Link' column that has no rows in the results yet.
for i, link in enumerate(df['Link']):
    if link in existing_links:
        print(f"Link {i + 1} already scraped, skipping.")
        continue  # Skip already scraped links

    # Start from None so the finally-clause can tell whether a session was
    # actually created; otherwise a failure inside initialize_driver() would
    # hit an undefined name or re-quit the previous iteration's driver.
    driver = None
    # Rows for this link only. They are written in one step once the link is
    # finished: an interrupt mid-link then saves nothing for it, so the resume
    # check above retries the link instead of skipping a half-scraped one.
    link_records = []
    try:
        driver = initialize_driver()
        print(f"Opening link {i + 1}: {link}")  # Print the current link number
        driver.get(link)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)  # Let the page load fully

        # Author buttons live inside the element with id="author-group"
        author_group_div = driver.find_element(By.ID, "author-group")
        buttons = author_group_div.find_elements(By.CSS_SELECTOR, AUTHOR_BUTTON_SELECTOR)

        for button in buttons:
            author_text = button.text  # The button label is the author name
            button.click()
            time.sleep(SIDE_PANEL_WAIT_SECONDS)  # Allow the side panel to open

            # The affiliation is read from the side panel opened by the click
            side_panel_content = driver.find_element(By.CLASS_NAME, "side-panel-content")
            affiliation_div = side_panel_content.find_element(By.CLASS_NAME, "affiliation")

            link_records.append(
                {'Link': link, 'Author': author_text, 'Affiliation': affiliation_div.text}
            )
            print(f"Data for author '{author_text}' added.")

    except Exception as e:
        print(f"Error processing link {i + 1}: {e}")
        # Keep rows collected before the failure and add an N/A marker row so
        # the link is recorded as attempted
        link_records.append({'Link': link, 'Author': "N/A", 'Affiliation': "N/A"})

    finally:
        # Terminate the WebDriver after processing the link, if one was started
        if driver is not None:
            driver.quit()

    # Reached only when the link finished (successfully or with a handled
    # error). One concat and one file write per link, not one per author.
    if link_records:
        new_data = pd.DataFrame(link_records, columns=RESULT_COLUMNS)
        collected_df = pd.concat([collected_df, new_data], ignore_index=True)
        collected_df.to_excel(collected_file_path, index=False)
        # Prevents re-scraping if the same URL appears again later in the input
        existing_links.add(link)

print(f"Data collection completed. All data saved to {collected_file_path}")


Link 1 already scraped, skipping.
Link 2 already scraped, skipping.
Link 3 already scraped, skipping.
Link 4 already scraped, skipping.
Link 5 already scraped, skipping.
Link 6 already scraped, skipping.
Link 7 already scraped, skipping.
Link 8 already scraped, skipping.
Link 9 already scraped, skipping.
Link 10 already scraped, skipping.
Link 11 already scraped, skipping.
Link 12 already scraped, skipping.
Link 13 already scraped, skipping.
Link 14 already scraped, skipping.
Link 15 already scraped, skipping.
Link 16 already scraped, skipping.
Link 17 already scraped, skipping.
Link 18 already scraped, skipping.
Link 19 already scraped, skipping.
Link 20 already scraped, skipping.
Link 21 already scraped, skipping.
Link 22 already scraped, skipping.
Link 23 already scraped, skipping.
Link 24 already scraped, skipping.
Link 25 already scraped, skipping.
Link 26 already scraped, skipping.
Link 27 already scraped, skipping.
Link 28 already scraped, skipping.
Link 29 already scraped, skip

KeyboardInterrupt: 