In [11]:
from bs4 import BeautifulSoup

# Sample HTML input
html_input = '''
<a href="/faculty/alice-m-agogino" class="field--name-field-name"><span class="field field--name-title field--type-string field--label-hidden">Alice M. Agogino</span>
</a>
'''

# Base URL
base_url = "https://vcresearch.berkeley.edu"

# Parse the HTML
soup = BeautifulSoup(html_input, 'html.parser')

# Find the anchor tag and extract the href attribute
anchor_tag = soup.find('a', class_='field--name-field-name')
relative_url = anchor_tag['href']

# Construct the full URL
full_url = base_url + relative_url

print(full_url)


https://vcresearch.berkeley.edu/faculty/alice-m-agogino


In [14]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_links(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('div', class_="views-field views-field-rendered-entity")

        # Find all anchor tags
        links = soup.find('a', href=True)

        # Extract and print all href attributes (links)
        extracted_links = []
        for link in links:
            href = link['href']
            full_url = urljoin(url, href)  # Convert relative URLs to absolute URLs
            extracted_links.append(full_url)

        return extracted_links

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Example usage:
url = "https://vcresearch.berkeley.edu/faculty/alice-m-agogino"
links = extract_links(url)
for link in links:
    print(link)


TypeError: string indices must be integers

In [22]:
import requests
from bs4 import BeautifulSoup

def extract_url(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the span tag with class 'field--name-title'
        span_tag = soup.find('span', class_='field--name-title')

        if span_tag:
            # Find the parent <a> tag
            anchor_tag = span_tag.parent

            if anchor_tag and anchor_tag.name == 'a' and anchor_tag.has_attr('href'):
                href_attr = anchor_tag['href']
                
                # Check if href_attr starts with '/faculty/'
                if href_attr.startswith('/faculty/'):
                    return href_attr.strip()  # Return the extracted URL path

        return None  # Return None if no URL is found

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Example usage:
url = "https://vcresearch.berkeley.edu/faculty-expertise"
extracted_url = extract_url(url)
if extracted_url:
    print(f"Extracted URL: {extracted_url}")
else:
    print("URL extraction failed.")


Extracted URL: /faculty/pieter-abbeel


In [24]:
import requests
from bs4 import BeautifulSoup

def extract_urls(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all span tags with class 'field--name-title'
        span_tags = soup.find_all('span', class_='field--name-title')

        extracted_urls = []

        for span_tag in span_tags:
            # Find the parent <a> tag
            anchor_tag = span_tag.parent

            if anchor_tag and anchor_tag.name == 'a' and anchor_tag.has_attr('href'):
                href_attr = anchor_tag['href']

                # Check if href_attr starts with '/faculty/'
                if href_attr.startswith('/faculty/'):
                    extracted_urls.append(href_attr.strip())  # Append the extracted URL path

        return extracted_urls

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Example usage:
url = "https://vcresearch.berkeley.edu/faculty-expertise"
extracted_urls = extract_urls(url)

if extracted_urls:
    for url in extracted_urls:
        print(f"Extracted URL: {url}")
else:
    print("URL extraction failed or no URLs found.")


Extracted URL: /faculty/pieter-abbeel
Extracted URL: /faculty/marie-abe
Extracted URL: /faculty/brooks-abel
Extracted URL: /faculty/elizabeth-abel
Extracted URL: /faculty/rebecca-abergel
Extracted URL: /faculty/dor-abrahamson
Extracted URL: /faculty/barbara-abrams
Extracted URL: /faculty/kathryn-abrams
Extracted URL: /faculty/charisma-acey
Extracted URL: /faculty/david-ackerly
Extracted URL: /faculty/hillel-adesnik
Extracted URL: /faculty/ilan-adler
Extracted URL: /faculty/mina-aganagic
Extracted URL: /faculty/sabrina-agarwal
Extracted URL: /faculty/vinod-aggarwal
Extracted URL: /faculty/alice-m-agogino
Extracted URL: /faculty/ian-agol
Extracted URL: /faculty/adrian-aguilera
Extracted URL: /faculty/jennifer-ahern
Extracted URL: /faculty/wali-ahmadi
Extracted URL: /faculty/asad-q-ahmed
Extracted URL: /faculty/ashok-ajoy
Extracted URL: /faculty/george-akerlof
Extracted URL: /faculty/zakaria-al-balushi
Extracted URL: /faculty/ahmed-alaa


In [25]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

def fetch_urls_from_page(url):
    links = []
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all span tags with class 'field--name-title'
            span_tags = soup.find_all('span', class_='field--name-title')

            for span_tag in span_tags:
                # Find the parent <a> tag
                anchor_tag = span_tag.parent

                if anchor_tag and anchor_tag.name == 'a' and anchor_tag.has_attr('href'):
                    href_attr = anchor_tag['href']

                    # Check if href_attr starts with '/faculty/'
                    if href_attr.startswith('/faculty/'):
                        full_url = urljoin(url, href_attr)
                        links.append(full_url)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

    return links

# Example usage:
directory_url = "https://vcresearch.berkeley.edu/faculty-expertise"
all_links = fetch_urls_from_page(directory_url)

# Prepare the data for CSV
data = []
for link in all_links:
    profile = {
        'University': 'Berkeley',
        'Department': 'N/A',
        'Name': 'N/A',
        'Position': 'N/A',
        'Link': link,
        'Email': 'N/A',
        'Research Focus': 'N/A'
    }
    data.append(profile)

# Define the CSV file path
csv_file = "profiles_urls.csv"

# Specify the headers/column names
headers = ['University', 'Department', 'Name', 'Position', 'Link', 'Email', 'Research Focus']

# Write data to CSV
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()
    writer.writerows(data)

print(f"Data has been saved to {csv_file}.")


Data has been saved to profiles_urls.csv.


In [28]:
import requests
from bs4 import BeautifulSoup

def extract_faculty_info(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful
    html_content = response.text
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract email
    email_tag = soup.find('a', href=lambda x: x and x.startswith('mailto:'))
    email = email_tag.get_text(strip=True) if email_tag else ""

    # Extract position
    position_tag = soup.find('p', class_='large')
    position = position_tag.get_text(strip=True) if position_tag else ""

    # Extract research expertise
    expertise_heading = soup.find('h2', text='Research Expertise and Interest')
    expertise_tag = expertise_heading.find_next_sibling('p') if expertise_heading else None
    research_expertise = expertise_tag.get_text(strip=True) if expertise_tag else ""

    return {
        'email': email,
        'position': position,
        'research_expertise': research_expertise
    }

# Example usage
url = 'https://vcresearch.berkeley.edu/faculty/david-m-auslander'
faculty_info = extract_faculty_info(url)

print("Email:", faculty_info['email'])
print("Position:", faculty_info['position'])
print("Research Expertise:", faculty_info['research_expertise'])


Email: dma@me.berkeley.edu
Position: Professor of the Graduate School
Research Expertise: control systems,                              simulation,                              mechatronics,                              real time software,                              energy management,                              satellite attitude control,                              demand response,                              machine control


  expertise_heading = soup.find('h2', text='Research Expertise and Interest')


In [29]:
import requests
from bs4 import BeautifulSoup

def extract_faculty_info(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful
    html_content = response.text
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract email
    email_tag = soup.find('a', href=lambda x: x and x.startswith('mailto:'))
    email = email_tag.get_text(strip=True) if email_tag else ""

    # Extract position
    position_tag = soup.find('p', class_='large')
    position = position_tag.get_text(strip=True) if position_tag else ""

    # Extract research expertise
    expertise_heading = soup.find('h2', text='Research Expertise and Interest')
    expertise_tag = expertise_heading.find_next_sibling('p') if expertise_heading else None
    research_expertise = expertise_tag.get_text(strip=True) if expertise_tag else ""
    
    # Clean up extra spaces in research expertise
    research_expertise = ' '.join(research_expertise.split())

    return {
        'email': email,
        'position': position,
        'research_expertise': research_expertise
    }

# Example usage
url = 'https://vcresearch.berkeley.edu/faculty/david-m-auslander'
faculty_info = extract_faculty_info(url)

print("Email:", faculty_info['email'])
print("Position:", faculty_info['position'])
print("Research Expertise:", faculty_info['research_expertise'])


Email: dma@me.berkeley.edu
Position: Professor of the Graduate School
Research Expertise: control systems, simulation, mechatronics, real time software, energy management, satellite attitude control, demand response, machine control


  expertise_heading = soup.find('h2', text='Research Expertise and Interest')


In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_data(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful
    html_content = response.text
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract position
    position_tag = soup.find('p', class_='large')
    position = position_tag.get_text(strip=True) if position_tag else ""

    # Extract email
    email_tag = soup.find('a', href=lambda x: x and x.startswith('mailto:'))
    email = email_tag.get_text(strip=True) if email_tag else ""

    # Extract research expertise
    expertise_heading = soup.find('h2', text='Research Expertise and Interest')
    expertise_tag = expertise_heading.find_next_sibling('p') if expertise_heading else None
    research_expertise = expertise_tag.get_text(strip=True) if expertise_tag else ""
    
    # Clean up extra spaces in research expertise
    research_expertise = ' '.join(research_expertise.split())

    return {
        "Position": position,
        "Email": email,
        "Research Focus": research_expertise,
    }

# Path to the CSV file
csv_file = r'D:\Files\Upwork\Scrape\Us_30_Uni_engineering\Src\4_Berkeley\MechanicalEngineering\me.csv'

# Read the CSV file
df = pd.read_csv(csv_file)

# Initialize new columns in the DataFrame
df['Position'] = ''
df['Email'] = ''
df['Research Focus'] = ''

url = 'https://vcresearch.berkeley.edu/faculty/david-m-auslander'
# Loop through URLs and scrape data
for index, row in df.iterrows():
    url = row['Link']  # Assuming the column containing URLs is named 'Link'
    scraped_data = scrape_data(url)
    df.at[index, 'Position'] = scraped_data["Position"]
    df.at[index, 'Email'] = scraped_data["Email"]
    df.at[index, 'Research Focus'] = scraped_data["Research Focus"]

# Save the updated DataFrame back to the CSV
output_csv_file = 'Mechanical_result.csv'
df.to_csv(output_csv_file, index=False)

print("Scraping and CSV update complete.")


  expertise_heading = soup.find('h2', text='Research Expertise and Interest')


Scraping and CSV update complete.


In [33]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(r'D:\Files\Upwork\Scrape\Us_30_Uni_engineering\Src\4_Berkeley\profiles_urls.csv')

# Filter rows that contain "engineering" in the "Department" column (case insensitive)
filtered_df = df[df['Department'].str.contains('engineering', case=False, na=False)]

# Save the filtered data to a new CSV file
filtered_df.to_csv('filtered_profiles_urls.csv', index=False)

print("Filtered CSV file has been saved as 'filtered_profiles_urls.csv'.")


Filtered CSV file has been saved as 'filtered_profiles_urls.csv'.
