In [7]:
import requests
from bs4 import BeautifulSoup

# BASE_URL
BASE_URL = 'https://www.iisermohali.ac.in'

# Make an HTTP request to get the content of the website
response = requests.get(f'{BASE_URL}/faculty/people/faculty')

# Check if the request was successful
if response.status_code != 200:
    print("Failed to retrieve the webpage.")
    exit()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract faculty rows from the table (skipping the header)
faculty_rows = soup.find_all('tr')[1:]

# Extracting faculty details
faculty_details = []

for row in faculty_rows:
    columns = row.find_all('td')

    # Only consider rows with the expected number of columns (6 columns in this case)
    if len(columns) == 6:
        name_element = columns[0].find('a')
        name = name_element.text if name_element else columns[0].text.strip()
        link = BASE_URL + name_element['href'] if name_element else "No Link Provided"
        department = columns[1].text.strip()
        expertise = columns[2].text.strip()
        office = columns[3].text.strip()
        phone = columns[4].text.strip()
        email = columns[5].text.strip() + "@iisermohali.ac.in"

        faculty_details.append({
            "Name": name,
            "Link": link,
            "Department": department,
            "Expertise": expertise,
            "Office": office,
            "Phone": phone,
            "Email": email
        })

# Print or further process the faculty_details list
for faculty in faculty_details:
    print(faculty)


{'Name': 'Abhik Ganguli', 'Link': 'https://www.iisermohali.ac.in/faculty/dms/aganguli', 'Department': 'Mathematics', 'Expertise': 'Number Theory', 'Office': 'AB2-2F1', 'Phone': '-', 'Email': 'aganguli@iisermohali.ac.in'}
{'Name': 'Abhishek Chaudhuri', 'Link': 'https://www.iisermohali.ac.in/faculty/dps/abhishek', 'Department': 'Physics', 'Expertise': 'Condensed Matter Theory : Soft and Biological Matter', 'Office': 'AB1-2F11', 'Phone': '135', 'Email': 'abhishek@iisermohali.ac.in'}
{'Name': "Adrene Freeda D 'cruz", 'Link': 'https://www.iisermohali.ac.in/faculty/hss/adrene', 'Department': 'HSS', 'Expertise': 'Postwar American Literature', 'Office': '-', 'Phone': '-', 'Email': 'adrene@iisermohali.ac.in'}
{'Name': 'Alok Kumar Maharana', 'Link': 'https://www.iisermohali.ac.in/faculty/dms/maharana', 'Department': 'Mathematics', 'Expertise': 'Algebraic Geometry', 'Office': '-', 'Phone': '-', 'Email': 'maharana@iisermohali.ac.in'}
{'Name': 'Amit Kulshrestha', 'Link': 'https://www.iisermohali.ac

In [14]:
import requests
from bs4 import BeautifulSoup
import re

response = requests.get("https://www.iisermohali.ac.in/faculty/dps/abhishek")
soup = BeautifulSoup(response.content, 'html.parser')

pattern = re.compile(r'font-family:\s*arial,helvetica,sans-serif')
elements = soup.find_all('span', style=pattern)
texts = [element.text.strip() for element in elements]

for text in texts:
    print(text)


Associate Professor, Physical Sciences
Email
abhishek(AT)iisermohali.ac.in
Phone
+91 172 2293129
Fax
+91 172 2240266
Personal Page

Research Area
Condensed Matter Theory : Soft and Biological Matter
Research Focus
Research Focus
Research Area: Condensed Matter Theory : Soft and Biological MatterThe aim of our group is to understand the physical properties of biological and soft condensed matter systems that are driven out of equilibrium. We use both analytical approaches (Equilibrium and Non-equilibrium Statistical Mechanics, Hydrodynamics) and computational methods (Molecular Dynamics, Brownian Dynamics, Monte Carlo) to investigate the dynamics of systems ranging from the cell membrane and the cell cytoskeleton to polymers and colloids in confinement.The cell is an active dynamical medium, constantly generating and dissipating energy to sustain the various life processes. It is subject to active stresses arising from a meshwork of filaments (cell cytoskeleton), which is driven out of 

In [17]:

url = "https://www.iisermohali.ac.in/faculty/dps/abhishek"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Fetching text with the specified font family
texts = [span.get_text(strip=True) for span in soup.find_all('span', style=lambda value: value and "font-family: arial,helvetica,sans-serif;" in value)]

# Proceeding with the organization of the data
data = {}
data['Position'] = texts[0]  # This seems to be the first item

# Extracting Email, Phone, Fax, and Personal Page
data['Email'] = texts[texts.index('Email') + 1]
data['Phone'] = texts[texts.index('Phone') + 1]
data['Fax'] = texts[texts.index('Fax') + 1]

# For the Personal Page, we will search for any 'a' tag after the 'Fax' data
personal_page_tag = soup.find('span', string=data['Fax']).find_next('a')
# Check for a valid 'http' link for Personal Page
data['Personal Page'] = personal_page_tag['href'] if personal_page_tag and re.match(r'^http', personal_page_tag['href']) else 'N/A'


# Extracting Research Area and Research Focus
data['Research Area'] = texts[texts.index('Research Area') + 1]
# research_focus_index = texts.index('Research Focus') + 1
# data['Research Focus'] = texts[research_focus_index]

# Extracting Selected Publications
publications_start_index = texts.index('Selected Publications') + 1
data['Selected Publications'] = texts[publications_start_index:]

data

{'Position': 'Associate Professor, Physical Sciences',
 'Email': 'abhishek(AT)iisermohali.ac.in',
 'Phone': '+91 172 2293129',
 'Fax': '+91 172 2240266',
 'Personal Page': 'N/A',
 'Research Area': 'Condensed Matter Theory : Soft and Biological Matter',
 'Selected Publications': ['Abhishek Chaudhuri, Bhaswati Bhattacharya, Kripa Gowrishankar, Satyajit Mayor and Madan Rao Spatiotemporal regulation of chemical reaction kinetics of cell surface molecules by active remodeling of cortical actin, Proc. Natl. Acad. Sci. USA 108, 14825 (2011).',
  'Abhishek Chaudhuri, Giuseppe Battaglia and Ramin Golestanian Effect of interactions on the cellular uptake of nanoparticles, Phys. Biol. 8, 046002 (2011) Selected as highlights of 2011 in Physical Biology.',
  'Jack Cohen, Abhishek Chaudhuri and Ramin Golestanian Active polymer translocation through flickering pores, Phys. Rev. Lett. 107, 238102 (2011).',
  'Abhishek Chaudhuri, Anupam Kundu, Dibyendu Roy, Abhishek Dhar, Joel L. Lebowitz and Herbert S

In [18]:

url = "https://www.iisermohali.ac.in/faculty/dms/aganguli"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Fetching text with the specified font family
texts = [span.get_text(strip=True) for span in soup.find_all('span', style=lambda value: value and "font-family: arial,helvetica,sans-serif;" in value)]

# Proceeding with the organization of the data
data = {}
data['Position'] = texts[0]  # This seems to be the first item

# Extracting Email, Phone, Fax, and Personal Page
data['Email'] = texts[texts.index('Email') + 1]
data['Phone'] = texts[texts.index('Phone') + 1]
data['Fax'] = texts[texts.index('Fax') + 1]

# For the Personal Page, we will search for any 'a' tag after the 'Fax' data
personal_page_tag = soup.find('span', string=data['Fax']).find_next('a')
# Check for a valid 'http' link for Personal Page
data['Personal Page'] = personal_page_tag['href'] if personal_page_tag and re.match(r'^http', personal_page_tag['href']) else 'N/A'


# Extracting Research Area and Research Focus
data['Research Area'] = texts[texts.index('Research Area') + 1]
# research_focus_index = texts.index('Research Focus') + 1
# data['Research Focus'] = texts[research_focus_index]

# Extracting Selected Publications
publications_start_index = texts.index('Selected Publications') + 1
data['Selected Publications'] = texts[publications_start_index:]

data

{'Position': 'Assistant Professor, Mathematical\xa0 Sciences',
 'Email': 'aganguli(AT)iisermohali.ac.in',
 'Phone': 'Fax',
 'Fax': '+91 172 2240124',
 'Personal Page': 'https://sites.google.com/site/abhikganguli/',
 'Research Area': 'Number Theory',
 'Selected Publications': ['Abhik Ganguli, Eknath GhateReductions of Galois representations via the mod p Local Langlands correspondence,J. Number Theory 147 (2015), 250-286.']}

In [20]:
from bs4 import BeautifulSoup
import requests
import re

def fetch_faculty_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Fetching text with the specified font family
    texts = [span.get_text(strip=True) for span in soup.find_all('span', style=lambda value: value and "font-family: arial,helvetica,sans-serif;" in value)]

    # Organizing the data
    data = {}
    data['Position'] = texts[0]  # This seems to be the first item

    # Extracting Email, Phone, Fax, and Personal Page
    data['Email'] = texts[texts.index('Email') + 1]
    data['Phone'] = texts[texts.index('Phone') + 1]
    data['Fax'] = texts[texts.index('Fax') + 1]

    # For the Personal Page, we will search for any 'a' tag after the 'Fax' data
    personal_page_tag = soup.find('span', string=data['Fax']).find_next('a')
    # Check for a valid 'http' link for Personal Page
    data['Personal Page'] = personal_page_tag['href'] if personal_page_tag and re.match(r'^http', personal_page_tag['href']) else 'N/A'

    # Extracting Research Area and Research Focus
    data['Research Area'] = texts[texts.index('Research Area') + 1]

    # Extracting Selected Publications
    publications_start_index = texts.index('Selected Publications') + 1
    data['Selected Publications'] = texts[publications_start_index:]

    return data

# Example usage:
url = "https://www.iisermohali.ac.in/faculty/dps/abhishek"
info = fetch_faculty_info(url)
info


{'Position': 'Associate Professor, Physical Sciences',
 'Email': 'abhishek(AT)iisermohali.ac.in',
 'Phone': '+91 172 2293129',
 'Fax': '+91 172 2240266',
 'Personal Page': 'N/A',
 'Research Area': 'Condensed Matter Theory : Soft and Biological Matter',
 'Selected Publications': ['Abhishek Chaudhuri, Bhaswati Bhattacharya, Kripa Gowrishankar, Satyajit Mayor and Madan Rao Spatiotemporal regulation of chemical reaction kinetics of cell surface molecules by active remodeling of cortical actin, Proc. Natl. Acad. Sci. USA 108, 14825 (2011).',
  'Abhishek Chaudhuri, Giuseppe Battaglia and Ramin Golestanian Effect of interactions on the cellular uptake of nanoparticles, Phys. Biol. 8, 046002 (2011) Selected as highlights of 2011 in Physical Biology.',
  'Jack Cohen, Abhishek Chaudhuri and Ramin Golestanian Active polymer translocation through flickering pores, Phys. Rev. Lett. 107, 238102 (2011).',
  'Abhishek Chaudhuri, Anupam Kundu, Dibyendu Roy, Abhishek Dhar, Joel L. Lebowitz and Herbert S

In [22]:
import requests
from bs4 import BeautifulSoup
import csv
import re


BASE_URL = 'https://www.iisermohali.ac.in'


def fetch_faculty_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Fetching text with the specified font family
    texts = [span.get_text(strip=True) for span in soup.find_all('span', style=lambda value: value and "font-family: arial,helvetica,sans-serif;" in value)]
    
    # Organizing the data
    data = {}
    if texts:
        data['Position'] = texts[0]
        data['Email'] = texts[texts.index('Email') + 1] if 'Email' in texts else 'N/A'
        data['Phone'] = texts[texts.index('Phone') + 1] if 'Phone' in texts else 'N/A'
        data['Fax'] = texts[texts.index('Fax') + 1] if 'Fax' in texts else 'N/A'

        # Personal Page
        fax_span = soup.find('span', string=data.get('Fax', ''))
        personal_page_tag = fax_span.find_next('a') if fax_span else None
        data['Personal Page'] = personal_page_tag['href'] if personal_page_tag and re.match(r'^http', personal_page_tag['href']) else 'N/A'

        # Research Area and Selected Publications
        if 'Research Area' in texts:
            data['Research Area'] = texts[texts.index('Research Area') + 1]
        if 'Selected Publications' in texts:
            publications_start_index = texts.index('Selected Publications') + 1
            data['Selected Publications'] = texts[publications_start_index:]
    return data


response = requests.get(f'{BASE_URL}/faculty/people/faculty')
soup = BeautifulSoup(response.content, 'html.parser')
faculty_rows = soup.find_all('tr')[1:]

faculty_details = []

for row in faculty_rows:
    columns = row.find_all('td')
    if len(columns) == 6:
        faculty = {}
        name_element = columns[0].find('a')
        faculty['Name'] = name_element.text if name_element else columns[0].text.strip()
        faculty['Link'] = BASE_URL + name_element['href'] if name_element else "No Link Provided"
        faculty['Department'] = columns[1].text.strip()
        faculty['Expertise'] = columns[2].text.strip()
        faculty['Office'] = columns[3].text.strip()
        faculty['Phone'] = columns[4].text.strip()
        faculty['Email'] = columns[5].text.strip() + "@iisermohali.ac.in"
        
        # Fetching details from the faculty's personal page
        additional_info = fetch_faculty_info(faculty['Link'])
        faculty.update(additional_info)
        
        faculty_details.append(faculty)


# Write to CSV
with open('faculty_details.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Name', 'Link', 'Department', 'Expertise', 'Position', 'Phone', 'Fax', 'Personal Page', 'Selected Publications']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for faculty in faculty_details:
        writer.writerow(faculty)


ProxyError: HTTPSConnectionPool(host='www.iisermohali.ac.inhttps', port=443): Max retries exceeded with url: //www.iisermohali.ac.in/finance-committee/people/dr-j-gowrishankar (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 503 Service Unavailable')))

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

BASE_URL = 'https://www.iisermohali.ac.in'

def fetch_faculty_info(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Fetching text with the specified font family
        texts = [span.get_text(strip=True) for span in soup.find_all('span', style=lambda value: value and "font-family: arial,helvetica,sans-serif;" in value)]
        
        # Organizing the data
        data = {}
        if texts:
            data['Position'] = texts[0]
            data['Email'] = texts[texts.index('Email') + 1] if 'Email' in texts else 'N/A'
            data['Phone'] = texts[texts.index('Phone') + 1] if 'Phone' in texts else 'N/A'
            data['Fax'] = texts[texts.index('Fax') + 1] if 'Fax' in texts else 'N/A'

            # Personal Page
            fax_span = soup.find('span', string=data.get('Fax', ''))
            personal_page_tag = fax_span.find_next('a') if fax_span else None
            data['Personal Page'] = personal_page_tag['href'] if personal_page_tag and re.match(r'^http', personal_page_tag['href']) else 'N/A'

            # Research Area and Selected Publications
            if 'Research Area' in texts:
                data['Research Area'] = texts[texts.index('Research Area') + 1]
            if 'Selected Publications' in texts:
                publications_start_index = texts.index('Selected Publications') + 1
                data['Selected Publications'] = texts[publications_start_index:]
    except:
        data = {
            'Position': 'N/A',
            'Email': 'N/A',
            'Phone': 'N/A',
            'Fax': 'N/A',
            'Personal Page': 'N/A',
            'Research Area': 'N/A',
            'Selected Publications': 'N/A'
        }
    return data

response = requests.get(f'{BASE_URL}/faculty/people/faculty')
soup = BeautifulSoup(response.content, 'html.parser')
faculty_rows = soup.find_all('tr')[1:]

# Create an empty DataFrame to store faculty details
df = pd.DataFrame()

for row in faculty_rows:
    columns = row.find_all('td')
    if len(columns) == 6:
        faculty = {}
        name_element = columns[0].find('a')
        faculty['Name'] = name_element.text if name_element else columns[0].text.strip()
        
        # Check if the link already contains the BASE_URL
        if name_element and BASE_URL not in name_element['href']:
            faculty['Link'] = BASE_URL + name_element['href']
        else:
            faculty['Link'] = name_element['href'] if name_element else "No Link Provided"
        
        faculty['Department'] = columns[1].text.strip()
        faculty['Expertise'] = columns[2].text.strip()
        faculty['Office'] = columns[3].text.strip()
        faculty['Phone'] = columns[4].text.strip()
        faculty['Email'] = columns[5].text.strip() + "@iisermohali.ac.in"
        
        # Fetching details from the faculty's personal page
        additional_info = fetch_faculty_info(faculty['Link'])
        faculty.update(additional_info)
        
        # Append the faculty details to the dataframe
        df = df.append(faculty, ignore_index=True)

# Save the dataframe to a CSV file
df.to_csv('faculty_details.csv', index=False, encoding='utf-8')


  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.append(faculty, ignore_index=True)
  df = df.