In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: scrape the faculty listing page and write one CSV per department.
URL = 'https://lhr.nu.edu.pk/faculty/'

# Seconds to wait for the server; without a timeout requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Heading text of each department section -> short code used as CSV file name.
department_mapping = {
    'FAST School of Computing Faculty': 'fsc',
    'Department of Electrical Engineering Faculty': 'ee',
    'Department of Civil Engineering Faculty': 'cv',
    'Department of Science & Humanities Faculty': 'ss',
    'FAST School of Management Faculty': 'fsm',
}

# CSS class strings used to locate the pieces of a faculty card.
FACULTY_CARD_CLASS = 'col-lg-3 col-md-4 col-sm-6 col-12'
DESIGNATION_CLASS = 'small text-center font-italic'
EMAIL_CLASS = 'mb-0 text-center'
PHD_SUPERVISOR_LABEL = "HEC Approved PhD Supervisor"


def text_or_na(tag):
    """Return the stripped text of *tag*, or "N/A" when the tag was not found."""
    return tag.text.strip() if tag is not None else "N/A"


def faculty_id_from_href(href):
    """Return the last path segment of a profile link, or "N/A" if there is none.

    A trailing slash is ignored so that ".../facultyProfile/1238/" still
    yields "1238" instead of an empty string.
    """
    segment = href.rstrip('/').split('/')[-1]
    return segment or "N/A"


def parse_faculty_card(faculty, department_name):
    """Build the CSV row (as a dict) for one faculty card element."""
    # Look each element up once instead of repeating the same find() call.
    designation = text_or_na(faculty.find('p', class_=DESIGNATION_CLASS))
    link_tag = faculty.find('a', class_='faculty-link')
    href = link_tag.get('href') if link_tag is not None else None
    image_tag = faculty.find('img')

    return {
        'ID': faculty_id_from_href(href) if href else "N/A",
        'Name': text_or_na(faculty.find('h5')),
        'Designation': designation,
        'Email': text_or_na(faculty.find('p', class_=EMAIL_CLASS)),
        'Department': department_name,
        # .get() avoids a KeyError for an <img> without a src attribute.
        'ImageURL': image_tag.get('src', "N/A") if image_tag is not None else "N/A",
        # The supervisor label appears inside the designation paragraph.
        'HEC Approved PhD Supervisor': PHD_SUPERVISOR_LABEL in designation,
    }


# Top-level boundary of the notebook cell: any failure is reported rather than
# raising a traceback, matching the cell's original behaviour.
try:
    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        print("Page fetched successfully.")
        soup = BeautifulSoup(response.content, 'html.parser')

        department_sections = soup.find_all('div', class_='container')
        print(f"Found {len(department_sections)} department sections.")

        for department_section in department_sections:
            department_name_tag = department_section.find('h1', class_='mb-2 mt-3')
            if department_name_tag is None:
                # Container without a department heading: nothing to scrape.
                continue

            department_name = department_name_tag.text.strip()
            print(f"Processing department: {department_name}")

            department_code = department_mapping.get(department_name)
            if department_code is None:
                print(f"Skipping department: {department_name}")
                continue

            faculty_list = department_section.find_all('div', class_=FACULTY_CARD_CLASS)
            print(f"Found {len(faculty_list)} faculty members in {department_name}.")

            faculty_data = [
                parse_faculty_card(faculty, department_name)
                for faculty in faculty_list
            ]

            if faculty_data:
                df = pd.DataFrame(faculty_data)
                csv_filename = f"{department_code}.csv"
                df.to_csv(csv_filename, index=False)
                print(f"Data saved to {csv_filename}")
            else:
                print(f"No faculty data found for {department_name}.")

    else:
        print(f"Failed to fetch the page. Status code: {response.status_code}")

except Exception as e:
    print(f"An error occurred: {e}")


Page fetched successfully.
Found 11 department sections.
Processing department: FAST School of Computing Faculty
Found 89 faculty members in FAST School of Computing Faculty.
Data saved to fsc.csv
Processing department: Department of Electrical Engineering Faculty
Found 24 faculty members in Department of Electrical Engineering Faculty.
Data saved to ee.csv
Processing department: Department of Civil Engineering Faculty
Found 22 faculty members in Department of Civil Engineering Faculty.
Data saved to cv.csv
Processing department: FAST School of Management Faculty
Found 35 faculty members in FAST School of Management Faculty.
Data saved to fsm.csv
Processing department: Department of Science & Humanities Faculty
Found 30 faculty members in Department of Science & Humanities Faculty.
Data saved to ss.csv


Step 2:
There is some more information present on the profile page of each faculty member e.g. education, phone,
publication, etc. You need to collect the phone number (extension only) and the latest education.
Load all saved files from step#1 to data frames named fsc, ee, cv, fsm, and ss.
Now select only the ID column from each data frame and save it to a list. Create an iterator that takes each
ID from the list and constructs a faculty page link as follows:
http://lhr.nu.edu.pk/fsc/facultyProfile/ID
Collect the data from the faculty profile page and it must look like the following.
Column: ID      | Extension | Highest Education
Type:   Integer | Integer   | String
Transform it into a data frame and save it as "fsm_2.csv". Do the same for each department and save the
files. At the end of step#2, you must have the following .csv files.
fsc.csv, ee.csv, cv.csv, fsm.csv, ss.csv
fsc_2.csv, ee_2.csv, cv_2.csv, fsm_2.csv, ss_2.csv

In [15]:

# Step 2: visit every faculty profile page and collect the phone extension and
# the first-listed education entry, writing one "<dept>_2.csv" per department.
csv_files = ['fsc.csv', 'ee.csv', 'cv.csv', 'ss.csv', 'fsm.csv']

output_filenames = {
    'fsc': 'fsc_2.csv',
    'ee': 'ee_2.csv',
    'cv': 'cv_2.csv',
    'ss': 'ss_2.csv',
    'fsm': 'fsm_2.csv'
}

# Seconds to wait for a profile page; without a timeout one stalled request
# would hang the whole crawl.
PROFILE_REQUEST_TIMEOUT = 30

EDUCATION_SECTION_CLASS = 'col-lg-8 col-md-6 col-sm-12 text-justify'
PROFILE_COLUMNS = ['ID', 'Extension', 'Highest Education']


def extract_extension(soup):
    """Return the text after the first ':' of the first <span class="small">.

    Returns "N/A" when the span is missing, contains no ':' or has nothing
    after it, instead of raising IndexError.
    """
    span = soup.find('span', class_='small')
    if span is None:
        return "N/A"
    parts = span.text.strip().split(":")
    if len(parts) < 2:
        return "N/A"
    # Strip so the value is not stored with the space that follows the colon.
    return parts[1].strip() or "N/A"


def extract_highest_education(soup):
    """Return the text of the first <li> in the education section, or "N/A".

    Every lookup is checked before it is dereferenced, so a profile without
    the section, without a <ul>, or with an empty list yields "N/A".
    """
    section = soup.find('div', class_=EDUCATION_SECTION_CLASS)
    if section is None:
        return "N/A"
    education_list = section.find('ul')
    if education_list is None:
        return "N/A"
    first_entry = education_list.find('li')
    return first_entry.text.strip() if first_entry is not None else "N/A"


for csv_file in csv_files:
    department_code = csv_file.split('.')[0]

    # Read IDs as strings: pandas parses "N/A" as NaN, which would otherwise
    # turn the whole column into floats and produce URLs ending in "1238.0".
    df = pd.read_csv(csv_file, dtype={'ID': str})
    id_list = df['ID'].dropna().tolist()  # rows without an ID have no profile URL
    faculty_profile_data = []

    for faculty_id in id_list:
        faculty_url = f'https://lhr.nu.edu.pk/{department_code}/facultyProfile/{faculty_id}'
        print(f"Fetching data from: {faculty_url}")

        try:
            response = requests.get(faculty_url, timeout=PROFILE_REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # One unreachable profile must not abort the remaining ones.
            print(f"Failed to retrieve data for ID {faculty_id}: {e}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for ID {faculty_id}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        faculty_profile_data.append({
            'ID': faculty_id,
            'Extension': extract_extension(soup),
            'Highest Education': extract_highest_education(soup)
        })

    # Explicit columns keep the header row even when no profile was retrieved,
    # so the file can still be loaded and merged on "ID" afterwards.
    df_profile = pd.DataFrame(faculty_profile_data, columns=PROFILE_COLUMNS)
    output_filename = output_filenames[department_code]
    df_profile.to_csv(output_filename, index=False)
    print(f"Data saved to {output_filename}")


Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/1238
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/4027
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/4391
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/6113
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/4329
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/6968
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/9113
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/6174
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/9540
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/5181
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/4261
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/6048
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/5116
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/6811
Fetching data from: https://lhr.nu.edu.pk/fsc/facultyProfile/4236
Fetching d

Now comes the data integration part. Load all files into data frames. Merge fsc and fsc_2 data frames
using left join on ID column. Do the same for ee, cv, fsm and ss.
Now you have ID, Name, Designation, HEC Approved PHD Supervisor, Email, Department, ImageURL,
Extension and Education in a single data frame.
Concatenate all merged data frames to create a single data frame having the information of all faculty
members from all departments. Save this data frame as a .csv file named fast_lhr_faculty.csv.

In [16]:

# Step 3: join each department's listing file with its profile file and stack
# the results into a single CSV.
# The list is ordered in pairs: a listing file followed by its "_2" companion.
csv_files = ['fsc.csv', 'fsc_2.csv', 'ee.csv', 'ee_2.csv', 'cv.csv', 'cv_2.csv', 'ss.csv', 'ss_2.csv','fsm.csv', 'fsm_2.csv']
merged_dfs = []

# Even positions hold the listing files, odd positions the profile files.
for listing_path, profile_path in zip(csv_files[0::2], csv_files[1::2]):
    listing_frame = pd.read_csv(listing_path)
    profile_frame = pd.read_csv(profile_path)
    # Left join keeps every listed faculty member even without profile data.
    merged_dfs.append(
        pd.merge(listing_frame, profile_frame, how='left', on='ID')
    )

final_df = pd.concat(merged_dfs, ignore_index=True)
final_df.to_csv('fast_lhr_faculty.csv', index=False)

print("All data has been saved to fast_lhr_faculty.csv")


All data has been saved to fast_lhr_faculty.csv
