In [7]:
from IPython.display import display, Markdown, Latex
from scrap_cs_faculty import *

# Render the project README inline at the top of the notebook.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("readme.md", encoding="utf-8") as f:
    readme_md = f.read()

display(Markdown(readme_md))

## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)
- [CSRankings.org](https://csrankings.org/#/fromyear/2011/toyear/2023/index?ai&vision&mlmining&nlp&inforet&act&crypt&log&us)



Use Beautiful Soup to scrape CS faculty info

In [2]:
# Key into SCHOOL_DICT selecting which school's faculty directory is processed.
SCHOOL = "UIUC-CS"
# Look up the directory page URL registered for the selected school.
URL = SCHOOL_DICT[SCHOOL]["url"]
# Bare expression: echo the resolved URL as the cell output.
URL

'https://cs.illinois.edu/about/people/department-faculty'

In [3]:
# Download the directory page, sending BROWSER_HEADERS as the request headers.
# The timeout (seconds) keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(URL, headers=BROWSER_HEADERS, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page into zero results.
page.raise_for_status()
# Parse the raw response bytes with the built-in HTML parser.
soup = BeautifulSoup(page.content, "html.parser")

## Find Elements by HTML Class Name

In [4]:
# Collect every <div class="details"> element on the page; the extraction loop
# later treats each one as a single faculty entry.
cs_persons = soup.find_all("div", class_="details")

In [5]:
# Show how many entries were matched together with the raw HTML of the first one.
len(cs_persons), cs_persons[0]

(121,
 <div class="details">
 <div class="name"><a href="/about/people/department-faculty/zaher">Tarek  Abdelzaher</a></div>
 <div class="title">Sohaib and Sara Abbasi Professor and Willett Faculty Scholar</div>
 <div class="contact">
 <div class="phone">
 <a href="tel:(217) 265-6793">
 <span class="fa-stack fa-lg">
 <i class="fa fa-circle fa-stack-2x"></i>
 <i class="fa fa-phone fa-stack-1x fa-inverse"></i>
 </span>
 </a>
 </div>
 <div class="email hide-empty" data-value="zaher@illinois.edu">
 <a href="mailto:zaher@illinois.edu">
 <span class="fa-stack fa-lg">
 <i class="fa fa-circle fa-stack-2x"></i>
 <i class="fa fa-envelope fa-stack-1x fa-inverse"></i>
 </span>
 </a>
 </div>
 </div>
 </div>)

### Extract Text From HTML Elements

You can add `.text` to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [6]:
DEBUG = False  # set to True to process only the first entry and print its extracted fields

In [7]:
# Build one row per faculty entry in `cs_persons`, with cells ordered like COLUMNS.
#
# Name is required: an entry without a name link raises, is reported by the
# except handler, and is skipped. Job title, phone, email, profile URL and photo
# are optional: when the markup for one of them is absent the cell is left as ""
# instead of discarding the whole entry.
school, dept = map_school_dept(SCHOOL)
data = []
for n, person in enumerate(cs_persons):
    # In DEBUG mode only the first entry is processed.
    if DEBUG and n > 0:
        break

    try:
        data_dict = {"school": school, "department": dept}  # defaults shared by every row

        # Name: text of the link inside the "name" div.
        name_link = person.find("div", class_="name").find("a")
        data_dict['name'] = name_link.text.strip()

        # Job title: optional "title" div.
        title_div = person.find("div", class_="title")
        if title_div is not None:
            data_dict['job_title'] = title_div.text.strip()

        # Phone, email and profile URL are all read from the entry's <a> elements.
        for link in person.find_all("a"):
            href = link.get("href")
            if not href:
                # An anchor without a target carries no information; skip it
                # rather than raising and losing the entry.
                continue
            if "tel:" in href:
                data_dict['phone'] = href.replace("tel:", "")
            elif "mailto:" in href:
                data_dict['email'] = href.replace("mailto:", "")
            else:
                # Any other link is taken as the profile URL (the last one wins).
                data_dict['url'] = uiuc_fix_url(href)

        # The photo div is looked up on the entry's parent element; its inline
        # style attribute is handed to uiuc_match_img_url.
        photo_div = person.parent.find("div", class_="photo")
        photo_style = photo_div.get("style") if photo_div is not None else None
        if photo_style:
            data_dict['img_url'] = uiuc_match_img_url(photo_style)

        if DEBUG:
            print(f"n={n}\t=============")
            print(f"name= {data_dict.get('name','')}")
            print(f"job_title= {data_dict.get('job_title','')}")
            print(f"phone= {data_dict.get('phone','')}")
            print(f"email= {data_dict.get('email','')}")
            print(f"url= {data_dict.get('url','')}")
            print(f"img_url= {data_dict.get('img_url','')}")

        # Project the dict onto COLUMNS; fields that were not found become "".
        data.append([data_dict.get(c, "") for c in COLUMNS])
    except Exception as e:
        # Report the failing entry's markup and continue with the next one.
        print(f"[Error] {str(e)}\n{person.prettify()}")

In [8]:
# Inspect the last row built by the loop; its cells are ordered like COLUMNS.
data[-1]

['Craig Zilles',
 'Professor and Severns Faculty Scholar',
 '',
 '',
 '',
 '',
 '',
 'https://cs.illinois.edu/about/people/department-faculty/zilles',
 'https://ws.engr.illinois.edu/directory/viewphoto.aspx?id=5020&s=400&type=portrait',
 '(217) 244-0553',
 'zilles@illinois.edu',
 '',
 '',
 'Computer Science',
 'Univ Illinois Urbana-Champaign']

In [9]:
# Turn the list of row lists into a table whose column labels come from COLUMNS.
df = pd.DataFrame(data, columns=COLUMNS)

In [10]:
# Report the number of rows (one per faculty entry) collected for this school.
print(f"Number of faculties at {SCHOOL}: {len(df)}")

Number of faculties at UIUC-CS: 121


In [11]:
# Bare expression: display the DataFrame as the cell output.
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Tarek Abdelzaher,Sohaib and Sara Abbasi Professor and Willett F...,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 265-6793,zaher@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
1,Sarita V. Adve,Richard T. Cheng Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 333-8461,sadve@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
2,Vikram Adve,Donald B. Gillies Professor in Computer Science,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-2016,vadve@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
3,Gul A. Agha,Research Professor and Professor Emeritus,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-3087,agha@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
4,Ram Alagappan,Assistant Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,,ramn@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,Wenzhen Yuan,Assistant Professor starting Fall 2023,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,,yuanwz@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
117,ChengXiang Zhai,Donald Biggar Willett Professor in Engineering,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-4943,czhai@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
118,Lingming Zhang,Associate Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-8921,lingming@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
119,Han Zhao,Assistant Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,,hanzhao@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign


In [12]:
# Export the faculty table to an Excel workbook named after the school key.
# The "xlsxwriter" engine requires the xlsxwriter package to be installed.
file_xlsx = f"faculty-{SCHOOL}.xlsx"
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
# Using the writer as a context manager saves and closes the workbook on exit,
# including when writing a sheet raises.
with pd.ExcelWriter(file_xlsx, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name="Faculty", index=False)
    # if not df_research.empty:
    #     df_research.to_excel(writer, sheet_name = "Research Groups", index=False)