In [1]:
from IPython.display import display, Markdown, Latex
from scrap_cs_faculty import *

with open("readme.md") as f:
    readme_md = f.read()

display(Markdown(readme_md)) 

## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)
- [CSRankings.org](https://csrankings.org/#/fromyear/2011/toyear/2023/index?ai&vision&mlmining&nlp&inforet&act&crypt&log&us)


## Top 6 schools
Stanford, UCB, MIT, CMU, Cornell, UIUC

## add more schools

- https://www.cs.princeton.edu/people/faculty?type=main
- https://www.cs.washington.edu/people/faculty
- https://directory.seas.upenn.edu/computer-and-information-science/
- https://www.cms.caltech.edu/cms-people/faculty
- https://www.cs.columbia.edu/people/faculty/
- https://cse.engin.umich.edu/people/faculty/
- https://samueli.ucla.edu/search-faculty/#cs
- https://cse.ucsd.edu/people/faculty-profiles
- https://seas.harvard.edu/computer-science/people?role[46]=46
- https://www.cc.gatech.edu/people/faculty?page=24  (page=[0,25))


In [2]:
SCHOOL = "UWash-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  #  "https://www.eecs.mit.edu/role/faculty-aid/"
print(URL)

https://www.cs.washington.edu/people/faculty


In [3]:
base_url = "/".join(URL.split("/")[:3])
base_url

'https://www.cs.washington.edu'

In [4]:
page = requests.get(URL, headers=BROWSER_HEADERS)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

In [6]:
results = soup.find("div", class_="view-content")
print(results.prettify())

<div class="view-content">
 <div>
  <div class="row directory-row">
   <div class="col-sm-2 directory-photo-container">
    <a href="https://homes.cs.washington.edu/~althoff/">
     <img class="faculty-photo foo" onerror='this.src = "//s3-us-west-2.amazonaws.com/www-cse-public/portraits/_default_sm.jpg"; ' src="//s3-us-west-2.amazonaws.com/www-cse-public/images/portraits/althoff_sm.jpg"/>
    </a>
   </div>
   <div class="col-sm-10">
    <div class="directory-name">
     <a href="https://homes.cs.washington.edu/~althoff/">
      Tim Althoff
     </a>
    </div>
    <div class="directory-office">
     CSE2 313
    </div>
    <div class="directory-email">
     althoff
     <img src="/images/bw_ampersand.png"/>
     cs.washington.edu
    </div>
    <div class="directory-research-interests">
     <a href="/research/data_management">
      Data Science, Data Management &amp; Visualization
     </a>
     ,
     <a href="/research/hci">
      Human-Computer Interaction &amp; Accessible Techno

## Find Elements by HTML Class Name

In [7]:
cs_persons = soup.find_all("div", class_="row directory-row")

In [8]:
len(cs_persons) , cs_persons[0]

(99,
 <div class="row directory-row">
 <div class="col-sm-2 directory-photo-container">
 <a href="https://homes.cs.washington.edu/~althoff/"><img class="faculty-photo foo" onerror='this.src = "//s3-us-west-2.amazonaws.com/www-cse-public/portraits/_default_sm.jpg"; ' src="//s3-us-west-2.amazonaws.com/www-cse-public/images/portraits/althoff_sm.jpg"/></a>
 </div>
 <div class="col-sm-10">
 <div class="directory-name"><a href="https://homes.cs.washington.edu/~althoff/">Tim Althoff</a></div>
 <div class="directory-office">CSE2 313</div>
 <div class="directory-email">althoff<img src="/images/bw_ampersand.png"/>cs.washington.edu</div>
 <div class="directory-research-interests"><a href="/research/data_management">Data Science, Data Management &amp; Visualization</a>, <a href="/research/hci">Human-Computer Interaction &amp; Accessible Technology</a>, <a href="/research/ml">Machine Learning</a>, <a href="/research/nlp">Natural Language Processing</a></div>
 <div class="directory-research-blurb"><

### Extract Text From HTML Elements

You can add .text to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [9]:
DEBUG =  False # True # 

if DEBUG:
    N_START, N_STOP = 0, 1
else:
    N_START, N_STOP = 0, -1

school, dept = map_school_dept(SCHOOL)
data = []
all_research_dict = {}
for n, person in enumerate(cs_persons[N_START:N_STOP]):
    try:
        data_dict = {"school": school, "department": dept} # default
#         if DEBUG and n > 0: break  # debug
        
        # parse profile and image URL
        url_img = person.find("div", class_="col-sm-2 directory-photo-container")
        url = url_img.find("a")["href"]
        img_url = url_img.find("img")["src"]
        data_dict['url'] = url
        data_dict['img_url'] = f"https:{img_url}"
        
        person_detail = person.find("div", class_="col-sm-10")
        
        # parse name
        name = person_detail.find("div", class_="directory-name")
        if name:
            data_dict['name'] = name.text.strip()
        
        office = person_detail.find("div", class_="directory-office")
        if office:
            data_dict['office_address'] = office.text.strip()
            
        email = person_detail.find("div", class_="directory-email")
        if email:
            data_dict['email'] = email.text.strip().replace("cs.washington.edu", "@cs.washington.edu")

        blurb = person_detail.find("div", class_="directory-research-blurb")
        if blurb:
            data_dict['research_area'] = blurb.text.strip()
                                
        print(f"========\t n={n}\t=============")
        if DEBUG:
            print(f"name= {data_dict.get('name','')}")
            print(f"job_title= {data_dict.get('job_title','')}")
            print(f"phone= {data_dict.get('phone','')}")
            print(f"office= {data_dict.get('office_address','')}")
            print(f"email= {data_dict.get('email','')}")
            print(f"url= {data_dict.get('url','')}")
            print(f"img_url= {data_dict.get('img_url','')}")
            print(f"phd_univ= {data_dict.get('phd_univ','')}")
            print(f"phd_year= {data_dict.get('phd_year','')}")
            print(f"research_area= {data_dict.get('research_area','')}")
            print(f"department= {data_dict.get('department','')}")
            print(f"school= {data_dict.get('school','')}")
            print(f"url_profile= {data_dict.get('url_profile','')}")
            
        if data_dict:
            row_data = []
            for c in COLUMNS_v2:
                cell = data_dict.get(c,"")
                row_data.append(cell)
            data.append(row_data)
    except Exception as e:
        print(f"[Error] {str(e)}")
#         print(f"{person.prettify()}")



In [10]:
len(data) , data[-1]

(98,
 ['Luke Zettlemoyer',
  '',
  '',
  '',
  '',
  '',
  '',
  'http://www.cs.washington.edu/people/faculty/lsz/',
  'https://s3-us-west-2.amazonaws.com/www-cse-public/images/portraits/lsz_sm.jpg',
  '',
  'lsz@cs.washington.edu',
  '',
  'CSE 534',
  'Computer Science',
  'Univ Washington',
  '',
  ''])

In [11]:
df = pd.DataFrame(data, columns=COLUMNS_v2)

In [12]:
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at UWash-CS: 98


In [13]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school,url_profile,url_author
0,Tim Althoff,,,,"Data Science, Data Mining, Social Network Anal...",,,https://homes.cs.washington.edu/~althoff/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,althoff@cs.washington.edu,,CSE2 313,Computer Science,Univ Washington,,
1,Richard Anderson,,,,"Computing for the developing world, health inf...",,,https://www.cs.washington.edu/people/faculty/a...,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,anderson@cs.washington.edu,,CSE2 344,Computer Science,Univ Washington,,
2,Ruth E. Anderson,,,,"Computer science education, educational techno...",,,http://homes.cs.washington.edu/~rea,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,rea@cs.washington.edu,,CSE 558,Computer Science,Univ Washington,,
3,Tom Anderson,,,,"Distributed systems, networks, operating syste...",,,http://www.cs.washington.edu/people/faculty/tom/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,tom@cs.washington.edu,,CSE 646,Computer Science,Univ Washington,,
4,Magdalena Balazinska,,,,"Databases, cloud computing, big-data analytics...",,,http://www.cs.washington.edu/people/faculty/ma...,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,magda@cs.washington.edu,,CSE 584,Computer Science,Univ Washington,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Xi Wang,,,,"Computer systems, security, and programming la...",,,/people/faculty/xi,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,xi@cs.washington.edu,,CSE 580,Computer Science,Univ Washington,,
94,Robbie Weber,,,,,,,http://weberrobbie.com/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,rtweber2@cs.washington.edu,,,Computer Science,Univ Washington,,
95,James R Wilcox,,,,"Computer science education, programming langua...",,,http://homes.cs.washington.edu/~jrw12/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,jrw12@cs.washington.edu,,,Computer Science,Univ Washington,,
96,Brett Wortzman,,,,Computer science education; Computer science t...,,,https://homes.cs.washington.edu/~brettwo/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,brettwo@cs.washington.edu,,CSE 542,Computer Science,Univ Washington,,


In [14]:
df["email"].to_list()

['althoff@cs.washington.edu',
 'anderson@cs.washington.edu',
 'rea@cs.washington.edu',
 'tom@cs.washington.edu',
 'magda@cs.washington.edu',
 'leibatt@cs.washington.edu',
 'beame@cs.washington.edu',
 'gilbo@cs.washington.edu',
 'bodik@cs.washington.edu',
 'bboots@cs.washington.edu',
 'bricker@cs.washington.edu',
 'mcakmak@cs.washington.edu',
 'luisceze@cs.washington.edu',
 'yejin@cs.washington.edu',
 'coladan@cs.washington.edu',
 'curless@cs.washington.edu',
 'ssdu@cs.washington.edu',
 'mernst@cs.washington.edu',
 'ali@cs.washington.edu',
 'jfogarty@cs.washington.edu',
 'fox@cs.washington.edu',
 'jonf@cs.washington.edu',
 'elba@cs.washington.edu',
 'gshyam@cs.washington.edu',
 'mgolub@cs.washington.edu',
 'djg@cs.washington.edu',
 'abhgupta@cs.washington.edu',
 'hannaneh@cs.washington.edu',
 'jheer@cs.washington.edu',
 'kheimerl@cs.washington.edu',
 'jhsia@cs.washington.edu',
 'scottsi@cs.washington.edu',
 'vsiyer@cs.washington.edu',
 'jamieson@cs.washington.edu',
 'nj@cs.washington.ed

In [15]:
# import xlsxwriter
file_xlsx = f"faculty-{SCHOOL}.xlsx"
print(f"{file_xlsx}")
writer = pd.ExcelWriter(file_xlsx, engine='xlsxwriter')
df.to_excel(writer, sheet_name="Faculty", index=False)
# if not df_research.empty:
#     df_research.to_excel(writer, sheet_name = "Research Groups", index=False)
writer.close()

faculty-UWash-CS.xlsx
