In [2]:
# One-time setup, if unidecode is missing from the kernel's environment: %pip install unidecode==1.1.1

Collecting unidecode
  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.1.1


In [1]:
from lxml import html
import requests
import unidecode

In [2]:
# Profile identifiers, one per line. Each one is appended to the
# research.chalmers.se person URL by the scraping loop, in this order.
people_cid = """
    iola
    maspan
    joma
    hansj
    dagb
    alme
    gorang
    malemyr
    mujakob
    simaria
    borgue
    acatic
    inigoa
    varo
    hanem
    mstrom
    szandra
""".split()

In [3]:
class Publication:
    """Plain record for one publication entry listed on a profile page."""

    def __init__(self, title, year, publication_cit, publication_type, authors):
        """Store each argument, unmodified, on the attribute of the same name."""
        self.title, self.year = title, year
        self.publication_cit, self.publication_type = publication_cit, publication_type
        self.authors = authors

In [4]:
class Person:
    """Plain record for one scraped profile and its list of publications."""

    def __init__(self, cid, name, affiliation, person_bio, publications):
        """Store each argument, unmodified, on the attribute of the same name."""
        self.cid, self.name = cid, name
        self.affiliation, self.person_bio = affiliation, person_bio
        self.publications = publications

In [9]:
# Scrape every profile listed in `people_cid` from research.chalmers.se.
# For each profile that exists this cell:
#   1. builds a Person (with its Publication entries) and appends it to `people`;
#   2. writes a markdown file with front matter to `_people/<slug>.md`;
#   3. downloads the profile picture to `assets/img/people/<slug>.jpg`.
# Both output directories must already exist relative to the working directory.

# Seconds to wait on each HTTP request. `requests` has no default timeout,
# so without one a stalled server would hang the kernel indefinitely.
REQUEST_TIMEOUT = 30

people = []
for cid in people_cid:
    page = requests.get('https://research.chalmers.se/en/person/'+cid, timeout=REQUEST_TIMEOUT)
    if page.status_code == 404:
        print(f'{cid} does not have a profile in Research Chalmers.')
        continue
    # Any other HTTP error (5xx, 403, ...) would otherwise only show up as an
    # opaque IndexError from the XPath lookups below, so fail loudly here.
    page.raise_for_status()

    print(f'Getting info for {cid}')
    # Parse only once we know the response is a real profile page.
    tree = html.fromstring(page.content)

    name = tree.xpath('//h1[@id="person-name"]/span/text()')[0].strip()
    affiliation = tree.xpath('//div[@itemprop="affiliation"]/strong/text()')[0].strip().replace(' at','')

    # The biography is optional: fall back to an empty string when the page
    # has no description node, without masking unrelated errors.
    bio_nodes = tree.xpath('//span[@itemprop="description"]/text()')
    person_bio = bio_nodes[0].strip() if bio_nodes else ''

    publications = []
    for item in tree.xpath('//div[@class="publication-item"]'):
        title = item.xpath('h4[@class="publication-title list-group-item-heading"]/a/text()')[0]
        year = item.xpath('div[@class="publication-year pull-right"]/strong/text()')[0]
        publication_cit = item.xpath('div[@id="publication-cit"]/text()')[0].strip()
        publication_type = item.xpath('div[@class="publication-type"]/text()')[0].strip()
        authors = item.xpath('div/strong/span/text()')
        publications.append(Publication(title, year, publication_cit, publication_type, authors))

    person = Person(cid, name, affiliation, person_bio, publications)
    people.append(person)

    # ASCII, lower-case, dash-separated slug used for both output file names.
    file_name = unidecode.unidecode(person.name).lower().replace(' ','-')

    with open('_people/'+file_name+'.md', 'w', encoding="utf-8") as profile_file:
        # Front matter block, a blank line, then the biography as the body.
        profile_file.write(
            '---\n'
            f'object-id: {file_name}\n'
            f'cid: {cid}\n'
            f'name: {name}\n'
            f'role: {affiliation}\n'
            f'picture: {file_name}.jpg\n'
            '---\n\n'
            f'{person_bio}\n'
        )

    # The picture is best-effort: the markdown above is already written, so a
    # missing or failing image must not abort the remaining profiles.
    picture_urls = tree.xpath('//div[@class="person-image pull-right"]/span/img/@src')
    if not picture_urls:
        print(f'{cid} has no profile picture; skipping image download.')
        continue
    picture = requests.get(picture_urls[0], allow_redirects=True, timeout=REQUEST_TIMEOUT)
    if not picture.ok:
        # Do not save an HTTP error body to disk under a .jpg name.
        print(f'Could not download picture for {cid} (HTTP {picture.status_code}).')
        continue
    with open('assets/img/people/'+file_name+'.jpg', 'wb') as picture_file:
        picture_file.write(picture.content)

Getting info for iola
Getting info for maspan
Getting info for joma
Getting info for hansj
Getting info for dagb
Getting info for alme
Getting info for gorang
Getting info for malemyr
Getting info for mujakob
Getting info for simaria
Getting info for borgue
Getting info for acatic
Getting info for inigoa
Getting info for varo
Getting info for hanem
Getting info for mstrom
Getting info for szandra
