In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests

In [2]:
file_path = '~/Downloads/data/people.csv'
df = pd.read_csv(file_path)

group_urls = set()
for urls in df['groups_urls'].unique():
  if isinstance(urls, str):
    group_urls |= set(urls.split())

group_df = pd.DataFrame(group_urls)

In [3]:
def get_group_image_url(project_url):
    """
    scrape the media lab website to get the image corresponding to a project

    :param project_url: the url to the project on the media lab website
    :return: the url of the project image if it exists, None otherwise
    """
    try:
        response = requests.get(project_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        hero = soup.find('div', {'class': 'hero'})
        style = hero['style']
        print(style)
    except:
        return
    match = re.search('url\((".+")\)', style)
    if match:
        image_url = match.group(0)[5:-2]
        return image_url

In [4]:
group_df.rename({0: 'url'}, axis=1, inplace=True)
group_df

Unnamed: 0,url
0,https://www.media.mit.edu/groups/ml-learning/o...
1,https://www.media.mit.edu/groups/sculpting-evo...
2,https://www.media.mit.edu/groups/ethics-and-go...
3,https://www.media.mit.edu/groups/open-agricult...
4,https://www.media.mit.edu/groups/directors-fel...
...,...
94,https://www.media.mit.edu/groups/music-mind-an...
95,https://www.media.mit.edu/groups/digital-curre...
96,https://www.media.mit.edu/groups/synthetic-neu...
97,https://www.media.mit.edu/groups/center-for-mo...


In [5]:
group_df['image_url'] = group_df.apply(lambda x:get_group_image_url(x['url']), axis=1)

background-image: url("https://dam-prod2.media.mit.edu/thumb/2018/05/10/8448809049_729d412860_k.jpg.1400x1400.jpg"); background-position: 50.0% 50.0%;
background-image: url("https://dam-prod2.media.mit.edu/thumb/2016/12/12/IMG_6301.JPG.1400x1400.jpg"); background-position: 50.0% 51.8164832883537%;
background-image: url("https://dam-prod2.media.mit.edu/thumb/2017/09/13/Worm_Holes%2C_By_OmniDaily.gif.1400x1400.gif"); background-position: 54.1719342604298% 44.422107485670494%;
background-image: url("https://dam-prod2.media.mit.edu/thumb/2019/06/20/STC%20New%20Growth%202.JPG.1400x1400.jpg"); background-position: 50.0% 50.0%;
background-image: url("https://dam-prod2.media.mit.edu/thumb/2019/06/24/DF-home-grid-ML-REV-042619-1600x916.jpg.1400x1400.jpg"); background-position: 50.8526187576127% 63.8655462184874%;
background-image: linear-gradient(25deg, #de1923  0%,#f46a21  100%);
background-image: url("https://dam-prod2.media.mit.edu/thumb/2017/08/17/landsat_art_kilimanjro_lrg.jpg.1400x1400.jp

In [6]:
def get_bio(people_url):
    try:
        response = requests.get(people_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        bio = soup.findAll('div', {'class': 'main-copy'})[0].get_text()
        return bio.strip()
    except:
        return

group_df['description'] = group_df.apply(lambda x:get_bio(x['url']), axis=1)

In [7]:
def get_name(people_url):
    try:
        response = requests.get(people_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        bio = soup.find('title').get_text()
        return bio[11:-16]
    except:
        return

group_df['title'] = group_df.apply(lambda x:get_name(x['url']), axis=1)

In [8]:
group_df = group_df[group_df['title'] != ""]
group_df.to_csv('groups.csv')