In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from os.path import basename
import regex as re
import numpy as np
import math
import time
import data_io
import utils as u
import datetime as dt
import openpyxl

## Get Princeton links

In [None]:
def get_princeton_links(PAGE_CONTROL=None):
    """Collect title, link and date for articles on the Princeton news listing.

    Requests ``https://www.princeton.edu/news?category=971&search=&page=<n>``
    for ``n`` = 0 .. max_pages - 1 and records one row per ``news-run item``
    teaser found on each page. The date is the ``/YYYY/MM/DD`` segment of the
    teaser's link.

    Args:
        PAGE_CONTROL: Number of listing pages to request; 400 when falsy.

    Returns:
        The result of ``u.process_article_link_dataset`` when every page was
        fetched. If ``u.process_request`` returns an ``int`` (presumably an
        error/status code -- TODO confirm in ``utils``), the rows gathered so
        far are returned immediately, without that post-processing.
    """
    article_df = u.ARTICLE_LINK_BASE.copy()
    max_pages = PAGE_CONTROL if PAGE_CONTROL else 400
    # Raw string: a backslash-d inside a plain literal is an invalid escape.
    date_pattern = re.compile(r"/\d{4}/\d{2}/\d{2}")

    indexer = 0
    for page in range(max_pages):
        page_url = f"https://www.princeton.edu/news?category=971&search=&page={page}"
        soup = u.process_request(page_url)
        if isinstance(soup, int):
            return article_df
        for teaser in soup.find_all('div', class_='news-run item'):
            # Look the anchor up once instead of three times per teaser.
            anchor = teaser.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            href = anchor['href']
            date_match = date_pattern.search(href)
            if date_match is None:
                # A link without a /YYYY/MM/DD segment used to abort the whole
                # scrape with AttributeError; skip that teaser instead.
                continue
            article_df.loc[indexer, 'article_title'] = anchor.get_text().strip()
            article_df.loc[indexer, 'article_link'] = 'https://www.princeton.edu' + href
            article_df.loc[indexer, 'article_date'] = date_match.group()
            indexer += 1
    article_df = u.process_article_link_dataset(article_df, uni = 'princeton_')

    return article_df

In [None]:
# Scrape the Princeton listing; PAGE_CONTROL=9 limits the crawl to 9 listing pages.
princeton_links = get_princeton_links(PAGE_CONTROL=9)

In [None]:
# Keep only rows whose article_date contains '2019', renumber them from 0,
# and display the result.
princeton_2019 = (
    princeton_links
    .loc[princeton_links['article_date'].str.contains('2019')]
    .reset_index(drop=True)
)
princeton_2019

In [None]:
# Fill the "uni" and "year" placeholders of the file-name template, then write
# the 2019 subset as CSV with the utf-8-sig encoding (UTF-8 with a BOM).
princeton_2019.to_csv(
    data_io.YEAR_LINK_FNAME.replace("uni", 'princeton').replace('year', '2019'),
    index=False,
    encoding='utf-8-sig',
)

## Get MIT links

In [None]:
# Page limits used below for the three MIT listings: 70, 42, 8
def get_mit_links(PAGE_CONTROLS = [None, None, None]):
    """Collect article titles and links from three MIT News school listings.

    The engineering, science and Schwarzman College of Computing listings are
    scraped in that order. A link that was already recorded is not added
    again, so an article listed under several schools appears once.

    Args:
        PAGE_CONTROLS: One entry per listing, in the order given above. A
            truthy entry is used as that listing's page limit; a falsy entry
            makes the function derive the limit from the
            "Displaying a - b of N news" header on the listing's first page.
            The default list is only read, never mutated.

    Returns:
        The result of ``u.process_article_link_dataset`` when every page was
        fetched. If ``u.process_request`` returns an ``int`` (presumably an
        error/status code -- TODO confirm in ``utils``), the rows gathered so
        far are returned immediately, without that post-processing.
    """
    # Note: MIT's main news page is awkward to scrape, so the three
    # science-oriented school listings are scraped instead.
    mit_urls = ["https://news.mit.edu/school/engineering?type=1",
                "https://news.mit.edu/school/science?type=1",
                "https://news.mit.edu/school/mit-schwarzman-college-computing?type=1"]
    article_df = u.ARTICLE_LINK_BASE.copy()
    # Links recorded so far; a set lookup replaces rebuilding the column as a
    # list for every article.
    seen_links = set(article_df['article_link'])

    indexer = 0
    for m, school_url in enumerate(mit_urls):
        if PAGE_CONTROLS[m]:
            max_pages = PAGE_CONTROLS[m]
        else:
            r = requests.get(school_url)
            soup = BeautifulSoup(r.content, features = 'html')
            header = soup.find('div',class_='page-term--views--header')
            header_text = header.find("header").get_text().strip()
            counts = re.search(r'Displaying \d [-] (\d{1,2}) of (\d{1,5}) news', header_text)
            per_page = int(counts.group(1))
            total_arts = int(counts.group(2))
            max_pages = int(math.ceil(total_arts / per_page))
        # The range end is max_pages + 1, so page indices 0..max_pages are
        # all requested (kept from the original; duplicates are filtered below).
        for page in range(0, max_pages + 1):
            req = f"{school_url}&page={page}"
            # Fixed: the helper lives in ``utils``; the bare name was undefined.
            soup = u.process_request(req)
            if isinstance(soup, int):
                return article_df
            for art in soup.find_all('article'):
                title = art.find('h3').get_text().strip()
                link = 'https://news.mit.edu' + art['about']
                if link not in seen_links:
                    seen_links.add(link)
                    article_df.loc[indexer, 'article_title'] = title
                    article_df.loc[indexer, 'article_link'] = link
                    indexer += 1
    # Fixed: the original passed the undefined name ``article_df_orig``.
    article_df = u.process_article_link_dataset(article_df, uni = 'mit_')
    return article_df

In [None]:
# Page limits per listing, in scrape order: engineering, science, computing.
mit_links = get_mit_links(PAGE_CONTROLS = [70,42,8])

In [None]:
# Display the frame returned by get_mit_links for a visual check.
mit_links

In [None]:
# Write the MIT links into the data_io.DATA location as CSV, without the
# index column, using the utf-8-sig encoding (UTF-8 with a BOM).
mit_links.to_csv(
    f'{data_io.DATA}mit_links.csv',
    encoding='utf-8-sig',
    index=False,
)

## Get Yale links

In [None]:


def get_yale_links(PAGE_CONTROL = None):
    """Collect title, link and date for articles in a Yale News search listing.

    Walks the paginated search results (topic filter ``field_topic:36``,
    newest first) starting at ``page=0``. On each page the i-th title heading
    is paired with the i-th ``div.date`` element.

    Args:
        PAGE_CONTROL: Number of listing pages to request. When falsy, the
            count is derived from the pager's "last" link on the first page.

    Returns:
        The result of ``u.process_article_link_dataset`` when every page was
        fetched. If ``u.process_request`` returns an ``int`` (presumably an
        error/status code -- TODO confirm in ``utils``), the rows gathered so
        far are returned immediately, without that post-processing.
    """
    listing_url = 'https://news.yale.edu/search?sort=created&order=desc&f%5B0%5D=field_topic%3A36'
    article_df = u.ARTICLE_LINK_BASE.copy()

    if PAGE_CONTROL:
        max_pages = PAGE_CONTROL
    else:
        soup = u.process_request(listing_url)
        if isinstance(soup, int):
            return article_df
        last_href = soup.find("li", class_='pager-last last').find('a')['href']
        last_page_index = int(re.search(r'page=(\d{1,4})', last_href).group(1))
        # Pages are requested from page=0, so the "last" link holds the final
        # zero-based index; the page count is one more. Using the index itself
        # as the range bound skipped the final page.
        max_pages = last_page_index + 1

    indexer = 0
    for page in range(max_pages):
        soup = u.process_request(f"{listing_url}&page={page}")
        # Same failure handling as the other scrapers in this notebook; the
        # original went on to call find_all on the int and crashed.
        if isinstance(soup, int):
            return article_df
        dates = soup.find_all("div", class_='date')
        links = soup.find_all('h3',class_='views-field views-field-title')
        for k in range(len(links)):
            anchor = links[k].find('a')
            article_df.loc[indexer, 'article_title'] = anchor.get_text().strip()
            article_df.loc[indexer, 'article_link'] = "https://news.yale.edu" + anchor['href']
            article_df.loc[indexer, 'article_date'] = dates[k].find('p').get_text().strip()
            indexer += 1
    article_df = u.process_article_link_dataset(article_df, uni = 'yale_')
    return article_df

In [None]:
# Scrape the Yale listing; PAGE_CONTROL=45 limits the crawl to 45 listing pages.
yale_links = get_yale_links(PAGE_CONTROL=45)

In [None]:
# Display the frame returned by get_yale_links for a visual check.
yale_links

In [None]:

# Write the Yale links into the data_io.DATA location as CSV, without the
# index column, using the utf-8-sig encoding (UTF-8 with a BOM).
yale_links.to_csv(
    f'{data_io.DATA}yale_links.csv',
    encoding='utf-8-sig',
    index=False,
)

## Get Columbia links

In [None]:
def get_columbia_links(PAGE_CONTROL = None):
    """Collect title, link and date for Columbia science news posts.

    Requests ``https://science.fas.columbia.edu/news/page/<n>/`` for
    ``n`` = 1 .. max_pages - 1. On each page the i-th ``h2.post-title`` is
    paired with the i-th ``span.mdate``; a link that was already recorded is
    not added again.

    Args:
        PAGE_CONTROL: Exclusive upper bound on the page number (pages
            1 .. PAGE_CONTROL - 1 are requested); 101 when falsy.

    Returns:
        The result of ``u.process_article_link_dataset`` when every page was
        fetched. If ``u.process_request`` returns an ``int`` (presumably an
        error/status code -- TODO confirm in ``utils``), the rows gathered so
        far are returned immediately, without that post-processing.
    """
    base_link = 'https://science.fas.columbia.edu/news/'
    max_pages = PAGE_CONTROL if PAGE_CONTROL else 101

    article_df = u.ARTICLE_LINK_BASE.copy()
    # Links recorded so far; a set lookup replaces rebuilding the column as a
    # list for every post.
    seen_links = set(article_df['article_link'])

    indexer = 0
    for page in range(1, max_pages):
        soup = u.process_request(f'{base_link}page/{page}/')
        if isinstance(soup, int):
            return article_df
        headings = soup.find_all('h2',class_='post-title')
        dates = soup.find_all("span", class_='mdate')
        # A separate name for the position on the page: the original reused
        # ``i`` here and shadowed the page counter.
        for pos in range(len(headings)):
            anchor = headings[pos].find('a')
            this_link = anchor['href']
            if this_link in seen_links:
                continue
            seen_links.add(this_link)
            article_df.loc[indexer, 'article_title'] = anchor.get_text().strip()
            article_df.loc[indexer, 'article_link'] = this_link
            article_df.loc[indexer, 'article_date'] = dates[pos].get_text()
            indexer += 1
    article_df = u.process_article_link_dataset(article_df, uni = 'columbia_')
    return article_df

In [None]:
# PAGE_CONTROL is an exclusive upper bound here, so 41 requests pages 1-40.
columbia_links = get_columbia_links(PAGE_CONTROL=41)

In [None]:
# Write the Columbia links into the data_io.DATA location as CSV, without the
# index column, using the utf-8-sig encoding (UTF-8 with a BOM).
columbia_links.to_csv(
    f'{data_io.DATA}columbia_links.csv',
    encoding='utf-8-sig',
    index=False,
)

## Get University of Pennsylvania links

In [None]:
def get_penn_links(PAGE_CONTROL = None):
    """Collect title, link and date for Penn Today science & technology items.

    Requests ``https://penntoday.upenn.edu/topic/science-and-technology?page=<n>``
    starting at ``n`` = 0. On each page the i-th ``div.tease__content`` is
    paired with the i-th ``p.tease__meta``; the title is the teaser text with
    all whitespace runs collapsed to single spaces.

    Args:
        PAGE_CONTROL: Number of listing pages to request. When falsy, the
            count is derived from the pager's "last" link on the topic page.

    Returns:
        The result of ``u.process_article_link_dataset`` when every page was
        fetched. If ``u.process_request`` returns an ``int`` (presumably an
        error/status code -- TODO confirm in ``utils``), the rows gathered so
        far are returned immediately, without that post-processing.
    """
    article_df = u.ARTICLE_LINK_BASE.copy()
    base_url = 'https://penntoday.upenn.edu/topic/science-and-technology/'
    paginated = 'https://penntoday.upenn.edu/topic/science-and-technology?page='

    if PAGE_CONTROL:
        max_pages = PAGE_CONTROL
    else:
        soup = u.process_request(base_url)
        if isinstance(soup, int):
            return article_df
        last_item = soup.find('li', class_='pager__item pager__item--last')
        last_href = last_item.find('a')['href']
        last_page_index = int(last_href.replace('?page=', ''))
        # Pages are requested from page=0, so the "last" link holds the final
        # zero-based index; the page count is one more. Using the index itself
        # as the range bound skipped the final page.
        max_pages = last_page_index + 1

    indexer = 0
    for page in range(max_pages):
        soup = u.process_request(f'{paginated}{page}')
        if isinstance(soup, int):
            return article_df
        link_container = soup.find_all("div", class_="tease__content")
        metadata = soup.find_all('p',class_='tease__meta')
        # A separate name for the position on the page: the original reused
        # ``i`` here and shadowed the page counter.
        for pos in range(len(link_container)):
            teaser = link_container[pos]
            link = teaser.find('a',class_='tease__link')['href']
            link = f'https://penntoday.upenn.edu{link}'
            # split()/join already drops leading and trailing whitespace, so
            # no extra strip() calls are needed.
            title = ' '.join(teaser.get_text().split())
            date = metadata[pos].find("time").get_text()
            article_df.loc[indexer, 'article_title'] = title
            article_df.loc[indexer, 'article_link'] = link
            article_df.loc[indexer, 'article_date'] = date
            indexer += 1
    article_df = u.process_article_link_dataset(article_df, uni = 'penn_')
    return article_df

In [None]:
# Scrape the Penn Today listing; PAGE_CONTROL=35 limits the crawl to 35 listing pages.
penn_links = get_penn_links(PAGE_CONTROL=35)


In [None]:
# Display the frame returned by get_penn_links for a visual check.
penn_links

In [None]:
# Write the Penn links into the data_io.DATA location as CSV, without the
# index column, using the utf-8-sig encoding (UTF-8 with a BOM).
penn_links.to_csv(
    f'{data_io.DATA}penn_links.csv',
    index=False,
    encoding='utf-8-sig',
)

## Get Harvard links

In [None]:

def get_harvard_links(PAGE_CONTROL = None):
    """Collect title, link and date for Harvard Gazette science & tech articles.

    Reads the section landing page and then ``.../page/<n>/`` for the
    following pages. Titles and links come from ``h2.tz-article-image__title``
    headings, dates from the ``time`` element inside each
    ``div.tz-article-image__cat-and-date``.

    Args:
        PAGE_CONTROL: Number of listing pages to read. When falsy, the count
            is taken from the "Page 1 of N" text on the landing page.

    Returns:
        The result of ``u.process_article_link_dataset`` when every page was
        fetched. If ``u.process_request`` returns an ``int`` (presumably an
        error/status code -- TODO confirm in ``utils``): an untouched copy of
        ``u.ARTICLE_LINK_BASE`` when the landing page itself failed, otherwise
        a frame of the rows gathered so far, without that post-processing.
    """
    section_url = "https://news.harvard.edu/gazette/section/science-technology/"
    # Fixed: this initialisation was commented out, so the failure returns
    # below raised NameError instead of returning a frame.
    article_df = u.ARTICLE_LINK_BASE.copy()

    soup = u.process_request(section_url)
    if isinstance(soup, int):
        return article_df
    if PAGE_CONTROL:
        num_pages = PAGE_CONTROL
    else:
        page_text = soup.find('h3', class_="archive-paging__page-text").get_text()
        num_pages = int(page_text.replace('Page 1 of ', ''))

    links = []
    dates = []
    titles = []
    fetch_failed = False
    # The landing page labels itself "Page 1 of N", i.e. pages are numbered
    # from 1. The original requested the landing page and then page/1/ ..
    # page/N-1/, which repeated page 1 and never reached page N.
    for page in range(1, num_pages + 1):
        if page > 1:
            # Page 1 is the landing page already fetched above.
            soup = u.process_request(f'{section_url}page/{page}/')
            if isinstance(soup, int):
                fetch_failed = True
                break

        headings = soup.find_all('h2', class_="tz-article-image__title")
        titles.extend(h.find("a").get_text().strip() for h in headings)
        links.extend(h.find('a')['href'].strip() for h in headings)

        date_blocks = soup.find_all('div', class_='tz-article-image__cat-and-date')
        dates.extend(d.find("time").get_text().strip() for d in date_blocks)

    article_df = pd.DataFrame({'article_link': links,
                               'article_date': dates,
                               'article_title': titles})
    if fetch_failed:
        # Like the other scrapers, hand back what was gathered, unprocessed.
        return article_df
    article_df = u.process_article_link_dataset(article_df, uni = 'harvard_')
    return article_df

In [None]:
# PAGE_CONTROL=13 caps the number of listing pages read.
harvard_links = get_harvard_links(PAGE_CONTROL = 13)
# Shuffle the rows. The fixed seed keeps the row order, and therefore the CSV
# written from this frame, the same on every run.
harvard_links = harvard_links.sample(frac=1, random_state=42)
harvard_links

In [None]:
# Write the Harvard links into the data_io.DATA location as CSV, without the
# index column, using the utf-8-sig encoding (UTF-8 with a BOM).
harvard_links.to_csv(
    f'{data_io.DATA}harvard_links.csv',
    index=False,
    encoding='utf-8-sig',
)