# Assignment 1, Part B: Writing a web scraper

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Page whose outgoing links are collected below.
url = 'https://wikimediafoundation.org/wiki/Home'
# Without a timeout requests waits indefinitely on an unresponsive server,
# which would stall a Restart & Run All.
res = requests.get(url, timeout=10)
# Shown as the cell output so a failed fetch is visible before parsing.
res.status_code

200

In [3]:
# Parse the fetched HTML with the standard-library backed parser.
soup = BeautifulSoup(res.text, 'html.parser')

#### Collect all of the external links:

In [4]:
def splitUrl(url):
    """Return the host part of ``url``.

    Any leading scheme (``https://``, ``http://``, ...) or protocol-relative
    ``//`` prefix is dropped, then everything from the first ``/``, ``?`` or
    ``#`` onward is discarded. A URL without a scheme is treated as starting
    directly with the host.

    Parameters
    ----------
    url : str
        Absolute, protocol-relative or scheme-less URL.

    Returns
    -------
    str
        The host (including any port or userinfo), e.g.
        ``'wikimediafoundation.org'``.
    """
    # Strip only a prefix: a later "https://" inside a query string must stay.
    without_scheme = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', '', url)
    return re.split(r'[/?#]', without_scheme, maxsplit=1)[0]
    
domain = splitUrl(url)
# Keep only anchors whose href never mentions the page's own domain. The
# domain is escaped so its dots match literal dots, not any character.
a_list = soup.find_all('a', href=re.compile("^((?!" + re.escape(domain) + ").)*$"))
                       
def get_link(a):
    """Return the https URL that anchor ``a`` points to, or ``None``.

    ``a`` only needs a ``get('href')`` lookup. Protocol-relative hrefs
    (``//host/...``) are promoted to ``https:``; anything that does not then
    start with ``https`` (relative paths, fragments, other schemes) is
    rejected.
    """
    target = a.get('href')
    if target is not None:
        if target.startswith('//'):
            target = 'https:' + target
        if target.startswith('https'):
            return target
    return None

# Resolve each anchor once and drop the ones get_link rejected (None).
links = [href for href in map(get_link, a_list) if href]
links

['https://en.wikipedia.org/wiki/en:free_content',
 'https://en.wikipedia.org/wiki/wiki',
 'https://www.wikipedia.org',
 'https://blog.wikimedia.org/2018/01/23/cote-divoire-library-partnerships/',
 'https://blog.wikimedia.org/2018/01/23/education-survey-report/',
 'https://blog.wikimedia.org/2018/01/18/on-that-net-neutrality-clickstream-diagram/',
 'https://blog.wikimedia.org/2018/01/17/public-domain-grows-next-year/',
 'https://blog.wikimedia.org/2018/01/17/add-your-photos-to-wikimedia-commons/',
 'https://blog.wikimedia.org/',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector',
 'https://annual.wikimedia.org/',
 'https://annual.wikimedia.org/',
 'https://lists.wikimedia.org/mailman/listinfo/WikimediaAnnounce-l',
 'https://meta.wikimedia.org/wiki/Wikimedia_Foundation_Annual_Plan/2017-2018/Final',
 'https://policy.wikimedia.org/',
 'https://transparency.wikimedia.org/',
 'https://status.wikimedia.org/',
 '

#### Associate the link with a textual description of it from the website:

In [5]:
def get_link_with_text(a):
    """Return ``(url, text)`` for anchor ``a``, or ``None`` if it has no https URL.

    The URL rules are: a missing href is rejected, a protocol-relative href
    (``//host/...``) is promoted to ``https:``, and anything not starting
    with ``https`` afterwards is rejected. ``text`` is taken from ``a.text``.
    """
    target = a.get('href')
    if target is not None:
        if target.startswith('//'):
            target = 'https:' + target
        if target.startswith('https'):
            return target, a.text
    return None

# Resolve each anchor once and keep only the (url, text) pairs that were produced.
links_text = [pair for pair in map(get_link_with_text, a_list) if pair]
links_text

[('https://en.wikipedia.org/wiki/en:free_content', 'free'),
 ('https://en.wikipedia.org/wiki/wiki', 'wiki'),
 ('https://www.wikipedia.org', 'Wikipedia'),
 ('https://blog.wikimedia.org/2018/01/23/cote-divoire-library-partnerships/',
  'In Cote d’Ivoire, partnering with libraries provides opportunities'),
 ('https://blog.wikimedia.org/2018/01/23/education-survey-report/',
  'Three ways we’re changing the Education team at the Wikimedia Foundation'),
 ('https://blog.wikimedia.org/2018/01/18/on-that-net-neutrality-clickstream-diagram/',
  'How we made that net neutrality clickstream diagram'),
 ('https://blog.wikimedia.org/2018/01/17/public-domain-grows-next-year/',
  'The public domain starts growing again next year, and it’s about time'),
 ('https://blog.wikimedia.org/2018/01/17/add-your-photos-to-wikimedia-commons/',
  'How to add your photos to Wikimedia Commons and add to the sum of all knowledge'),
 ('https://blog.wikimedia.org/', 'Read the complete blog archive. »'),
 ('https://dona

#### Function to check whether the link is valid:

In [6]:
from datetime import datetime

class Validate(object):
    """Check whether URLs answer with HTTP 200, remembering earlier answers.

    Attributes
    ----------
    cache : dict
        Maps each checked URL to its ``(check datetime string, valid)`` tuple.
    timeout : float
        Seconds to wait for a server before treating the link as invalid.
    last_check : str
        Formatted datetime of the most recent uncached check; only set once
        ``validateUrl`` has performed a request.
    """

    def __init__(self, timeout=10):
        self.cache = dict()
        self.timeout = timeout

    @staticmethod
    def _format_timestamp(moment):
        """Format ``moment`` like 'Thursday January 25, 2018 07:19 PM'."""
        # The day is inserted via ``moment.day`` because the unpadded "%-d"
        # directive is a platform extension that is not available everywhere.
        return "{:%A %B} {}, {:%Y %I:%M %p}".format(moment, moment.day, moment)

    def validateUrl(self, url):
        """Return ``(check datetime string, valid)`` for ``url``.

        A cached answer is returned as-is. Otherwise the URL is fetched;
        ``valid`` is True only for a 200 status code. A request that fails
        (connection error, timeout, malformed URL, ...) counts as invalid
        rather than aborting the caller's loop. The outcome is cached and
        printed as ``url<TAB>valid``.
        """
        if url in self.cache:
            return self.cache[url]

        try:
            response = requests.get(url, timeout=self.timeout)
            result = response.status_code == 200
        except requests.RequestException:
            result = False

        self.last_check = self._format_timestamp(datetime.now())
        self.cache[url] = (self.last_check, result)
        print('{}\t{}'.format(url, result))
        return self.last_check, result

In [7]:
from itertools import chain
v = Validate()
# One row per link: [check datetime, valid flag, link text, link URL].
res = [list(v.validateUrl(link)) + [text, link] for link, text in links_text]

https://en.wikipedia.org/wiki/en:free_content	True
https://en.wikipedia.org/wiki/wiki	True
https://www.wikipedia.org	True
https://blog.wikimedia.org/2018/01/23/cote-divoire-library-partnerships/	True
https://blog.wikimedia.org/2018/01/23/education-survey-report/	True
https://blog.wikimedia.org/2018/01/18/on-that-net-neutrality-clickstream-diagram/	True
https://blog.wikimedia.org/2018/01/17/public-domain-grows-next-year/	True
https://blog.wikimedia.org/2018/01/17/add-your-photos-to-wikimedia-commons/	True
https://blog.wikimedia.org/	True
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector	True
https://annual.wikimedia.org/	True
https://lists.wikimedia.org/mailman/listinfo/WikimediaAnnounce-l	True
https://meta.wikimedia.org/wiki/Wikimedia_Foundation_Annual_Plan/2017-2018/Final	True
https://policy.wikimedia.org/	True
https://transparency.wikimedia.org/	True
https://status.wikimedia.org/	True
https://meta.wikimedia.org/wiki/IRC_Office_Hours	True
https://meta.wikimedia.org/wiki/F

In [8]:
# Display the assembled rows: [check datetime, valid flag, link text, link URL].
res

[['Thursday January 25, 2018 07:19 PM',
  True,
  'free',
  'https://en.wikipedia.org/wiki/en:free_content'],
 ['Thursday January 25, 2018 07:19 PM',
  True,
  'wiki',
  'https://en.wikipedia.org/wiki/wiki'],
 ['Thursday January 25, 2018 07:19 PM',
  True,
  'Wikipedia',
  'https://www.wikipedia.org'],
 ['Thursday January 25, 2018 07:19 PM',
  True,
  'In Cote d’Ivoire, partnering with libraries provides opportunities',
  'https://blog.wikimedia.org/2018/01/23/cote-divoire-library-partnerships/'],
 ['Thursday January 25, 2018 07:19 PM',
  True,
  'Three ways we’re changing the Education team at the Wikimedia Foundation',
  'https://blog.wikimedia.org/2018/01/23/education-survey-report/'],
 ['Thursday January 25, 2018 07:19 PM',
  True,
  'How we made that net neutrality clickstream diagram',
  'https://blog.wikimedia.org/2018/01/18/on-that-net-neutrality-clickstream-diagram/'],
 ['Thursday January 25, 2018 07:19 PM',
  True,
  'The public domain starts growing again next year, and it’s

#### Save the external links (URLs), their textual descriptions, a validity boolean, and the datetime of the last validity check to a CSV file:

In [9]:
import pandas as pd
# Columns are named at construction time so an empty result list still yields
# a well-formed (empty) table instead of a length-mismatch error.
df = pd.DataFrame(res, columns=['Check Datetime', 'Valid', 'Text', 'External Link'])
# Written with the default index column, as a CSV file next to the notebook.
df.to_csv('web_external_links.csv')