# This notebook downloads the SEPTEMBER press releases of the 6 New Zealand political parties likely to be represented in Parliament following the September 2017 General Election (National, Labour, NZ First, Greens, Maori Party, and ACT) and saves each one as a text document in a 'test' folder.

In [None]:
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os

In [None]:
def clear_all():
    """Build a fresh set of empty working lists.

    Returns:
        A 5-tuple of new, independent empty lists, in the order
        (urls, titles, all_links, slash_links, releases_pages).

    Note:
        Nothing outside this function is modified. A caller that wants to
        reset its own lists has to unpack and assign the returned tuple.
    """
    urls, titles, all_links, slash_links, releases_pages = ([] for _ in range(5))
    return (urls, titles, all_links, slash_links, releases_pages)

def write_to_file(textstr, filename):
    """Write ``textstr`` to ``filename`` as UTF-8 encoded bytes.

    An existing file at ``filename`` is overwritten. Missing parent folders
    are created first, so saving into a not-yet-existing output folder does
    not fail.

    Args:
        textstr: The text to save.
        filename: Path of the file to write.

    Returns:
        An empty tuple (kept for backward compatibility with existing
        callers).
    """
    text = bytes(textstr, 'utf-8')
    folder = os.path.dirname(filename)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # The context manager closes the handle even when the write raises.
    with open(filename, 'wb') as out_file:
        out_file.write(text)
    return ()


### 1. Get Labour press releases

In [None]:
# First scrape press release URLs
all_links = []

# Index pages that list the press releases: the landing page plus the
# numbered pages 2 to 5 (5 pages in total).
releases_pages = ['http://www.labour.org.nz/press_releases']
for i in range(2,6):  # (2,6) is September
    pg = 'http://www.labour.org.nz/press_releases?page=' + str(i)
    releases_pages.append(pg)

for each_page in releases_pages:
    r = urllib.request.urlopen(each_page).read()
    soup = BeautifulSoup(r, "lxml")

    # The release links are looked up inside <span class="read-more"> elements.
    mess = soup.find_all('span', class_="read-more")
    for subby in mess:
        links = subby.find_all('a')
        for link in links:
            relative_url = link.get('href')
            if not relative_url:
                # An anchor without an href gives None; there is nothing to
                # download and .replace() on None would raise.
                continue
            relative_url = relative_url.replace('http://www.labour.org.nz', '') # Fix the occasional absolute url
            new_link = 'http://www.labour.org.nz' + str(relative_url)
            all_links.append(new_link)

print(len(all_links))
#print(all_links)

In [None]:
def scrape_labour(urls):
    """Download Labour press releases and save each one as a text file.

    For every URL the text of the ``div`` with classes
    ``content blog-content`` is saved to ``test/labour/<name>.txt``, where
    ``<name>`` is the URL with its first 25 characters (the length of
    ``http://www.labour.org.nz/``) removed. URLs whose file already exists
    are skipped without being downloaded. A URL that cannot be fetched, or
    whose page has no content div, is reported and skipped instead of
    aborting the whole run.

    Args:
        urls: Iterable of absolute press release URLs.
    """
    for newurl in urls:
        print(newurl)
        filename = newurl[25:] + '.txt'
        full_path = os.path.join('test/labour', filename)
        if os.path.exists(full_path):
            continue
        try:
            page = urllib.request.urlopen(newurl).read()
        except urllib.error.URLError:
            print ("Page not found!")
            continue
        soup = BeautifulSoup(page, "lxml")
        content = soup.find('div', class_ = 'content blog-content')
        if content is None:
            # find() returns None when nothing matches; skip rather than crash.
            print("for url ", newurl, "I cannot find class content blog-content")
            continue
        if not soup.find(class_ = 'byline'):
            print('url', newurl, 'has no byline. Add manually.')
        # Only the release body is saved; author and date are not prepended.
        write_to_file(content.get_text(), full_path)

scrape_labour(all_links)


### 2. Get National Party press releases

In [None]:
clear_all()
slash_links = []
all_links = []

# Listing pages that carry the press release links: the landing page
# followed by the numbered pages 2 to 9.
releases_pages = ['https://www.national.org.nz/press'] + [
    'https://www.national.org.nz/press?page=' + str(page_no)
    for page_no in range(2, 10)  # (2,10) is Sep (8 onward)
]

# Gather the href of every anchor on every listing page (None when an
# anchor has no href attribute).
for each_page in releases_pages:
    soup = BeautifulSoup(urllib.request.urlopen(each_page).read(), "lxml")
    all_links.extend(anchor.get('href') for anchor in soup.find_all('a'))

# Keep only the Facebook "share" links and strip the share prefix, which
# leaves the address that was being shared.
share_prefix = 'http://www.facebook.com/share.php?u='
slash_links.extend(
    href[len(share_prefix):]
    for href in all_links
    if href and len(href) > len(share_prefix) and href.startswith(share_prefix)
)

print(len(slash_links))

In [None]:
# Now scrape similarly to before.

def scrape_national(urls):
    """Download National Party press releases and save each as a text file.

    For every URL the text of the first ``div`` with class ``content`` is
    saved to ``test/national/<name>.txt``, where ``<name>`` is the URL with
    its first 28 characters (the length of ``https://www.national.org.nz/``)
    removed. URLs whose file already exists are skipped. A URL that cannot
    be fetched, or whose page has no content div, is reported and skipped
    instead of aborting the whole run. The number of files written is
    printed at the end.

    Args:
        urls: Iterable of absolute press release URLs.
    """
    counter = 0
    for newurl in urls:
        filename = newurl[28:] + '.txt'
        full_path = os.path.join('test/national', filename)
        if os.path.exists(full_path):
            continue
        try:
            page = urllib.request.urlopen(newurl).read()
        except urllib.error.URLError:
            print ("Page not found!")
            continue
        soup = BeautifulSoup(page, "lxml")
        content = soup.find('div', class_ = 'content')
        if content is None:
            # find() returns None when nothing matches; skip rather than crash.
            print("for url ", newurl, "I cannot find class content")
            continue
        if not soup.find(class_ = 'author'):  # Some press releases are missing an author field
            print('url', newurl, 'has no author field. May need to add manually.')
        if not soup.time:  # Some press releases are missing a time field
            print('url', newurl, 'has no date field. May need to add manually.')
        # Only the release body is saved; author and date are not prepended.
        write_to_file(content.get_text(), full_path)
        counter += 1
    print(counter,"press releases saved as text documents.")

scrape_national(slash_links)


### 3. Get NZ First press releases

In [None]:
clear_all()
all_links = []

# Listing pages that carry the press release links: the news landing page
# followed by the numbered pages 2 to 7.
releases_pages = ['http://www.nzfirst.org.nz/news']
releases_pages += [
    'http://www.nzfirst.org.nz/news?page=' + str(page_no)
    for page_no in range(2, 8)  # (2,8) is all of Sep
]

for each_page in releases_pages:
    html = urllib.request.urlopen(each_page).read()
    soup = BeautifulSoup(html, "lxml")
    # Release links are looked up inside <div class="blog-title-wrap">.
    for title_wrap in soup.find_all('div', class_="blog-title-wrap"):
        for anchor in title_wrap.find_all('a'):
            # The href is treated as site-relative and prefixed with the
            # host. A missing href ends up as the literal text 'None'.
            all_links.append('http://www.nzfirst.org.nz' + str(anchor.get('href')))

print(len(all_links))


In [None]:
# Now scrape similarly to before.

def scrape_nzfirst(urls):
    """Download NZ First press releases and save each one as a text file.

    For every URL the text of the ``div`` with classes ``content wysiwyg``
    is saved to ``test/nzfirst/<name>.txt``, where ``<name>`` is the URL with
    its first 26 characters (the length of ``http://www.nzfirst.org.nz/``)
    removed. URLs whose file already exists are skipped before any download
    is attempted. A URL that cannot be fetched, or whose page has no content
    div, is reported and skipped. The number of files written is printed at
    the end.

    Args:
        urls: Iterable of absolute press release URLs.
    """
    counter = 0
    for newurl in urls:
        print(newurl)
        filename = newurl[26:] + '.txt'
        full_path = os.path.join('test/nzfirst', filename)
        if os.path.exists(full_path):
            continue
        try:
            page = urllib.request.urlopen(newurl).read()
        except urllib.error.URLError:
            print ("Page not found!")
            continue
        soup = BeautifulSoup(page, "lxml")
        reltext = soup.find('div', class_ = 'content wysiwyg')
        if reltext is None:
            # Skip this release: without the content div there is nothing to
            # save, and writing would otherwise reuse the previous release.
            print("for url ", newurl, "I cannot find class content wysiwyg")
            continue
        if not soup.find(class_ = 'page-tag'):
            print('url', newurl, 'has no author field. May need to add manually.')
        meta = soup.find('div', class_ = 'meta')
        # Guard the lookup itself: find() returns None when the div is absent.
        if meta is None or not meta.get_text():
            print('url', newurl, 'has no date field. Add manually.')
        # Only the release body is saved; author and date are not prepended.
        write_to_file(reltext.get_text(), full_path)
        counter += 1
    print(counter,"NZ First press releases saved as text documents.")

scrape_nzfirst(all_links)



### 4. Get Green party press releases

In [None]:
all_links = []

# Listing pages that carry the press release links: the filtered media page
# plus three further pages.
# NOTE(review): the page number is followed by a literal '1', so the pages
# requested are page=11, page=21 and page=31 - confirm this is intended.
# NOTE(review): an earlier comment said "(1,7) is all of Sep" while the range
# used is (1,4) - confirm which is right.
releases_pages = ['https://www.greens.org.nz/media?f[0]=type%3Agreens_press_release']
releases_pages.extend(
    'https://www.greens.org.nz/media?page=' + str(page_no) + '1&f[0]=type%3Agreens_press_release'
    for page_no in range(1, 4)
)

for each_page in releases_pages:
    html = urllib.request.urlopen(each_page).read()
    soup = BeautifulSoup(html, "lxml")
    # Release links are looked up inside elements with class "field-title".
    # The href is treated as site-relative and prefixed with the http:// host;
    # a missing href ends up as the literal text 'None'.
    all_links.extend(
        'http://www.greens.org.nz' + str(anchor.get('href'))
        for title_field in soup.find_all(class_="field-title")
        for anchor in title_field.find_all('a')
    )

print(len(all_links))
print(all_links)

In [None]:
# Now scrape similarly to before.

def scrape_greens(urls):
    """Download Green Party press releases and save each one as a text file.

    The file name is derived from the URL: the first 25 characters (the
    length of ``http://www.greens.org.nz/``) are removed, ``/`` and ``%`` are
    stripped, and the first 20 characters of what remains are dropped
    (presumably a common path prefix - confirm against real URLs). The text
    of the ``div`` with class ``field-body`` is saved to
    ``test/green/<name>``. URLs whose file already exists are skipped before
    any download is attempted. A URL that cannot be fetched, or whose page
    has no body div, is reported and skipped. The number of files written is
    printed at the end.

    Args:
        urls: Iterable of absolute press release URLs.
    """
    counter = 0
    for newurl in urls:
        filename = newurl[25:] + '.txt' # Their urls don't work as file names because of slashes etc
        filename = filename.replace('/','').replace('%','') # strip slashes and %
        filename = filename[20:]
        full_path = os.path.join('test/green', filename)
        print(newurl)
        # Check the real output path (not the bare file name) so releases
        # that were already saved are neither downloaded nor rewritten.
        if os.path.exists(full_path):
            continue
        try:
            page = urllib.request.urlopen(newurl).read()
        except urllib.error.URLError:
            print ("Page not found!")
            continue
        soup = BeautifulSoup(page, "lxml")
        reltext = soup.find('div', class_ = 'field-body')
        if reltext is None:
            print("for url ", newurl, "I cannot find content")
            continue
        if not soup.find(class_ = 'field-posted'):
            print('url', newurl, 'has no author-date (field-posted) field. May need to add manually.')
        # Only the release body is saved; author and date are not prepended.
        write_to_file(reltext.get_text(), full_path)
        counter += 1
    print(counter, "Greens press releases saved as text documents.")

scrape_greens(all_links)


### 5. Get Maori party press releases

In [None]:
# Their press releases start on their main page and the urls are ridiculous thereafter

all_links = []

# Only the home page is read: all of the Sep releases are linked from it.
releases_pages = ['http://www.maoriparty.org']
# Paging pattern kept for reference in case later pages are needed again:
#for i in range(2,3):  # Actually all of Sep are on the home page
#    pg = 'http://www.maoriparty.org/?fp=t%2F0pb2vva74axufxqvxziwymplu1c0katfo8mi5qhh3jupuv2btt7auhgr2j5o%2Fnyzsa7ra4dg2ukkxsikmqxg%3D%3D&page=' + str(i) + '&poru=wnycig1t%2Ba24%2Fpnhlfz7yto%2Faklt5encfsqsueywx7hlehvjtjzlv2zerv7z2haa&prvtof=j9ejw%2Falc85a68hgs2xyeg76zfbcphy1bxbsk8ororm%3D' + str(i) 
#    releases_pages.append(pg)

for each_page in releases_pages:
    html = urllib.request.urlopen(each_page).read()
    soup = BeautifulSoup(html, "lxml")
    # Release links are looked up inside elements with class "blog". The href
    # is treated as site-relative and prefixed with the host; a missing href
    # ends up as the literal text 'None'.
    for blog_block in soup.find_all(class_="blog"):
        for anchor in blog_block.find_all('a'):
            all_links.append('http://www.maoriparty.org' + str(anchor.get('href')))

# The same release is linked more than once; going through a set drops the
# repeats (the resulting order is arbitrary).
unique_links = set(all_links)
all_links = list(unique_links)
print(len(all_links))
print(all_links)

    

In [None]:
# Now scrape similarly to before.

def scrape_maori(urls):
    """Download Maori Party press releases and save each one as a text file.

    For every URL the text of the first ``div`` with class ``content`` is
    saved to ``test/maori/<name>.txt``, where ``<name>`` is the URL with its
    first 26 characters (the length of ``http://www.maoriparty.org/``)
    removed. URLs whose file already exists are skipped without being
    downloaded. A URL that cannot be fetched, or whose page has no content
    div, is reported and skipped. The number of files written is printed at
    the end.

    Args:
        urls: Iterable of absolute press release URLs.
    """
    counter = 0
    for newurl in urls:
        print(newurl)
        filename = newurl[26:] + '.txt'
        full_path = os.path.join('test/maori', filename)
        if os.path.exists(full_path):
            continue
        try:
            page = urllib.request.urlopen(newurl).read()
        except urllib.error.URLError:
            print ("Page not found!")
            continue
        soup = BeautifulSoup(page, "lxml")
        reltext = soup.find('div', class_ = 'content')
        if reltext is None:
            # Skip this release: without the content div there is nothing to
            # save, and writing would otherwise reuse the previous release.
            print("for url ", newurl, "I cannot find content")
            continue
        write_to_file(reltext.get_text(), full_path)
        counter += 1
    print(counter, "Maori Party press releases saved as text documents.")

scrape_maori(all_links)

### 6. Get ACT party press releases

In [None]:
clear_all()

all_links = []
# Listing pages that carry the press release links: the news landing page
# followed by the numbered pages 2 to 5.
releases_pages = ['http://act.org.nz/news'] # 1st page
for i in range(2,6):  # (2,6) is all of Sep
    pg = 'http://act.org.nz/news/page/' + str(i) + '/'
    releases_pages.append(pg)

for each_page in releases_pages:
    r = urllib.request.urlopen(each_page).read()
    soup = BeautifulSoup(r, "lxml")
    # Release links are looked up inside elements with class "read-more".
    mess = soup.find_all(class_="read-more")
    for subby in mess:
        messy = subby.find_all('a')  # It's a full link, not relative
        for link in messy:
            new_link = link.get('href')
            if not new_link:
                # An anchor without an href gives None, which cannot be
                # sliced into a file name or downloaded later on.
                continue
            all_links.append(new_link)

print(len(all_links))
print(all_links)

In [None]:
# Now scrape similarly to before.

def scrape_act(urls):
    """Download ACT press releases and save each one as a text file.

    The file name is the URL with its first 18 characters (the length of
    ``http://act.org.nz/``) removed and all ``/`` stripped, plus ``.txt``.
    The text of the ``div`` with class ``entry-content``, with the
    ``Facebook0Twitter`` share-button text removed, is saved to
    ``test/act/<name>``. URLs whose file already exists are skipped without
    being downloaded. A URL that cannot be fetched, or whose page has no
    content div, is reported and skipped. The number of files written is
    printed at the end.

    Args:
        urls: Iterable of absolute press release URLs.
    """
    counter = 0
    for newurl in urls:
        print(newurl)
        filename = newurl[18:] + '.txt'
        filename = filename.replace('/','') # strip slashes
        full_path = os.path.join('test/act', filename)
        if os.path.exists(full_path):
            continue
        try:
            page = urllib.request.urlopen(newurl).read()
        except urllib.error.URLError:
            print ("Page not found!")
            continue
        soup = BeautifulSoup(page, "lxml")
        reltext = soup.find('div', class_ = 'entry-content')
        if reltext is None:
            # Skip this release: without the content div there is nothing to
            # save, and writing would otherwise reuse the previous release.
            print("for url ", newurl, "I cannot find class entry-content")
            continue
        temptext = reltext.get_text().replace('Facebook0Twitter', '')

        if not soup.find(class_ = 'fn'):
            print('url', newurl, 'has no author field. May need to add manually.')
        if not soup.find('time', class_ = 'updated'):
            print('url', newurl, 'has no date field. Add manually.')

        # Only the release body is saved; author and date are not prepended.
        write_to_file(temptext, full_path)
        counter += 1
    print(counter, "ACT press releases saved as text documents.")

scrape_act(all_links)


In [None]:
# The earlier data did include some releases from early September. Delete double-ups

parties = ['act', 'green', 'labour', 'maori', 'national', 'nzfirst']

# For each party, compare the freshly scraped files under test/<party> with
# the earlier files under <party>; a new file whose name already exists in
# the earlier folder is removed from the new folder.
for party in parties:
    new_folder = os.path.join('test', party)
    print('new_folder:', new_folder)
    for release in os.listdir(new_folder):
        new_path = os.path.join(new_folder, release)
        print ('new_path:', new_path)
        if not os.path.exists(os.path.join(party, release)):
            continue
        os.remove(new_path)
        print('deleted file', new_path)