<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px" width="50">

# Capstone Project: Predicting Stock Price Changes of  Healthcare Companies based on News Headlines

### Import Libraries

In [143]:
import pandas as pd
import datetime
import requests
import re
import time
import numpy as np
from bs4 import BeautifulSoup

pd.set_option('display.max.columns', None)
pd.set_option('display.max.colwidth', 100)

### Write functions to support webscraping from Reuters

In [2]:
# Write function to extract information from Reuters website in preferred format
def get_headlines(links_site):
    
    # Scrape the html of the site
    resp = requests.get(links_site)
 
    if not resp.ok:
        return None
 
    html = resp.content
    s = str(html)
    
    # Extract raw data
    headlines = re.findall(r'headline: "(.*?)",', s)
    dates = re.findall(r'date: "(.*?)",', s)
    links = re.findall(r'href: "(.*?)",',s)

    ### Format headlines, links and time
    
    # Eliminate HTML tags from headline
    headlines = [re.sub('<[^<]+?>', '',item) for item in headlines]
    
    # Edit links
    prefix = 'https://www.reuters.com'
    links = [prefix + x for x in links]
    
    # Reformat time
    dates_stripped = [date.split(" ") for date in dates]

    index1 = 0
    
    while (index1 < len(dates)):
        
        # Extract raw time and convert
        year = dates_stripped[index1][2]
        rawmonth = dates_stripped[index1][0]
        raw_day = dates_stripped[index1][1]
        month = time.strptime(rawmonth,"%B").tm_mon
        day = raw_day.split(",")[0]
        
        dates[index1] = str(year)+"-"+str(month)+"-"+str(day)
        
        index1 = index1+1
    
    dataframe = pd.DataFrame({'dates':dates,'headlines':headlines,'links':links, 'article_content':''})

    return dataframe

In [3]:
# Write function to scrape news info from reuters 
def scrape_reuters(query):
    
    index = 1
    all_news = pd.DataFrame(columns=['dates', 'headlines', 'links', 'article_content'])
 
    # Loop through subsequent Reuters pages
    for i in range(500):
        site = ('https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=' + query 
                + '&bigOrSmall=big&articleWithBlog=true&sortBy=relevance&dateRange=all&numResultsToShow=100&pn='+ 
                str(i) + '&callback=addMoreNewsResults')
        
        current_site_news = get_headlines(site)
        all_news = pd.concat([all_news,current_site_news], ignore_index=True)
        time.sleep(2)
        i += 1
        
    # scrape 10 years' worth of news 
    all_news = all_news[(all_news['dates'] > '2011-01-01')].sort_values("dates")
    
    # drop duplicates of same article 
    all_news.drop_duplicates(subset=['links'], inplace=True, ignore_index=True)
    
    # extract article contents from article links
    for i in range(0, len(all_news)):
        url = all_news["links"][i]
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, 'lxml')
        container = soup.findAll("p", {"class": "Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x"})            
        container1 = soup.findAll("p", {"class": "Text__text___3eVx1j Text__dark-grey___AS2I_p Text__regular___Bh17t- Text__large___1i0u1F Body__base___25kqPt Body__large_body___3g04wK ArticleBody__element___3UrnEs"})            
        
        for p in range(0, len(container)):
            text = []
            para = [container[p].text]
            text = " ".join(text + para)
            all_news["article_content"][i] = "".join(all_news["article_content"][i] + text)
            p += 1
        
        for c in range(0, len(container1)):
            text = []
            para = [container1[c].text]
            text = " ".join(text + para)
            all_news["article_content"][i] = "".join(all_news["article_content"][i] + text)
            c += 1
        
    return all_news

### Webscraping for news related to Pfizer [NYSE: PFE]

In [4]:
# scrape news related to pfizer 
news = scrape_reuters("pfizer")

In [5]:
# show the number of news headlines scraped
len(news)

8817

In [11]:
news.head(10)

Unnamed: 0,dates,headlines,links,article_content
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,https://www.reuters.com/article/idUSWEA7960201...,"ZURICH, Jan 13 (Reuters) - Nestle, the world’..."
1,2012-1-13,Nestle declines comment on Pfizer unit bid report,https://www.reuters.com/article/idUSTRE80C0IM2...,"ZURICH (Reuters) - Nestle NESN.VX, the world'..."
2,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney ca...,https://www.reuters.com/article/idUSTRE80Q25F2...,WASHINGTON (Reuters) - Pfizer’s Inlyta drug f...
3,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney ca...,https://www.reuters.com/article/idUSTRE80Q1OD2...,WASHINGTON (Reuters) - Pfizer’s Inlyta drug f...
4,2012-1-28,India\'s Pfizer Oct-Dec net profit at 482.8 ml...,https://www.reuters.com/article/idUSL4E8CS03Z2...,
5,2012-1-31,"Generics take toll on Pfizer, Lilly profits",https://www.reuters.com/article/idUSTRE80U1BD2...,(Reuters) - Competition from low-cost generic...
6,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar",https://www.reuters.com/article/idUSTRE80U0V32...,(Reuters) - Pfizer Inc PFE.N reported sharply...
7,2012-10-04,Trial suggests Prevnar may also protect ages 1...,https://www.reuters.com/article/idUSBRE8930Q62...,(Reuters) - Pfizer Inc said a late-stage tria...
8,2012-10-04,UPDATE 2-Trial suggests Prevnar may also prote...,https://www.reuters.com/article/idUSL3E8L45JE2...,Oct 4 (Reuters) - Pfizer Inc said a late-stag...
9,2012-10-04,Pfizer\'s Prevenar 13 vaccine meets trial goal,https://www.reuters.com/article/idUSL3E8L45BL2...,Oct 4 (Reuters) - Pfizer Inc said a late-stag...


In [12]:
news.info(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8817 entries, 0 to 8816
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   dates            8817 non-null   object
 1   headlines        8817 non-null   object
 2   links            8817 non-null   object
 3   article_content  8817 non-null   object
dtypes: object(4)
memory usage: 275.7+ KB


### Removal of Duplicated News Updates

In [50]:
def clean_updates(text):
    text = re.sub(r'UPDATE 1-', '', text)
    text = re.sub(r'UPDATE 2-', '', text)
    text = re.sub(r'UPDATE 3-', '', text)
    text = re.sub(r'UPDATE 4-', '', text)
    text = re.sub(r'BRIEF-', '', text)
    return text

In [51]:
news['headlines'] = news['headlines'].apply(clean_updates)

In [52]:
news.headlines.head(30)

0     Nestle declines comment on Pfizer unit bid report
1     US FDA approves Pfizer\'s Inlyta for kidney ca...
2           Generics take toll on Pfizer, Lilly profits
3        Pfizer trims 2012 view, citing stronger dollar
4     Trial suggests Prevnar may also protect ages 1...
5        Pfizer\'s Prevenar 13 vaccine meets trial goal
6     Pfizer to appeal India decision to revoke canc...
7     Pfizer to pay $164 million in investor lawsuit...
8     Pfizer agrees to $164 mln settlement of Celebr...
9     Pfizer to pay $164 mln in investor lawsuit ove...
10    Pfizer says pain drug as safe as rival pills i...
11    Pfizer kidney cancer drug fails as initial tre...
12    Pfizer to buy maker of attention-deficit drug ...
13    Pfizer to buy maker of attention-deficit drug ...
14    Pfizer lung cancer drug gets conditional EU ap...
15    Pfizer reschedules issuance of third quarter 2...
16    Shionogi to take 10 pct stake in GSK, Pfizer H...
17    Pfizer sales weak on vaccine, emerging mar

In [53]:
# drop duplicates of same article with different headlines 
news.drop_duplicates(subset=['headlines'], ignore_index=True, inplace=True)

In [54]:
# drop rows with no article contents scraped
news = news[news['article_content'] != ' ']

In [109]:
news.head(10)

Unnamed: 0,dates,headlines,links,article_content
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,https://www.reuters.com/article/idUSWEA7960201...,"ZURICH, Jan 13 (Reuters) - Nestle, the world’..."
1,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney ca...,https://www.reuters.com/article/idUSTRE80Q25F2...,WASHINGTON (Reuters) - Pfizer’s Inlyta drug f...
2,2012-1-31,"Generics take toll on Pfizer, Lilly profits",https://www.reuters.com/article/idUSTRE80U1BD2...,(Reuters) - Competition from low-cost generic...
3,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar",https://www.reuters.com/article/idUSTRE80U0V32...,(Reuters) - Pfizer Inc PFE.N reported sharply...
4,2012-10-04,Trial suggests Prevnar may also protect ages 1...,https://www.reuters.com/article/idUSBRE8930Q62...,(Reuters) - Pfizer Inc said a late-stage tria...
5,2012-10-04,Pfizer\'s Prevenar 13 vaccine meets trial goal,https://www.reuters.com/article/idUSL3E8L45BL2...,Oct 4 (Reuters) - Pfizer Inc said a late-stag...
6,2012-10-05,Pfizer to appeal India decision to revoke canc...,https://www.reuters.com/article/idUSBRE89408D2...,"MUMBAI (Reuters) - Pfizer Ltd, the India unit..."
7,2012-10-09,Pfizer to pay $164 million in investor lawsuit...,https://www.reuters.com/article/idUSBRE8981F72...,(Reuters) - Pfizer Inc has agreed to pay $164...
8,2012-10-09,Pfizer agrees to $164 mln settlement of Celebr...,https://www.reuters.com/article/idUSWEN7642201...,Oct 9 (Reuters) - Pfizer Inc : * Says has agr...
9,2012-10-09,Pfizer to pay $164 mln in investor lawsuit ove...,https://www.reuters.com/article/idUSL1E8L9KJJ2...,* Agrees to settlement of Celebrex securities...


In [73]:
# export merged dataset to csv for further data cleaning & EDA 
news.to_csv("../assets/scraped_news.csv", index=False)