<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px" width="50">

# Capstone Project: Predicting Stock Price Changes of  Healthcare Companies based on News Headlines

### Import Libraries

In [47]:
import pandas as pd
import datetime
import requests
import re
import time
import numpy as np
from bs4 import BeautifulSoup

pd.set_option('display.max.columns', None)
pd.set_option('display.max.colwidth', 100)

### Basic Data Cleaning and Aggregation by Unique Dates

In [48]:
# import scraped data & retain only news text
pfizer = pd.read_csv("../assets/scraped_news.csv")
pfizer.drop(columns=['links'], inplace=True)

In [49]:
pfizer.head()

Unnamed: 0,dates,headlines,article_content
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,"ZURICH, Jan 13 (Reuters) - Nestle, the world’s biggest food group, declined to comment on a rep..."
1,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney cancer,WASHINGTON (Reuters) - Pfizer’s Inlyta drug for patients with advanced kidney cancer won approv...
2,2012-1-31,"Generics take toll on Pfizer, Lilly profits",(Reuters) - Competition from low-cost generic drugs squeezed quarterly profits at Pfizer Inc PF...
3,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar","(Reuters) - Pfizer Inc PFE.N reported sharply lower quarterly earnings, hurt by generic forms o..."
4,2012-10-04,Trial suggests Prevnar may also protect ages 18-49,(Reuters) - Pfizer Inc said a late-stage trial of its vaccine to protect against pneumococcal b...


In [50]:
# create a function to check if company is mentioned in the news report at all
def company_in_text(text, company):
    if company in text.lower():
        return 1
    else:
        return 0

In [51]:
# apply function to create feature column
pfizer['mentioned'] = pfizer['headlines'].apply(lambda x: company_in_text(x, "pfizer"))

In [52]:
# show sample articles where the company was not mentioned at all
pfizer[pfizer['mentioned']==0].tail()

Unnamed: 0,dates,headlines,article_content,mentioned
6107,2021-9-01,EU health body says no urgent need for vaccine boosters,Sept 1 (Reuters) - The European Centre for Disease Prevention and Control (ECDC) said on Wednes...,0
6108,2021-9-01,U.S. administers third dose of COVID-19 vaccine to over 1 million people - CDC,Sept 1 (Reuters) - The United States has administered a third dose of either Pfizer Inc (PFE.N)...,0
6109,2021-9-01,COVID SCIENCE-Antibody levels higher after Moderna shot; Lilly arthritis drug used with steroid ...,(Reuters) - The following is a summary of some recent studies on COVID-19. They include researc...,0
6110,2021-9-01,South Korea in eleventh-hour talks to head off strike by health workers,"SEOUL, Sept 1 (Reuters) - South Korean government officials were in last-ditch efforts on Wedne...",0
6111,2021-9-01,China beats Taiwan to the punch in announcing new vaccine delivery,"BEIJING/TAIPEI, Sept 1 (Reuters) - China beat Taiwan to the punch on Wednesday in announcing th...",0


In [53]:
# we remove articles with no mention of the company in our dataset
pfizer = pfizer[pfizer['mentioned']==1]

In [54]:
pfizer.head()

Unnamed: 0,dates,headlines,article_content,mentioned
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,"ZURICH, Jan 13 (Reuters) - Nestle, the world’s biggest food group, declined to comment on a rep...",1
1,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney cancer,WASHINGTON (Reuters) - Pfizer’s Inlyta drug for patients with advanced kidney cancer won approv...,1
2,2012-1-31,"Generics take toll on Pfizer, Lilly profits",(Reuters) - Competition from low-cost generic drugs squeezed quarterly profits at Pfizer Inc PF...,1
3,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar","(Reuters) - Pfizer Inc PFE.N reported sharply lower quarterly earnings, hurt by generic forms o...",1
5,2012-10-04,Pfizer\'s Prevenar 13 vaccine meets trial goal,Oct 4 (Reuters) - Pfizer Inc said a late-stage study of its Prevenar 13 vaccine met the main tr...,1


In [55]:
# count number of news each day
news_volume = pfizer.groupby('dates').size()
news_volume.head()

dates
2012-1-13     1
2012-1-27     1
2012-1-31     2
2012-10-04    1
2012-10-05    1
dtype: int64

In [56]:
# group headlines together such that each day will only be recorded as one observation
agg_text = pfizer.groupby('dates')['headlines'].apply(list)
agg_df = pd.DataFrame({'text':agg_text, 'news_count':news_volume})
    
# set the date index in a particular format 
agg_df.index = pd.to_datetime(agg_df.index,format="%Y-%m-%d")

In [57]:
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 998 entries, 2012-01-13 to 2021-09-01
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        998 non-null    object
 1   news_count  998 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB


In [58]:
agg_df.head()

Unnamed: 0_level_0,text,news_count
dates,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-13,[Nestle declines comment on Pfizer unit bid report],1
2012-01-27,[US FDA approves Pfizer\'s Inlyta for kidney cancer],1
2012-01-31,"[Generics take toll on Pfizer, Lilly profits, Pfizer trims 2012 view, citing stronger dollar]",2
2012-10-04,[Pfizer\'s Prevenar 13 vaccine meets trial goal],1
2012-10-05,[Pfizer to appeal India decision to revoke cancer drug patent],1


### Webscraping for stocks data using the Alpha Vantage API

In [59]:
def scrape_stocks(ticker):
    
    # pull API request
    url = ('https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol=' + ticker + 
           '&outputsize=full&apikey=<INSERT YOUR API KEY>')
    r = requests.get(url)
    data = r.json()
    
    # save data in panda dataframe
    df = pd.DataFrame(data["Time Series (Daily)"])
    df = df.T
    
    # grab only stocks data between 01 Jan 2011 and 31 August 2021
    df = df.loc['2021-08-31':'2012-01-10']
    
    # keep only relevant data columns (i.e. adjusted close price and volume)
    df = df.iloc[:, 4:6]
    
    # rename column names 
    df.rename(columns={"5. adjusted close": "close_price",
                       "6. volume": "volume" }, inplace=True)
    
    # change datatype from string to float
    for column in df.columns:
        df[column] = pd.to_numeric(df[column])
        
    # set the date index in standard format 
    df.index = pd.to_datetime(df.index,format="%Y-%m-%d")
    df.sort_index(inplace=True)
    df['date'] = pd.to_datetime(df.index,format="%Y-%m-%d")
    
    # create feature columns
    df['pct_px_change'] = (df['close_price'].pct_change()) * 100
    df['abs_pct_change'] = abs((df['close_price'].pct_change()) * 100)
        
    return df 

In [60]:
# scrape relevant stocks data
pfe = scrape_stocks('pfe')
sp500 = scrape_stocks('spy')
sp500.drop(columns=['abs_pct_change'], inplace=True)

In [61]:
pfe.head()

Unnamed: 0,close_price,volume,date,pct_px_change,abs_pct_change
2012-01-10,14.571357,27730000,2012-01-10,,
2012-01-11,14.544791,28764200,2012-01-11,-0.182315,0.182315
2012-01-12,14.604564,27358300,2012-01-12,0.410959,0.410959
2012-01-13,14.504942,29067200,2012-01-13,-0.682128,0.682128
2012-01-17,14.568036,35544000,2012-01-17,0.434982,0.434982


In [62]:
# write loop to create new variable to record price changes over 5 days and store info in new dataframe
pct_px_change_5d = []
abs_pct_px_change_5d = []
date = []
x = len(pfe) - 5

for i in range(0, x):
    d = pfe['date'][i]
    px = (pfe['close_price'][i+5] - pfe['close_price'][i]) / pfe['close_price'][i] * 100
    date.append(d)
    pct_px_change_5d.append(px)
    abs_pct_px_change_5d.append(abs(px))
        
df2 = pd.DataFrame({'date':date, '5d_change':pct_px_change_5d, '5d_abs':abs_pct_px_change_5d})

# merge both dataframes
merged_df = pfe.merge(df2, on="date")

In [63]:
# write loop to create new variable to record price changes over 5 days and store info in new dataframe
pct_px_change_5d = []
date = []
x = len(sp500) - 5

for i in range(0, x):
    d = sp500['date'][i]
    px = (sp500['close_price'][i+5] - sp500['close_price'][i]) / sp500['close_price'][i] * 100
    date.append(d)
    pct_px_change_5d.append(px)
        
df2 = pd.DataFrame({'date':date, 'sp500_5d_change':pct_px_change_5d})

# further merge dataframes
df2 = df2.merge(sp500, on='date')
df2.drop(columns=['close_price', 'volume'], inplace=True)
df2.rename(columns={"pct_px_change": "sp500_pct_px_change"}, inplace=True)
merged_df = merged_df.merge(df2, on="date")

In [64]:
merged_df["adjusted"] = merged_df['pct_px_change'] - merged_df['sp500_pct_px_change']
merged_df["adjusted_abs"] = abs(merged_df["adjusted"])
merged_df["adjusted_5d"] = merged_df['5d_change'] - merged_df['sp500_5d_change']
merged_df["adjusted_5d_abs"] = abs(merged_df["adjusted_5d"])

In [65]:
merged_df.head()

Unnamed: 0,close_price,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,sp500_5d_change,sp500_pct_px_change,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs
0,14.571357,27730000,2012-01-10,,,0.136737,0.136737,1.270038,,,,-1.133301,1.133301
1,14.544791,28764200,2012-01-11,-0.182315,0.182315,-0.273973,0.273973,1.749226,0.054209,-0.236524,0.236524,-2.023199,2.023199
2,14.604564,27358300,2012-01-12,0.410959,0.410959,-0.409277,0.409277,1.884024,0.239938,0.171021,0.171021,-2.293301,2.293301
3,14.504942,29067200,2012-01-13,-0.682128,0.682128,-0.595238,0.595238,2.149953,-0.517335,-0.164794,0.164794,-2.745192,2.745192
4,14.568036,35544000,2012-01-17,0.434982,0.434982,-1.253704,1.253704,1.639091,0.388078,0.046903,0.046903,-2.892795,2.892795


In [66]:
# create date column so that the news data can be merged with the stock ticker data
agg_df['date'] = pd.to_datetime(agg_df.index,format="%Y-%m-%d")

In [67]:
# merge news & stocks data
pfizer = merged_df.merge(agg_df, on="date")
pfizer.drop(columns=['close_price', 'sp500_5d_change', 'sp500_pct_px_change'], inplace=True)

In [68]:
pfizer = pfizer.dropna()

In [69]:
pfizer.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,29067200,2012-01-13,-0.682128,0.682128,-0.595238,0.595238,-0.164794,0.164794,-2.745192,2.745192,[Nestle declines comment on Pfizer unit bid report],1
1,132872800,2012-01-27,-0.693481,0.693481,-0.284616,0.284616,-0.647985,0.647985,-2.348036,2.348036,[US FDA approves Pfizer\'s Inlyta for kidney cancer],1
2,55526800,2012-01-31,-0.834106,0.834106,-0.62002,0.62002,-0.796045,0.796045,-3.26242,3.26242,"[Generics take toll on Pfizer, Lilly profits, Pfizer trims 2012 view, citing stronger dollar]",2
3,37375900,2012-02-07,0.477327,0.477327,1.330166,1.330166,0.224445,0.224445,1.033408,1.033408,"[India\'s Pfizer to spin-off animal healthcare business, Dealtalk: Nestle in lead to scoop up Pf...",3
4,25262900,2012-02-14,0.140845,0.140845,0.140647,0.140647,0.266436,0.266436,-0.480701,0.480701,"[Pfizer says its drug is best hope for Alzheimer\'s, Pfizer says its drug is best hope vs. Alzhe...",2


In [134]:
pfizer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 910 entries, 0 to 909
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           910 non-null    int64         
 1   date             910 non-null    datetime64[ns]
 2   pct_px_change    910 non-null    float64       
 3   abs_pct_change   910 non-null    float64       
 4   5d_change        910 non-null    float64       
 5   5d_abs           910 non-null    float64       
 6   adjusted         910 non-null    float64       
 7   adjusted_abs     910 non-null    float64       
 8   adjusted_5d      910 non-null    float64       
 9   adjusted_5d_abs  910 non-null    float64       
 10  text             910 non-null    object        
 11  news_count       910 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 92.4+ KB


### Repeat the same steps for the other companies' data

In [70]:
def clean_updates(text):
    text = re.sub(r'UPDATE 1-', '', text)
    text = re.sub(r'UPDATE 2-', '', text)
    text = re.sub(r'UPDATE 3-', '', text)
    text = re.sub(r'UPDATE 4-', '', text)
    text = re.sub(r'BRIEF-', '', text)
    return text

In [71]:
def clean_and_merge(news, company, ticker):
    
    news['headlines'] = news['headlines'].apply(clean_updates)
    news.drop_duplicates(subset=['headlines'], ignore_index=True, inplace=True)
    news.drop(columns=['links'], inplace=True)
    news['mentioned'] = news['headlines'].apply(lambda x: company_in_text(x, company))
    news = news[news['mentioned']==1]
    news_volume = news.groupby('dates').size()
    agg_text = news.groupby('dates')['headlines'].apply(list)
    agg_df = pd.DataFrame({'text':agg_text, 'news_count':news_volume})
    agg_df['date'] = pd.to_datetime(agg_df.index,format="%Y-%m-%d")
    
    df = scrape_stocks(ticker)
    pct_px_change_5d = []
    abs_pct_px_change_5d = []
    date = []
    x = len(df) - 5
    
    for i in range(0, x):
        d = df['date'][i]
        px = (df['close_price'][i+5] - df['close_price'][i]) / df['close_price'][i] * 100
        date.append(d)
        pct_px_change_5d.append(px)
        abs_pct_px_change_5d.append(abs(px))
        
    df2 = pd.DataFrame({'date':date, '5d_change':pct_px_change_5d, '5d_abs':abs_pct_px_change_5d})
    
    merged_df = df.merge(df2, on="date")
    data = merged_df.merge(agg_df, on="date")
    
    sp500 = scrape_stocks('spy')
    sp500.drop(columns=['abs_pct_change'], inplace=True)
    y = len(sp500) - 5
    pct_px_change_5d = []
    date = []
    
    for i in range(0, y):
        d = sp500['date'][i]
        px = (sp500['close_price'][i+5] - sp500['close_price'][i]) / sp500['close_price'][i] * 100
        date.append(d)
        pct_px_change_5d.append(px)
        
    df2 = pd.DataFrame({'date':date, 'sp500_5d_change':pct_px_change_5d})
    
    df2 = df2.merge(sp500, on='date')
    df2.drop(columns=['close_price', 'volume'], inplace=True)
    df2.rename(columns={"pct_px_change": "sp500_pct_px_change"}, inplace=True)
    merged_df = merged_df.merge(df2, on="date")
    
    merged_df["adjusted"] = merged_df['pct_px_change'] - merged_df['sp500_pct_px_change']
    merged_df["adjusted_abs"] = abs(merged_df["adjusted"])
    merged_df["adjusted_5d"] = merged_df['5d_change'] - merged_df['sp500_5d_change']
    merged_df["adjusted_5d_abs"] = abs(merged_df["adjusted_5d"])
    
    data = merged_df.merge(agg_df, on="date")
    data.drop(columns=['close_price', 'sp500_5d_change', 'sp500_pct_px_change'], inplace=True)
    
    return data

### Biogen [NASDAQ: BIIB]

In [75]:
df = pd.read_csv("../assets/scraped_news_biogen.csv")

In [76]:
biogen = clean_and_merge(df, "biogen", "biib")

In [77]:
biogen.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,2615600,2012-01-31,1.175159,1.175159,3.060619,3.060619,1.213219,1.213219,0.418219,0.418219,"[Biogen 4th quarter profit up; 2012 forecast falls short, Biogen profit up; 2012 forecast falls ...",2
1,1029400,2012-02-14,-0.25,0.25,-2.664996,2.664996,-0.124409,0.124409,-3.286344,3.286344,[Biogen to acquire developer of fibrosis treatments],1
2,2154300,2012-05-01,-1.626744,1.626744,0.834408,0.834408,-2.24875,2.24875,3.811529,3.811529,"[Biogen profit below Street view; outlook strong, Biogen Q1 profit falls short of Street view]",2
3,1416000,2012-05-09,-0.315956,0.315956,3.237492,3.237492,0.277234,0.277234,5.381296,5.381296,"[Biogen MS drug application accepted by U.S., EU regulators]",1
4,1313300,2012-07-24,0.179559,0.179559,4.552624,4.552624,1.038246,1.038246,1.730254,1.730254,[Biogen 2nd-quarter profit up on MS drug sales],1


In [133]:
biogen.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           300 non-null    int64         
 1   date             300 non-null    datetime64[ns]
 2   pct_px_change    300 non-null    float64       
 3   abs_pct_change   300 non-null    float64       
 4   5d_change        300 non-null    float64       
 5   5d_abs           300 non-null    float64       
 6   adjusted         300 non-null    float64       
 7   adjusted_abs     300 non-null    float64       
 8   adjusted_5d      300 non-null    float64       
 9   adjusted_5d_abs  300 non-null    float64       
 10  text             300 non-null    object        
 11  news_count       300 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 30.5+ KB


### Amgen [NASDAQ: AMGN]

In [78]:
df = pd.read_csv("../assets/scraped_news_amgen.csv")

In [79]:
amgen = clean_and_merge(df, "amgen", "amgn")

In [80]:
amgen.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,7036800,2012-01-26,-1.632712,1.632712,2.262045,2.262045,-1.119737,1.119737,1.655433,1.655433,[Amgen to buy Micromet for $1.2 billion],1
1,11863300,2012-02-06,-0.230947,0.230947,-0.810185,0.810185,-0.164052,0.164052,-1.487017,1.487017,[FDA staff unsure about new use for Amgen\'s Xgeva],1
2,10922600,2012-02-08,-1.604742,1.604742,0.291532,0.291532,-1.9015,1.9015,0.757542,0.757542,[FDA panel votes against wider use of Amgen drug],1
3,4508500,2012-04-02,0.205973,0.205973,-2.818969,2.818969,-0.525509,0.525509,1.368848,1.368848,"[Amgen, Astra to collaborate on several drugs]",1
4,4790100,2012-04-03,-0.264278,0.264278,-1.766524,1.766524,0.144633,0.144633,1.249191,1.249191,[AstraZeneca hunting more deals like Amgen tie-up],1


In [132]:
amgen.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 389 entries, 0 to 388
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           389 non-null    int64         
 1   date             389 non-null    datetime64[ns]
 2   pct_px_change    389 non-null    float64       
 3   abs_pct_change   389 non-null    float64       
 4   5d_change        389 non-null    float64       
 5   5d_abs           389 non-null    float64       
 6   adjusted         389 non-null    float64       
 7   adjusted_abs     389 non-null    float64       
 8   adjusted_5d      389 non-null    float64       
 9   adjusted_5d_abs  389 non-null    float64       
 10  text             389 non-null    object        
 11  news_count       389 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 39.5+ KB


### AbbVie [NYSE: ABBV]

In [90]:
df = pd.read_csv("../assets/scraped_news_abbvie.csv")

In [91]:
abbvie = clean_and_merge(df, "abbvie", "abbv")

In [92]:
abbvie.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,18800400,2013-01-09,0.563631,0.563631,6.076609,6.076609,0.309423,0.309423,5.302212,5.302212,[AbbVie sees no need for major acquisitions-CFO],1
1,10981600,2013-01-23,3.789127,3.789127,-2.301587,2.301587,3.628193,3.628193,-2.770222,2.770222,[Abbott results bode well for spun-off AbbVie],1
2,10841800,2013-01-30,-1.018494,1.018494,0.568643,0.568643,-0.626883,0.626883,-0.157684,0.157684,"[AbbVie forecasts slower Humira growth, AbbVie says interested in smaller \\""tuck in\\"" acquisit...",3
3,9378100,2013-04-23,2.126697,2.126697,2.01595,2.01595,1.095769,1.095769,0.811742,0.811742,"[AbbVie hepatitis C drugs knock out virus at 8 weeks, AbbVie hepatitis C drugs knock out virus a...",2
4,10096800,2013-04-26,3.616637,3.616637,-2.530541,2.530541,3.79327,3.79327,-4.508549,4.508549,"[AbbVie delivers on strong growth of Humira arthritis drug, AbbVie says expects Q2 EPS, excludin...",4


In [131]:
abbvie.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 344 entries, 0 to 343
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           344 non-null    int64         
 1   date             344 non-null    datetime64[ns]
 2   pct_px_change    344 non-null    float64       
 3   abs_pct_change   344 non-null    float64       
 4   5d_change        344 non-null    float64       
 5   5d_abs           344 non-null    float64       
 6   adjusted         344 non-null    float64       
 7   adjusted_abs     344 non-null    float64       
 8   adjusted_5d      344 non-null    float64       
 9   adjusted_5d_abs  344 non-null    float64       
 10  text             344 non-null    object        
 11  news_count       344 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 34.9+ KB


### Gilead [NASDAQ: GILD]

In [93]:
df = pd.read_csv("../assets/scraped_news_gilead.csv")

In [94]:
gilead = clean_and_merge(df, "gilead", "gild")

In [95]:
gilead.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,9324000,2012-03-06,-0.757084,0.757084,2.027027,2.027027,0.705439,0.705439,-1.913604,1.913604,[More patients relapse in Gilead hepatitis C trial],1
1,7975800,2012-03-07,-0.87184,0.87184,2.594547,2.594547,-1.569428,1.569428,-0.515483,0.515483,[Gilead Quad HIV drug causes fewer side effects],1
2,36016700,2012-04-19,12.100408,12.100408,0.899522,0.899522,12.742497,12.742497,-0.872189,0.872189,"[Gilead hepatitis C drug proves itself in key study, Gilead, Bristol hepatitis C trial data impr...",2
3,6678400,2012-04-26,0.285334,0.285334,-2.522762,2.522762,-0.411555,0.411555,-1.873504,1.873504,[Gilead profit narrowly misses Street view],1
4,8258300,2012-05-08,-0.881764,0.881764,4.872624,4.872624,-0.480596,0.480596,7.223412,7.223412,[FDA staff: Gilead\'s Truvada may help reduce HIV risk],1


In [130]:
gilead.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 387 entries, 0 to 386
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           387 non-null    int64         
 1   date             387 non-null    datetime64[ns]
 2   pct_px_change    387 non-null    float64       
 3   abs_pct_change   387 non-null    float64       
 4   5d_change        387 non-null    float64       
 5   5d_abs           387 non-null    float64       
 6   adjusted         387 non-null    float64       
 7   adjusted_abs     387 non-null    float64       
 8   adjusted_5d      387 non-null    float64       
 9   adjusted_5d_abs  387 non-null    float64       
 10  text             387 non-null    object        
 11  news_count       387 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 39.3+ KB


### Merck [NYSE: MRK]

In [103]:
df = pd.read_csv("../assets/scraped_news_merck.csv")

In [104]:
merck = clean_and_merge(df, "merck", "mrk")

In [105]:
merck.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,11366200,2012-01-10,,,0.908619,0.908619,,,-0.361419,0.361419,[FDA says Merck\'s HIV drug gets generic threat],1
1,16546800,2012-01-19,1.003344,1.003344,-1.222618,1.222618,0.475701,0.475701,-1.542107,1.542107,[Merck resolves Vioxx litigation in Canada],1
2,14475200,2012-01-25,-0.257865,0.257865,-0.129266,0.129266,-1.094621,1.094621,-0.061372,0.061372,"[FDA label shows Merck cholesterol drug helps heart, Merck KGaA get EU approval for wider use of...",3
3,11032300,2012-02-06,0.078186,0.078186,-0.755208,0.755208,0.145081,0.145081,-1.43204,1.43204,[Merck to file application this year for insomnia drug],1
4,9597700,2012-02-09,-0.702759,0.702759,-0.026212,0.026212,-0.828508,0.828508,-0.535964,0.535964,[Merck says hepatitis pill hampers HIV Drugs],1


In [128]:
merck.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 757 entries, 0 to 756
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           757 non-null    int64         
 1   date             757 non-null    datetime64[ns]
 2   pct_px_change    756 non-null    float64       
 3   abs_pct_change   756 non-null    float64       
 4   5d_change        757 non-null    float64       
 5   5d_abs           757 non-null    float64       
 6   adjusted         756 non-null    float64       
 7   adjusted_abs     756 non-null    float64       
 8   adjusted_5d      757 non-null    float64       
 9   adjusted_5d_abs  757 non-null    float64       
 10  text             757 non-null    object        
 11  news_count       757 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 76.9+ KB


### Eli Lilly [NYSE: LLY]

In [106]:
df = pd.read_csv("../assets/scraped_news_lilly.csv")

In [107]:
lilly = clean_and_merge(df, "lilly", "lly")

In [108]:
lilly.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,13418400,2012-01-31,1.248408,1.248408,-0.603926,0.603926,1.286468,1.286468,-3.246326,3.246326,"[Generics take toll on Pfizer, Lilly profits, Eli Lilly profit drops sharply on Zyprexa generics]",2
1,7765400,2012-02-01,0.150981,0.150981,-0.050251,0.050251,-0.724742,0.724742,-2.103546,2.103546,[Lilly Alzheimer\'s drug an unlikely ace in the hole],1
2,10405800,2012-04-16,1.071975,1.071975,0.530303,0.530303,1.137602,1.137602,0.720015,0.720015,[Vanda buys Lilly\'s experimental drug],1
3,8205700,2012-04-25,2.102102,2.102102,1.666667,1.666667,0.732937,0.732937,0.854827,0.854827,"[Lilly profit beats forecast; focus on new drug data, Lilly sales, profit beat forecasts]",2
4,5050800,2012-05-03,-0.506268,0.506268,1.163072,1.163072,0.256275,0.256275,3.482642,3.482642,[Pair arrested for $70 million heist of Lilly drugs],1


In [129]:
lilly.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 475 entries, 0 to 474
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           475 non-null    int64         
 1   date             475 non-null    datetime64[ns]
 2   pct_px_change    475 non-null    float64       
 3   abs_pct_change   475 non-null    float64       
 4   5d_change        475 non-null    float64       
 5   5d_abs           475 non-null    float64       
 6   adjusted         475 non-null    float64       
 7   adjusted_abs     475 non-null    float64       
 8   adjusted_5d      475 non-null    float64       
 9   adjusted_5d_abs  475 non-null    float64       
 10  text             475 non-null    object        
 11  news_count       475 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 48.2+ KB


### Regeneron Pharmaceuticals [NASDAQ: REGN]

In [121]:
df = pd.read_csv("../assets/scraped_news_regeneron.csv")

In [122]:
regeneron = clean_and_merge(df, "regeneron", "regn")

In [123]:
regeneron.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,2335500,2012-01-12,1.163392,1.163392,0.089445,0.089445,0.923454,0.923454,-1.794579,1.794579,[CORRECTED-Regeneron posts \'11 Eylea sales of $24-25 mln],1
1,4161600,2012-02-13,12.313871,12.313871,-13.423463,13.423463,11.569602,11.569602,-14.243498,14.243498,"[Regeneron raises 2012 Eylea sales forecast, shares jump]",1
2,660700,2012-03-13,1.743078,1.743078,3.917195,3.917195,-0.05951,0.05951,3.20714,3.20714,[Regeneron says eye drug capturing Roche patients],1
3,1284400,2012-03-26,0.654472,0.654472,0.200067,0.200067,-0.749037,0.749037,0.037649,0.037649,[Regeneron cholesterol drug faces sudden challenge],1
4,3220300,2012-04-26,9.039415,9.039415,-3.376623,3.376623,8.342526,8.342526,-2.727365,2.727365,[Regeneron nearly doubles Eylea sales forecast],1


In [127]:
regeneron.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 241 entries, 0 to 240
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           241 non-null    int64         
 1   date             241 non-null    datetime64[ns]
 2   pct_px_change    241 non-null    float64       
 3   abs_pct_change   241 non-null    float64       
 4   5d_change        241 non-null    float64       
 5   5d_abs           241 non-null    float64       
 6   adjusted         241 non-null    float64       
 7   adjusted_abs     241 non-null    float64       
 8   adjusted_5d      241 non-null    float64       
 9   adjusted_5d_abs  241 non-null    float64       
 10  text             241 non-null    object        
 11  news_count       241 non-null    int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 24.5+ KB


### Merge dataframes

In [124]:
merged = pd.concat([pfizer, biogen, amgen, abbvie, gilead, lilly, merck, regeneron])

In [125]:
merged.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,29067200,2012-01-13,-0.682128,0.682128,-0.595238,0.595238,-0.164794,0.164794,-2.745192,2.745192,[Nestle declines comment on Pfizer unit bid report],1
1,132872800,2012-01-27,-0.693481,0.693481,-0.284616,0.284616,-0.647985,0.647985,-2.348036,2.348036,[US FDA approves Pfizer\'s Inlyta for kidney cancer],1
2,55526800,2012-01-31,-0.834106,0.834106,-0.62002,0.62002,-0.796045,0.796045,-3.26242,3.26242,"[Generics take toll on Pfizer, Lilly profits, Pfizer trims 2012 view, citing stronger dollar]",2
3,37375900,2012-02-07,0.477327,0.477327,1.330166,1.330166,0.224445,0.224445,1.033408,1.033408,"[India\'s Pfizer to spin-off animal healthcare business, Dealtalk: Nestle in lead to scoop up Pf...",3
4,25262900,2012-02-14,0.140845,0.140845,0.140647,0.140647,0.266436,0.266436,-0.480701,0.480701,"[Pfizer says its drug is best hope for Alzheimer\'s, Pfizer says its drug is best hope vs. Alzhe...",2


In [126]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3803 entries, 0 to 240
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           3803 non-null   int64         
 1   date             3803 non-null   datetime64[ns]
 2   pct_px_change    3802 non-null   float64       
 3   abs_pct_change   3802 non-null   float64       
 4   5d_change        3803 non-null   float64       
 5   5d_abs           3803 non-null   float64       
 6   adjusted         3802 non-null   float64       
 7   adjusted_abs     3802 non-null   float64       
 8   adjusted_5d      3803 non-null   float64       
 9   adjusted_5d_abs  3803 non-null   float64       
 10  text             3803 non-null   object        
 11  news_count       3803 non-null   int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 386.2+ KB


In [135]:
# export merged dataset to csv 
merged.to_csv("../assets/merged_headlines.csv", index=False)