<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Predicting Stock Price Changes based on News

### Import Libraries

In [1]:
import pandas as pd
import datetime
import requests
import re
import time
import numpy as np
from bs4 import BeautifulSoup

pd.set_option('display.max.columns', None)
pd.set_option('display.max.colwidth', 100)

### Basic Data Cleaning and Aggregation by Unique Dates

In [2]:
# import scraped data & retain only news text
pfizer = pd.read_csv("../assets/scraped_news.csv")
pfizer.drop(columns=['links'], inplace=True)

In [3]:
pfizer.head()

Unnamed: 0,dates,headlines,article_content
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,"ZURICH, Jan 13 (Reuters) - Nestle, the world’s biggest food group, declined to comment on a rep..."
1,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney cancer,WASHINGTON (Reuters) - Pfizer’s Inlyta drug for patients with advanced kidney cancer won approv...
2,2012-1-31,"Generics take toll on Pfizer, Lilly profits",(Reuters) - Competition from low-cost generic drugs squeezed quarterly profits at Pfizer Inc PF...
3,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar","(Reuters) - Pfizer Inc PFE.N reported sharply lower quarterly earnings, hurt by generic forms o..."
4,2012-10-04,Trial suggests Prevnar may also protect ages 18-49,(Reuters) - Pfizer Inc said a late-stage trial of its vaccine to protect against pneumococcal b...


In [4]:
# create function to remove Country, Date and (Reuters) tags 
def remove_artcicle_tags(text):
    for t in text:
        t = re.findall(r'^.*\(\w+\) - (.*)', text)
    return t

In [5]:
# apply above function and print new df.head()
pfizer['article_content'] = pfizer['article_content'].apply(remove_artcicle_tags)
print(pfizer.head())

        dates                                           headlines  \
0   2012-1-13   Nestle declines comment on Pfizer unit bid report   
1   2012-1-27  US FDA approves Pfizer\'s Inlyta for kidney cancer   
2   2012-1-31         Generics take toll on Pfizer, Lilly profits   
3   2012-1-31      Pfizer trims 2012 view, citing stronger dollar   
4  2012-10-04  Trial suggests Prevnar may also protect ages 18-49   

                                                                                       article_content  
0  [Nestle, the world’s biggest food group, declined to comment on a report it had placed a bid to ...  
1  [Pfizer’s Inlyta drug for patients with advanced kidney cancer won approval from U.S. regulators...  
2  [Competition from low-cost generic drugs squeezed quarterly profits at Pfizer Inc PFE.N and Eli ...  
3  [Pfizer Inc PFE.N reported sharply lower quarterly earnings, hurt by generic forms of its Lipito...  
4  [Pfizer Inc said a late-stage trial of its vaccine to pro

In [6]:
# create new column to concatenate all text info
pfizer['news_text'] = pfizer['headlines'] + " " + pfizer['article_content'].astype(str)

In [7]:
# print sample to check that the concantenation worked 
pfizer['news_text'][0]

"Nestle declines comment on Pfizer unit bid report ['Nestle, the world’s biggest food group, declined to comment on a report it had placed a bid to buy Pfizer Inc’s baby formula business.“We never comment on market rumours,” a spokeswoman for the firm said.Bloomberg, citing people familiar with knowledge of the matter, reported that Nestle as well as France’s Danone SA had put in an offer to buy the unit.']"

In [8]:
# create a function to check if company is mentioned in the news report at all
def company_in_text(text, company):
    if company in text.lower():
        return 1
    else:
        return 0

In [9]:
# apply function to create feature column
pfizer['mentioned'] = pfizer['news_text'].apply(lambda x: company_in_text(x, "pfizer"))

In [10]:
# show sample articles where the company was not mentioned at all
pfizer[pfizer['mentioned']==0].tail()

Unnamed: 0,dates,headlines,article_content,news_text,mentioned
6024,2021-8-25,Delta Air Lines wields Covid vaccine stick,[],Delta Air Lines wields Covid vaccine stick [],0
6036,2021-8-26,Fact Check-Video does not prove COVID-19 vaccines cause blood anomalies\xc2\xa0,[],Fact Check-Video does not prove COVID-19 vaccines cause blood anomalies\xc2\xa0 [],0
6043,2021-8-27,Fact Check-Meme comparing products it says were approved by FDA is missing context,[],Fact Check-Meme comparing products it says were approved by FDA is missing context [],0
6071,2021-8-30,"Fact Check-Cigarettes are FDA regulated, not FDA approved",[],"Fact Check-Cigarettes are FDA regulated, not FDA approved []",0
6094,2021-8-31,COVID-19 cases rise in Australia\'s Victoria as lockdown extension looms,[Australian authorities on Wednesday extended the COVID-19 lockdown in Melbourne for another thr...,COVID-19 cases rise in Australia\'s Victoria as lockdown extension looms ['Australian authoritie...,0


In [11]:
# print an example of the text
print(pfizer['news_text'][6094])

COVID-19 cases rise in Australia\'s Victoria as lockdown extension looms ['Australian authorities on Wednesday extended the COVID-19 lockdown in Melbourne for another three weeks, as they shift their focus to rapid vaccination drives and move away from a suppression strategy to bring cases down to zero.Victorian Premier Daniel Andrews flagged a staggered easing of the tough restrictions once 70% of the state\'s adult residents receive at least one dose, a milestone he hopes to reach at least by Sept. 23, based on current vaccination rates."We have thrown everything at this, but it is now clear to us that we are not going to drive these numbers down, they are instead going to increase," Andrews told reporters in Melbourne, the state capital, after a lockdown for nearly a month failed to quell the outbreak. The lockdown was due to end on Thursday."We got to buy time to allow vaccinations to be undertaken all the while doing this very hard work, this very painful and difficult work, to ke

In [12]:
# we remove articles with no mention of the company in our dataset
pfizer = pfizer[pfizer['mentioned']==1]

In [13]:
pfizer.head()

Unnamed: 0,dates,headlines,article_content,news_text,mentioned
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,"[Nestle, the world’s biggest food group, declined to comment on a report it had placed a bid to ...","Nestle declines comment on Pfizer unit bid report ['Nestle, the world’s biggest food group, decl...",1
1,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney cancer,[Pfizer’s Inlyta drug for patients with advanced kidney cancer won approval from U.S. regulators...,US FDA approves Pfizer\'s Inlyta for kidney cancer ['Pfizer’s Inlyta drug for patients with adva...,1
2,2012-1-31,"Generics take toll on Pfizer, Lilly profits",[Competition from low-cost generic drugs squeezed quarterly profits at Pfizer Inc PFE.N and Eli ...,"Generics take toll on Pfizer, Lilly profits ['Competition from low-cost generic drugs squeezed q...",1
3,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar","[Pfizer Inc PFE.N reported sharply lower quarterly earnings, hurt by generic forms of its Lipito...","Pfizer trims 2012 view, citing stronger dollar ['Pfizer Inc PFE.N reported sharply lower quarter...",1
4,2012-10-04,Trial suggests Prevnar may also protect ages 18-49,[Pfizer Inc said a late-stage trial of its vaccine to protect against pneumococcal bacteria sugg...,Trial suggests Prevnar may also protect ages 18-49 ['Pfizer Inc said a late-stage trial of its v...,1


In [14]:
# count number of news each day
news_volume = pfizer.groupby('dates').size()
news_volume.head()

dates
2012-1-13     1
2012-1-27     1
2012-1-31     2
2012-10-04    2
2012-10-05    1
dtype: int64

In [15]:
# group headlines together such that each day will only be recorded as one observation
agg_text = pfizer.groupby('dates')['news_text'].apply(list)
agg_df = pd.DataFrame({'text':agg_text, 'news_count':news_volume})
    
# set the date index in a particular format 
agg_df.index = pd.to_datetime(agg_df.index,format="%Y-%m-%d")

In [16]:
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1079 entries, 2012-01-13 to 2021-09-01
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        1079 non-null   object
 1   news_count  1079 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 25.3+ KB


In [17]:
agg_df.head()

Unnamed: 0_level_0,text,news_count
dates,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-13,"[Nestle declines comment on Pfizer unit bid report ['Nestle, the world’s biggest food group, dec...",1
2012-01-27,[US FDA approves Pfizer\'s Inlyta for kidney cancer ['Pfizer’s Inlyta drug for patients with adv...,1
2012-01-31,"[Generics take toll on Pfizer, Lilly profits ['Competition from low-cost generic drugs squeezed ...",2
2012-10-04,[Trial suggests Prevnar may also protect ages 18-49 ['Pfizer Inc said a late-stage trial of its ...,2
2012-10-05,"[Pfizer to appeal India decision to revoke cancer drug patent ['Pfizer Ltd, the India unit of U....",1


### Webscraping for stocks data using the Alpha Vantage API

In [18]:
def scrape_stocks(ticker):
    
    # pull API request
    url = ('https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol=' + ticker + 
           '&outputsize=full&apikey=4ALQKUAMC2QCOZW9')
    r = requests.get(url)
    data = r.json()
    
    # save data in panda dataframe
    df = pd.DataFrame(data["Time Series (Daily)"])
    df = df.T
    
    # grab only stocks data between 01 Jan 2011 and 31 August 2021
    df = df.loc['2021-08-31':'2012-01-10']
    
    # keep only relevant data columns (i.e. adjusted close price and volume)
    df = df.iloc[:, 4:6]
    
    # rename column names 
    df.rename(columns={"5. adjusted close": "close_price",
                       "6. volume": "volume" }, inplace=True)
    
    # change datatype from string to float
    for column in df.columns:
        df[column] = pd.to_numeric(df[column])
        
    # set the date index in standard format 
    df.index = pd.to_datetime(df.index,format="%Y-%m-%d")
    df.sort_index(inplace=True)
    df['date'] = pd.to_datetime(df.index,format="%Y-%m-%d")
    
    # create feature columns
    df['pct_px_change'] = (df['close_price'].pct_change()) * 100
    df['abs_pct_change'] = abs((df['close_price'].pct_change()) * 100)
        
    return df 

In [19]:
# scrape relevant stocks data
pfe = scrape_stocks('pfe')
sp500 = scrape_stocks('spy')
sp500.drop(columns=['abs_pct_change'], inplace=True)

In [20]:
pfe.head()

Unnamed: 0,close_price,volume,date,pct_px_change,abs_pct_change
2012-01-10,14.571357,27730000,2012-01-10,,
2012-01-11,14.544791,28764200,2012-01-11,-0.182315,0.182315
2012-01-12,14.604564,27358300,2012-01-12,0.410959,0.410959
2012-01-13,14.504942,29067200,2012-01-13,-0.682128,0.682128
2012-01-17,14.568036,35544000,2012-01-17,0.434982,0.434982


In [21]:
# write loop to create new variable to record price changes over 5 days and store info in new dataframe
pct_px_change_5d = []
abs_pct_px_change_5d = []
date = []
x = len(pfe) - 5

for i in range(0, x):
    d = pfe['date'][i]
    px = (pfe['close_price'][i+5] - pfe['close_price'][i]) / pfe['close_price'][i] * 100
    date.append(d)
    pct_px_change_5d.append(px)
    abs_pct_px_change_5d.append(abs(px))
        
df2 = pd.DataFrame({'date':date, '5d_change':pct_px_change_5d, '5d_abs':abs_pct_px_change_5d})

# merge both dataframes
merged_df = pfe.merge(df2, on="date")

In [22]:
# write loop to create new variable to record price changes over 5 days and store info in new dataframe
pct_px_change_5d = []
date = []
x = len(sp500) - 5

for i in range(0, x):
    d = sp500['date'][i]
    px = (sp500['close_price'][i+5] - sp500['close_price'][i]) / sp500['close_price'][i] * 100
    date.append(d)
    pct_px_change_5d.append(px)
        
df2 = pd.DataFrame({'date':date, 'sp500_5d_change':pct_px_change_5d})

# further merge dataframes
df2 = df2.merge(sp500, on='date')
df2.drop(columns=['close_price', 'volume'], inplace=True)
df2.rename(columns={"pct_px_change": "sp500_pct_px_change"}, inplace=True)
merged_df = merged_df.merge(df2, on="date")

In [23]:
merged_df["adjusted"] = merged_df['pct_px_change'] - merged_df['sp500_pct_px_change']
merged_df["adjusted_abs"] = abs(merged_df["adjusted"])
merged_df["adjusted_5d"] = merged_df['5d_change'] - merged_df['sp500_5d_change']
merged_df["adjusted_5d_abs"] = abs(merged_df["adjusted_5d"])

In [24]:
merged_df.head()

Unnamed: 0,close_price,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,sp500_5d_change,sp500_pct_px_change,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs
0,14.571357,27730000,2012-01-10,,,0.136737,0.136737,1.270038,,,,-1.133301,1.133301
1,14.544791,28764200,2012-01-11,-0.182315,0.182315,-0.273973,0.273973,1.749226,0.054209,-0.236524,0.236524,-2.023199,2.023199
2,14.604564,27358300,2012-01-12,0.410959,0.410959,-0.409277,0.409277,1.884024,0.239938,0.171021,0.171021,-2.293301,2.293301
3,14.504942,29067200,2012-01-13,-0.682128,0.682128,-0.595238,0.595238,2.149953,-0.517335,-0.164794,0.164794,-2.745192,2.745192
4,14.568036,35544000,2012-01-17,0.434982,0.434982,-1.253704,1.253704,1.639091,0.388078,0.046903,0.046903,-2.892795,2.892795


In [25]:
# create date column so that the news data can be merged with the stock ticker data
agg_df['date'] = pd.to_datetime(agg_df.index,format="%Y-%m-%d")

In [26]:
# merge news & stocks data
pfizer = merged_df.merge(agg_df, on="date")
pfizer.drop(columns=['close_price', 'sp500_5d_change', 'sp500_pct_px_change'], inplace=True)

In [27]:
pfizer = pfizer.dropna()

In [28]:
pfizer.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,29067200,2012-01-13,-0.682128,0.682128,-0.595238,0.595238,-0.164794,0.164794,-2.745192,2.745192,"[Nestle declines comment on Pfizer unit bid report ['Nestle, the world’s biggest food group, dec...",1
1,132872800,2012-01-27,-0.693481,0.693481,-0.284616,0.284616,-0.647985,0.647985,-2.348036,2.348036,[US FDA approves Pfizer\'s Inlyta for kidney cancer ['Pfizer’s Inlyta drug for patients with adv...,1
2,55526800,2012-01-31,-0.834106,0.834106,-0.62002,0.62002,-0.796045,0.796045,-3.26242,3.26242,"[Generics take toll on Pfizer, Lilly profits ['Competition from low-cost generic drugs squeezed ...",2
3,37375900,2012-02-07,0.477327,0.477327,1.330166,1.330166,0.224445,0.224445,1.033408,1.033408,[India\'s Pfizer to spin-off animal healthcare business ['India’s Pfizer Ltd. said its board has...,3
4,25262900,2012-02-14,0.140845,0.140845,0.140647,0.140647,0.266436,0.266436,-0.480701,0.480701,[Pfizer says its drug is best hope for Alzheimer\'s ['Pfizer Inc research chief Mikael Dolsten s...,2


### Repeat the same steps for the other companies' data

In [29]:
def clean_updates(text):
    text = re.sub(r'UPDATE 1-', '', text)
    text = re.sub(r'UPDATE 2-', '', text)
    text = re.sub(r'UPDATE 3-', '', text)
    text = re.sub(r'UPDATE 4-', '', text)
    text = re.sub(r'BRIEF-', '', text)
    return text

In [30]:
def clean_and_merge(news, company, ticker):
    
    news['headlines'] = news['headlines'].apply(clean_updates)
    news.drop_duplicates(subset=['headlines'], ignore_index=True, inplace=True)
    news.drop(columns=['links'], inplace=True)
    news['article_content'] = news['article_content'].apply(remove_artcicle_tags)
    news ['news_text'] = news ['headlines'] + " " + news ['article_content'].astype(str)
    news['mentioned'] = news['news_text'].apply(lambda x: company_in_text(x, company))
    news = news[news['mentioned']==1]
    news_volume = news.groupby('dates').size()
    agg_text = news.groupby('dates')['news_text'].apply(list)
    agg_df = pd.DataFrame({'text':agg_text, 'news_count':news_volume})
    agg_df['date'] = pd.to_datetime(agg_df.index,format="%Y-%m-%d")
    
    df = scrape_stocks(ticker)
    pct_px_change_5d = []
    abs_pct_px_change_5d = []
    date = []
    x = len(df) - 5
    
    for i in range(0, x):
        d = df['date'][i]
        px = (df['close_price'][i+5] - df['close_price'][i]) / df['close_price'][i] * 100
        date.append(d)
        pct_px_change_5d.append(px)
        abs_pct_px_change_5d.append(abs(px))
        
    df2 = pd.DataFrame({'date':date, '5d_change':pct_px_change_5d, '5d_abs':abs_pct_px_change_5d})
    
    merged_df = df.merge(df2, on="date")
    data = merged_df.merge(agg_df, on="date")
    
    sp500 = scrape_stocks('spy')
    sp500.drop(columns=['abs_pct_change'], inplace=True)
    y = len(sp500) - 5
    pct_px_change_5d = []
    date = []
    
    for i in range(0, y):
        d = sp500['date'][i]
        px = (sp500['close_price'][i+5] - sp500['close_price'][i]) / sp500['close_price'][i] * 100
        date.append(d)
        pct_px_change_5d.append(px)
        
    df2 = pd.DataFrame({'date':date, 'sp500_5d_change':pct_px_change_5d})
    
    df2 = df2.merge(sp500, on='date')
    df2.drop(columns=['close_price', 'volume'], inplace=True)
    df2.rename(columns={"pct_px_change": "sp500_pct_px_change"}, inplace=True)
    merged_df = merged_df.merge(df2, on="date")
    
    merged_df["adjusted"] = merged_df['pct_px_change'] - merged_df['sp500_pct_px_change']
    merged_df["adjusted_abs"] = abs(merged_df["adjusted"])
    merged_df["adjusted_5d"] = merged_df['5d_change'] - merged_df['sp500_5d_change']
    merged_df["adjusted_5d_abs"] = abs(merged_df["adjusted_5d"])
    
    data = merged_df.merge(agg_df, on="date")
    data.drop(columns=['close_price', 'sp500_5d_change', 'sp500_pct_px_change'], inplace=True)
    
    return data

### Biogen [NASDAQ: BIIB]

In [31]:
df = pd.read_csv("../assets/scraped_news_biogen.csv")

In [32]:
biogen = clean_and_merge(df, "biogen", "biib")

In [33]:
biogen.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,1404800,2012-01-13,-0.232158,0.232158,0.241317,0.241317,0.285176,0.285176,-1.908637,1.908637,[Cancer drug gets stronger label warning ['Seattle Genetics Inc said it found a second instance ...,1
1,3396400,2012-01-20,1.164583,1.164583,0.35551,0.35551,0.791846,0.791846,0.454033,0.454033,"[Falcone eyes mobile asset sales, Icahn invests ['Hedge fund manager Philip Falcone is looking a...",3
2,1551600,2012-01-27,0.602461,0.602461,2.80027,2.80027,0.647957,0.647957,0.73685,0.73685,"[Oshkosh, Icahn to square off at director vote ['Oshkosh Corp OSK.N and activist investor Carl I...",1
3,2615600,2012-01-31,1.175159,1.175159,3.060619,3.060619,1.213219,1.213219,0.418219,0.418219,"[Biogen 4th quarter profit up; 2012 forecast falls short [""Biogen Idec BIIB.O is spending heavil...",2
4,1029400,2012-02-14,-0.25,0.25,-2.664996,2.664996,-0.124409,0.124409,-3.286344,3.286344,[Biogen to acquire developer of fibrosis treatments ['Biogen Idec Inc BIIB.O has agreed to acqui...,1


### Amgen [NASDAQ: AMGN]

In [34]:
df = pd.read_csv("../assets/scraped_news_amgen.csv")

In [35]:
amgen = clean_and_merge(df, "amgen", "amgn")

In [36]:
amgen.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,6150500,2012-01-10,,,5.405817,5.405817,,,4.135779,4.135779,"[New DNA reader to bring promise [""A new DNA reader could bring genetics to medical clinics.Afte...",2
1,5996400,2012-01-13,0.088836,0.088836,0.266272,0.266272,0.606171,0.606171,-1.883681,1.883681,[Analysis: Investors likely to wait out 2012 drug launches ['Burned by disappointing early sales...,1
2,5700100,2012-01-19,-0.115574,0.115574,-1.533121,1.533121,-0.643217,0.643217,-1.85261,1.85261,[Swiss stocks - Factors to watch on Jan 19 ['Swiss shares were poised to open higher on Thursday...,1
3,6839100,2012-01-20,0.621927,0.621927,-1.768003,1.768003,0.24919,0.24919,-1.669481,1.669481,[EU agency issues guideline on biosimilar MS drugs ['European regulators took another step towar...,1
4,6436400,2012-01-25,1.495821,1.495821,0.910273,0.910273,0.659064,0.659064,0.978167,0.978167,[Roche in $5.7 billion bid for gene decoder Illumina ['Swiss drugmaker Roche Holding AG ROG.VX h...,1


### AbbVie [NYSE: ABBV]

In [37]:
df = pd.read_csv("../assets/scraped_news_abbvie.csv")

In [38]:
abbvie = clean_and_merge(df, "abbvie", "abbv")

In [39]:
abbvie.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,18800400,2013-01-09,0.563631,0.563631,6.076609,6.076609,0.309423,0.309423,5.302212,5.302212,"[AbbVie sees no need for major acquisitions-CFO ['AbbVie Inc, the pharmaceutical company spun of...",1
1,13040200,2013-01-15,1.49604,1.49604,9.248555,9.248555,1.427999,1.427999,7.684674,7.684674,[Analysis: Drug industry bets on new blockbusters in 2013 ['Drugmakers are betting that a new wa...,1
2,10981600,2013-01-23,3.789127,3.789127,-2.301587,2.301587,3.628193,3.628193,-2.770222,2.770222,"[Abbott results bode well for spun-off AbbVie ['AbbVie Inc, the new branded pharmaceuticals comp...",1
3,10841800,2013-01-30,-1.018494,1.018494,0.568643,0.568643,-0.626883,0.626883,-0.157684,0.157684,"[AbbVie forecasts slower Humira growth ['AbbVie ABBV.N, the pharmaceuticals business spun off ea...",4
4,7284400,2013-02-05,-0.214592,0.214592,-4.784946,4.784946,-1.224356,1.224356,-5.427118,5.427118,"[Senate to mull ban on \\""pay for delay\\"" pharmaceutical deals ['Key Democratic and Republican ...",2


### Gilead [NASDAQ: GILD]

In [40]:
df = pd.read_csv("../assets/scraped_news_gilead.csv")

In [41]:
gilead = clean_and_merge(df, "gilead", "gild")

In [42]:
gilead.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,15055000,2012-01-10,,,7.367232,7.367232,,,6.097194,6.097194,"[Pharma pays up for scarce assets ['When it comes to healthcare deals, the new motto may be “too...",1
1,7038000,2012-01-20,0.487185,0.487185,2.698145,2.698145,0.114448,0.114448,2.796667,2.796667,"[FDA lifts hold on Insmed\'s lung disease drug [""Insmed Inc INSM.O said U.S. health regulators l...",1
2,37671600,2012-02-03,10.920706,10.920706,-1.727763,1.727763,9.518837,9.518837,-1.593974,1.593974,"[Nasdaq vaults to 11-year high on surge in jobs [""A surge in hiring in the world’s largest econo...",1
3,7321500,2012-02-14,-0.628415,0.628415,-18.375951,18.375951,-0.502824,0.502824,-18.997299,18.997299,"[Duquesne alumni reveal holdings at Point State [""Point State Capital, one of last year's most c...",2
4,6095400,2012-02-15,0.485748,0.485748,-17.639548,17.639548,0.951759,0.951759,-19.177895,19.177895,[TV network aims for new viewing audience: dogs ['Nielsen isn’t tracking the network’s ratings a...,1


### Merck [NYSE: MRK]

In [43]:
df = pd.read_csv("../assets/scraped_news_merck.csv")

In [44]:
merck = clean_and_merge(df, "merck", "mrk")

In [45]:
merck.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,11366200,2012-01-10,,,0.908619,0.908619,,,-0.361419,0.361419,[FDA says Merck\'s HIV drug gets generic threat ['Health regulators said they have received appl...,2
1,11945800,2012-01-11,-0.285566,0.285566,2.212965,2.212965,-0.339775,0.339775,0.463739,0.463739,[German stocks - Factors to watch on January 11 ['The following are some of the factors that may...,2
2,9677400,2012-01-13,-0.751101,0.751101,1.174322,1.174322,-0.233766,0.233766,-0.975632,0.975632,"[China seeks to unlock secrets of herbs, roots [""Chinese legends have long extolled the benefits...",2
3,16546800,2012-01-19,1.003344,1.003344,-1.222618,1.222618,0.475701,0.475701,-1.542107,1.542107,[Merck resolves Vioxx litigation in Canada ['Merck & Co MRK.N on Thursday said it had reached a ...,1
4,15333900,2012-01-20,-0.152827,0.152827,-1.734694,1.734694,-0.525564,0.525564,-1.636172,1.636172,"[Achillion shares fall on concerns over hep C drug\'s future [""Achillion Pharmaceuticals' ACHN.O...",2


### Eli Lilly [NYSE: LLY]

In [46]:
df = pd.read_csv("../assets/scraped_news_lilly.csv")

In [47]:
lilly = clean_and_merge(df, "lilly", "lly")

In [48]:
lilly.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,5085900,2012-01-11,-0.86784,0.86784,0.475238,0.475238,-0.922049,0.922049,-1.273988,1.273988,[Antibiotics for livestock vital to feed world -OIE ['The world body in charge of fighting anima...,3
1,4420100,2012-01-13,-0.597312,0.597312,-1.352028,1.352028,-0.079978,0.079978,-3.501981,3.501981,[Analysis: Investors likely to wait out 2012 drug launches ['Burned by disappointing early sales...,1
2,5899600,2012-01-17,0.225338,0.225338,-1.049213,1.049213,-0.16274,0.16274,-2.688304,2.688304,[China vaccine maker joins Aeras in fight against TB ['China’s top vaccine maker is teaming up w...,1
3,5173700,2012-01-18,0.374719,0.374719,-0.721752,0.721752,-0.730894,0.730894,-2.090568,2.090568,[How to play it: Finding value in healthcare ['It’s the sector that can’t seem to catch a break....,1
4,9629000,2012-01-23,-1.029892,1.029892,-0.380711,0.380711,-0.772219,0.772219,-0.198354,0.198354,"[FDA extends review of Alexza product, shares fall ['Alexza Pharmaceuticals Inc said U.S. health...",1


### Regeneron Pharmaceuticals [NASDAQ: REGN]

In [49]:
df = pd.read_csv("../assets/scraped_news_regeneron.csv")

In [50]:
regeneron = clean_and_merge(df, "regeneron", "regn")

In [51]:
regeneron.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,2335500,2012-01-12,1.163392,1.163392,0.089445,0.089445,0.923454,0.923454,-1.794579,1.794579,"[CORRECTED-Regeneron posts \'11 Eylea sales of $24-25 mln ['Sales of Eylea, the eye drug launche...",1
1,1818000,2012-01-13,0.843343,0.843343,-0.468829,0.468829,1.360677,1.360677,-2.618783,2.618783,[Analysis: Investors likely to wait out 2012 drug launches ['Burned by disappointing early sales...,1
2,1136000,2012-01-27,4.508247,4.508247,13.233575,13.233575,4.553743,4.553743,11.170155,11.170155,[Infinity stops cancer drug trial ['Infinity Pharmaceuticals pulled the plug on a mid-stage tria...,1
3,4161600,2012-02-13,12.313871,12.313871,-13.423463,13.423463,11.569602,11.569602,-14.243498,14.243498,"[US STOCKS-Wall St higher on Greek deal, banks lead ['U.S. stocks rose Monday as Greece’s parlia...",7
4,338300,2012-03-09,0.951036,0.951036,7.57392,7.57392,0.564287,0.564287,5.143238,5.143238,[FDA asks experts if pain drugs get second chance ['U.S. drug regulators are asking experts for ...,1


### Merge dataframes

In [52]:
merged = pd.concat([pfizer, biogen, amgen, abbvie, gilead, lilly, merck, regeneron])

In [53]:
merged.head()

Unnamed: 0,volume,date,pct_px_change,abs_pct_change,5d_change,5d_abs,adjusted,adjusted_abs,adjusted_5d,adjusted_5d_abs,text,news_count
0,29067200,2012-01-13,-0.682128,0.682128,-0.595238,0.595238,-0.164794,0.164794,-2.745192,2.745192,"[Nestle declines comment on Pfizer unit bid report ['Nestle, the world’s biggest food group, dec...",1
1,132872800,2012-01-27,-0.693481,0.693481,-0.284616,0.284616,-0.647985,0.647985,-2.348036,2.348036,[US FDA approves Pfizer\'s Inlyta for kidney cancer ['Pfizer’s Inlyta drug for patients with adv...,1
2,55526800,2012-01-31,-0.834106,0.834106,-0.62002,0.62002,-0.796045,0.796045,-3.26242,3.26242,"[Generics take toll on Pfizer, Lilly profits ['Competition from low-cost generic drugs squeezed ...",2
3,37375900,2012-02-07,0.477327,0.477327,1.330166,1.330166,0.224445,0.224445,1.033408,1.033408,[India\'s Pfizer to spin-off animal healthcare business ['India’s Pfizer Ltd. said its board has...,3
4,25262900,2012-02-14,0.140845,0.140845,0.140647,0.140647,0.266436,0.266436,-0.480701,0.480701,[Pfizer says its drug is best hope for Alzheimer\'s ['Pfizer Inc research chief Mikael Dolsten s...,2


In [54]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7252 entries, 0 to 559
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volume           7252 non-null   int64         
 1   date             7252 non-null   datetime64[ns]
 2   pct_px_change    7249 non-null   float64       
 3   abs_pct_change   7249 non-null   float64       
 4   5d_change        7252 non-null   float64       
 5   5d_abs           7252 non-null   float64       
 6   adjusted         7249 non-null   float64       
 7   adjusted_abs     7249 non-null   float64       
 8   adjusted_5d      7252 non-null   float64       
 9   adjusted_5d_abs  7252 non-null   float64       
 10  text             7252 non-null   object        
 11  news_count       7252 non-null   int64         
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 736.5+ KB


In [55]:
# export merged dataset to csv 
merged.to_csv("../assets/merged.csv", index=False)