<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Predicting Stock Price Changes based on News

### Import Libraries

In [1]:
import pandas as pd
import datetime
import requests
import re
import time
import numpy as np
from bs4 import BeautifulSoup

pd.set_option('display.max.columns', None)
pd.set_option('display.max.colwidth', 100)

### Basic Data Cleaning and Aggregation by Unique Dates

In [2]:
# load the scraped articles and discard the link column, keeping only the news text
pfizer = pd.read_csv("../assets/scraped_news.csv").drop(columns=['links'])

In [3]:
pfizer.head()

Unnamed: 0,dates,headlines,article_content
0,2012-1-13,Nestle declines comment on Pfizer unit bid report,"ZURICH, Jan 13 (Reuters) - Nestle, the world’s biggest food group, declined to comment on a rep..."
1,2012-1-27,US FDA approves Pfizer\'s Inlyta for kidney cancer,WASHINGTON (Reuters) - Pfizer’s Inlyta drug for patients with advanced kidney cancer won approv...
2,2012-1-31,"Generics take toll on Pfizer, Lilly profits",(Reuters) - Competition from low-cost generic drugs squeezed quarterly profits at Pfizer Inc PF...
3,2012-1-31,"Pfizer trims 2012 view, citing stronger dollar","(Reuters) - Pfizer Inc PFE.N reported sharply lower quarterly earnings, hurt by generic forms o..."
4,2012-10-04,Trial suggests Prevnar may also protect ages 18-49,(Reuters) - Pfizer Inc said a late-stage trial of its vaccine to protect against pneumococcal b...


In [4]:
# create function to remove Country, Date and (Reuters) tags
def remove_artcicle_tags(text):
    """Strip the leading "<place>, <date> (<Source>) - " dateline from an article.

    Parameters
    ----------
    text : str
        Raw article text, e.g. "ZURICH, Jan 13 (Reuters) - Nestle ...".

    Returns
    -------
    list of str
        The result of ``re.findall``: a one-element list holding the text that
        follows "(<word>) - ", or an empty list when no such tag is found.
        The prefix match is greedy, so the text after the *last* "(<word>) - "
        is kept, and ``.`` does not cross newlines, so only the first line of
        ``text`` can match. A list passed in (an already-processed value) is
        returned unchanged; any other non-string input (e.g. a missing value)
        yields an empty list.
    """
    # already processed: keeps re-running the apply cell from failing
    if isinstance(text, list):
        return text
    # missing article body (e.g. NaN read from the CSV)
    if not isinstance(text, str):
        return []
    # a single search is enough; no need to repeat it once per character
    return re.findall(r'^.*\(\w+\) - (.*)', text)

In [None]:
# apply above function and show the first rows of the updated frame
pfizer['article_content'] = pfizer['article_content'].apply(remove_artcicle_tags)
# the frame being cleaned is `pfizer`; no `df` exists at this point in the notebook
pfizer.head()

In [None]:
# create new column to concatenate all text info
# article_content holds the list returned by remove_artcicle_tags, so join its
# pieces rather than stringifying the list (which would embed "['...']" in the text)
pfizer['news_text'] = pfizer['headlines'] + " " + pfizer['article_content'].apply(
    lambda parts: " ".join(parts) if isinstance(parts, list) else str(parts)
)

In [None]:
# print sample to check that the concatenation worked
pfizer['news_text'][0]

In [None]:
# create a function to check if company is mentioned in the news report at all
def company_in_text(text, company):
    """Flag whether `company` occurs anywhere in `text`, ignoring case.

    Parameters
    ----------
    text : str
        News text to search. Non-string values (e.g. a missing value) count
        as "not mentioned".
    company : str
        Company name to look for; matched as a plain substring.

    Returns
    -------
    int
        1 if the name is found, otherwise 0.
    """
    if not isinstance(text, str):
        return 0
    # lower-case both sides so a capitalised `company` argument still matches
    return int(company.lower() in text.lower())

In [None]:
# flag each article: 1 when "pfizer" appears in its text, 0 otherwise
pfizer['mentioned'] = pfizer['news_text'].apply(company_in_text, company="pfizer")

In [None]:
# preview the last few articles whose text never names the company
not_mentioned = pfizer['mentioned'].eq(0)
pfizer.loc[not_mentioned].tail()

In [None]:
# print an example of the text
# NOTE(review): 6094 is a hard-coded row label, presumably one of the unmentioned
# rows listed by the previous cell; the lookup raises KeyError if that label is
# absent from the frame (e.g. after the scraped file changes)
print(pfizer['news_text'][6094])

In [None]:
# keep only the articles that mention the company
pfizer = pfizer.loc[pfizer['mentioned'].eq(1)]

In [None]:
pfizer.head()

In [None]:
# count number of news each day
# NOTE(review): this groups on the raw `dates` strings; the sample rows show both
# unpadded ('2012-1-13') and zero-padded ('2012-10-04') spellings, so confirm the
# same day is never written two ways, otherwise it is counted as two separate days
news_volume = pfizer.groupby('dates').size()
news_volume.head()

In [None]:
# group headlines together such that each day will only be recorded as one observation
# the raw date strings mix unpadded ('2012-1-13') and zero-padded ('2012-10-04')
# spellings, so parse them before grouping; grouping on the strings would treat two
# spellings of one day as different days
news_by_day = pfizer.groupby(pd.to_datetime(pfizer['dates'], format="%Y-%m-%d"))

# text: list of that day's articles; news_count: number of articles that day
# (both come from the same grouping, so the resulting index is already a DatetimeIndex)
agg_df = pd.DataFrame({'text': news_by_day['news_text'].apply(list),
                       'news_count': news_by_day.size()})

In [None]:
agg_df.info()

In [None]:
agg_df.head()

### Webscraping for stocks data using the Alpha Vantage API

In [None]:
def scrape_stocks(ticker, start='2012-01-10', end='2021-08-31', api_key=None):
    """Download daily adjusted prices for one ticker from the Alpha Vantage API.

    Parameters
    ----------
    ticker : str
        Stock symbol passed to the API as ``symbol``.
    start, end : str
        First and last date (inclusive, "YYYY-MM-DD") to keep. The defaults
        reproduce the window originally hard-coded here (10 Jan 2012 to
        31 Aug 2021).
    api_key : str, optional
        Alpha Vantage API key. When omitted, the ALPHAVANTAGE_API_KEY
        environment variable is used, falling back to the key that was
        originally embedded in this notebook.

    Returns
    -------
    pandas.DataFrame
        Indexed by date in ascending order, with columns ``close_price``
        (adjusted close), ``trading_volume``, ``date`` (copy of the index),
        ``pct_px_change`` (day-over-day % change of ``close_price``; NaN on the
        first row) and ``abs_pct_change`` (its absolute value).

    Raises
    ------
    requests.HTTPError
        If the HTTP request fails.
    KeyError
        If the response carries no "Time Series (Daily)" payload.
    """
    import os  # local import so this cell does not depend on the imports cell

    if api_key is None:
        # NOTE(security): the literal below is the key that was committed with the
        # notebook; prefer the environment variable and rotate/remove the literal
        api_key = os.environ.get('ALPHAVANTAGE_API_KEY', '4ALQKUAMC2QCOZW9')

    # pull API request
    r = requests.get(
        'https://www.alphavantage.co/query',
        params={'function': 'TIME_SERIES_DAILY_ADJUSTED',
                'symbol': ticker,
                'outputsize': 'full',
                'apikey': api_key},
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()

    # the API can answer with a JSON message instead of prices (e.g. when the
    # request is rejected); surface that message instead of a bare KeyError
    if "Time Series (Daily)" not in data:
        raise KeyError(
            f"Alpha Vantage returned no 'Time Series (Daily)' data for {ticker!r}: {data}"
        )

    # save data in pandas dataframe: one row per date, one column per field
    df = pd.DataFrame(data["Time Series (Daily)"]).T

    # keep only relevant data columns (adjusted close price and volume), by name
    df = df[["5. adjusted close", "6. volume"]].rename(
        columns={"5. adjusted close": "close_price",
                 "6. volume": "trading_volume"}
    )

    # change datatype from string to numeric
    df = df.apply(pd.to_numeric)

    # set the date index in standard format, oldest first, then keep the
    # requested window (label slicing on a sorted DatetimeIndex is inclusive and
    # does not require `start`/`end` to be trading days)
    df.index = pd.to_datetime(df.index, format="%Y-%m-%d")
    df = df.sort_index().loc[start:end].copy()
    df['date'] = df.index

    # day-over-day percentage price change and its magnitude
    df['pct_px_change'] = df['close_price'].pct_change() * 100
    df['abs_pct_change'] = df['pct_px_change'].abs()

    return df

In [None]:
# fetch Pfizer (ticker 'pfe') daily price data through scrape_stocks,
# which queries the Alpha Vantage API
pfe = scrape_stocks('pfe')

In [None]:
pfe.head()

In [None]:
# record the price change over the next 5 trading days and store it in a new dataframe
HORIZON_DAYS = 5

# shift(-n) lines each row up with the close n rows later, so no hard-coded row
# count or integer indexing on the date-indexed Series is needed
close = pfe['close_price']
change_5d = (close.shift(-HORIZON_DAYS) - close) / close * 100

# the last HORIZON_DAYS rows have no future price, so they are left out
df2 = (
    pd.DataFrame({'date': pfe['date'],
                  '5d_change': change_5d,
                  '5d_abs': change_5d.abs()})
    .iloc[:-HORIZON_DAYS]
    .reset_index(drop=True)
)

In [None]:
df2.head()

In [None]:
# join the daily prices with their 5-day changes on the shared date column
merged_df = pd.merge(pfe, df2, on="date")

In [None]:
merged_df

In [None]:
# expose the index as a `date` column so the news data can be joined to the stock data
agg_df = agg_df.assign(date=pd.to_datetime(agg_df.index, format="%Y-%m-%d"))

In [None]:
# combine stock and news data, keeping the dates present in both
pfizer = pd.merge(merged_df, agg_df, on="date")

In [None]:
# drop rows containing any missing value (e.g. the first trading day, whose
# pct_px_change / abs_pct_change are NaN because pct_change has no prior row)
pfizer = pfizer.dropna()

### Repeat the same steps for the other companies' data

In [None]:
def clean_and_merge(news, company, ticker):
    """Build the per-day news + price dataset for one company.

    Parameters
    ----------
    news : pandas.DataFrame
        Scraped articles with at least the columns ``dates``, ``headlines``,
        ``article_content`` and ``links``. The frame is not modified.
    company : str
        Company name searched for in each article's text.
    ticker : str
        Stock symbol passed to ``scrape_stocks``.

    Returns
    -------
    pandas.DataFrame
        One row per date that has both price data and at least one article
        mentioning the company: the columns produced by ``scrape_stocks`` plus
        ``5d_change`` / ``5d_abs`` (percentage price change over the next five
        trading days and its absolute value), ``text`` (list of that day's
        articles) and ``news_count``. Rows with missing values are dropped, as
        in the Pfizer cells.
    """
    horizon = 5  # trading days used for the forward price change

    # work on a copy so the caller's frame is left untouched
    news = news.copy()

    # NOTE(review): clean_updates is not defined anywhere in the visible notebook;
    # confirm it is defined or imported before this function is called
    news['headlines'] = news['headlines'].apply(clean_updates)
    news = (news.drop_duplicates(subset=['headlines'], ignore_index=True)
                .drop(columns=['links']))

    # remove_artcicle_tags returns a list of matches; join it instead of
    # stringifying the list, which would embed "['...']" in the text
    news['article_content'] = news['article_content'].apply(remove_artcicle_tags)
    article_text = news['article_content'].apply(
        lambda parts: " ".join(parts) if isinstance(parts, list) else str(parts)
    )
    news['news_text'] = news['headlines'] + " " + article_text

    # keep only articles that mention the company
    news['mentioned'] = news['news_text'].apply(lambda x: company_in_text(x, company))
    news = news[news['mentioned'] == 1]

    # group on parsed dates: the raw strings may spell one day two ways
    # ('2012-1-13' vs '2012-01-13'), which would otherwise split that day
    by_day = news.groupby(pd.to_datetime(news['dates'], format="%Y-%m-%d"))
    agg_df = pd.DataFrame({'text': by_day['news_text'].apply(list),
                           'news_count': by_day.size()})
    agg_df['date'] = agg_df.index

    # forward price change over `horizon` trading days; the last `horizon` rows
    # have no future price and are left out
    df = scrape_stocks(ticker)
    close = df['close_price']
    change = (close.shift(-horizon) - close) / close * 100
    df2 = (
        pd.DataFrame({'date': df['date'], '5d_change': change, '5d_abs': change.abs()})
        .iloc[:-horizon]
        .reset_index(drop=True)
    )

    merged_df = df.merge(df2, on="date")
    data = merged_df.merge(agg_df, on="date")

    # same final step as the Pfizer cells: drop rows with missing values
    # (the first trading day has no previous close, so its daily change is NaN)
    return data.dropna()

### Biogen [NASDAQ: BIIB]

In [None]:
# load the scraped Biogen news articles
df = pd.read_csv("../assets/scraped_news_biogen.csv")

In [None]:
# clean the news, keep articles mentioning "biogen" and join them with BIIB price data
biogen = clean_and_merge(df, "biogen", "biib")

In [None]:
biogen.head()

### Amgen [NASDAQ: AMGN]

In [None]:
# load the scraped Amgen news articles
df = pd.read_csv("../assets/scraped_news_amgen.csv")

In [None]:
# clean the news, keep articles mentioning "amgen" and join them with AMGN price data
amgen = clean_and_merge(df, "amgen", "amgn")

In [None]:
amgen.head()

### AbbVie [NYSE: ABBV]

In [None]:
# load the scraped AbbVie news articles
df = pd.read_csv("../assets/scraped_news_abbvie.csv")

In [None]:
# clean the news, keep articles mentioning "abbvie" and join them with ABBV price data
abbvie = clean_and_merge(df, "abbvie", "abbv")

In [None]:
abbvie.head()

### Gilead [NASDAQ: GILD]

In [None]:
# load the scraped Gilead news articles
df = pd.read_csv("../assets/scraped_news_gilead.csv")

In [None]:
# clean the news, keep articles mentioning "gilead" and join them with GILD price data
gilead = clean_and_merge(df, "gilead", "gild")

In [None]:
gilead.head()

### Merck [NYSE: MRK]

In [None]:
# load the scraped Merck news articles
df = pd.read_csv("../assets/scraped_news_merck.csv")

In [None]:
# clean the news, keep articles mentioning "merck" and join them with MRK price data
merck = clean_and_merge(df, "merck", "mrk")

In [None]:
merck.head()

### Eli Lilly [NYSE: LLY]

In [None]:
# load the scraped Eli Lilly news articles
df = pd.read_csv("../assets/scraped_news_lilly.csv")

In [None]:
# clean the news, keep articles mentioning "lilly" and join them with LLY price data
lilly = clean_and_merge(df, "lilly", "lly")

In [None]:
lilly.head()

### Regeneron Pharmaceuticals [NASDAQ: REGN]

In [None]:
# load the scraped Regeneron news articles
df = pd.read_csv("../assets/scraped_news_regeneron.csv")

In [None]:
# clean the news, keep articles mentioning "regeneron" and join them with REGN price data
regeneron = clean_and_merge(df, "regeneron", "regn")

In [None]:
regeneron.head()

### Merge dataframes

In [None]:
# stack the per-company frames into a single dataset
# NOTE(review): no company/ticker column is added before concatenating, so rows
# from different companies cannot be told apart afterwards, and the original row
# labels repeat across frames (no ignore_index) — confirm this is intended
merged = pd.concat([pfizer, biogen, amgen, abbvie, gilead, lilly, merck, regeneron])

In [None]:
merged.head()

In [None]:
merged.info()

In [None]:
# export merged dataset to csv (row labels are not written: index=False)
merged.to_csv("../assets/merged.csv", index=False)