### Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib.pyplot as plt
import config
import torch
import csv
from datetime import datetime, date, timedelta
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
import numpy as np
import re
from scipy.special import softmax

ticker = 'AAPL'



# Data

### Daily Info

In [48]:
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={ticker}&apikey={config.alphavantage_apikey}'
r = requests.get(url)
data = r.json()

header = list(data['Time Series (Daily)']['2024-05-14'].keys())
header.insert(0, 'Date')

# Writing to CSV
with open(f'data/{ticker}_daily.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=header)
    
    writer.writeheader()
    
    for date, values in data['Time Series (Daily)'].items():
        row = {'Date': date}
        row.update(values)
        writer.writerow(row)


### Insider Trades

In [49]:
url = f"http://openinsider.com/search?q={ticker}"

# Request the webpage
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table
table = soup.find('table', class_='tinytable')

if table:
    # Extract table rows
    rows = table.find_all('tr')

    # Create an empty list to store the data
    data = []

    # Loop through rows and extract data
    for row in rows[1:]:  # Skip the header row
        cells = row.find_all('td')
        insider_info = {
            'Trade Date': cells[2].get_text(strip=True),
            'Value': cells[11].get_text(strip=True)
        }
        data.append(insider_info)

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)

    # Remove dollar sign and comma from 'Value' column and convert to numeric type
    df['Value'] = df['Value'].str.replace('$', '').str.replace(',', '').astype(float)

    # Group the data by 'Trade Date' and aggregate the values
    df_merged = df.groupby('Trade Date').agg({'Value': 'sum'}).reset_index()

    # Save DataFrame to CSV
    df_merged.to_csv(f'data/{ticker}_insider_trades.csv', index=False)
    print('Insider trades data saved')

else:
    print('No table found')

Insider trades data saved


  df['Value'] = df['Value'].str.replace('$', '').str.replace(',', '').astype(float)


### Articles

In [50]:
# List of articles to scrape
articles = [
    "https://finance.yahoo.com/news/apple-touts-stopping-1-8bn-170000332.html",
    "https://finance.yahoo.com/news/openai-leap-forward-human-ai-150259365.html",
    "https://finance.yahoo.com/m/df9a8cd8-0f31-39bc-909d-d877af2ff523/heard-on-the-street-apple-s.html",
    "https://finance.yahoo.com/news/where-apple-stock-5-years-183000424.html",
    "https://finance.yahoo.com/m/6104f210-fff1-3b57-a907-5f578b487d65/apple-makes-rare-apology-for.html"
]

# Function to scrape Yahoo Finance article
def scrape_yahoo_finance(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Get header
        header_element = soup.find(id="caas-lead-header-undefined")
        header = header_element.get_text().strip()
        # Get body
        article_elements = soup.find_all(class_="caas-body")
        body = ""
        for element in article_elements:
            body += element.get_text().strip() + "\n"
        # Get date
        date_text = soup.find_all(class_="caas-attr-time-style")[0].get_text().strip()
        date = datetime.strptime(date_text[:17], "%a, %b %d, %Y").strftime("%Y-%m-%d")
        return header + ". " + body, date
    else:
        print("Failed to retrieve the webpage. Status code:", response.status_code)
        return None, None

# Function to query sentiment analysis models
def query_sentiment_analysis(text):
    # Summarizer
    summarizer_api_url = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
    summarizer_headers = {f"Authorization": f"Bearer {config.huggingface_apikey}"}
    summarizer_output = requests.post(summarizer_api_url, headers=summarizer_headers, json={"inputs": text}).json()
    summarized_text = summarizer_output[0]['summary_text']
    
    # Finbert
    finbert_api_url = "https://api-inference.huggingface.co/models/ProsusAI/finbert"
    finbert_headers = {"Authorization": f"Bearer {config.huggingface_apikey}"}
    finbert_output = requests.post(finbert_api_url, headers=finbert_headers, json={"inputs": summarized_text}).json()
    
    # Extract the list of sentiments
    sentiments = finbert_output[0]
    # Initialize variables to store the most likely sentiment label and score
    most_likely_label = None
    max_score = -1
    # Iterate through the list of sentiments
    for sentiment in sentiments:
        label = sentiment['label']
        score = sentiment['score']
        # Update most_likely_label and max_score if the current score is higher
        if score > max_score:
            most_likely_label = label
            max_score = score
    # Convert sentiment label to numerical value
    if most_likely_label == 'positive':
        numerical_sentiment = 1
    elif most_likely_label == 'neutral':
        numerical_sentiment = 0
    elif most_likely_label == 'negative':
        numerical_sentiment = -1
    return numerical_sentiment

# Initialize list to store article data
articledata = []

# Iterate through the list of articles
for article in articles:
    # Scrape article content and date
    article_text, article_date = scrape_yahoo_finance(article)
    if article_text is not None and article_date is not None:
        # Analyze sentiment and convert to numerical value
        numerical_sentiment = query_sentiment_analysis(article_text)
        # Append article date and sentiment to articledata list
        articledata.append({'Date': article_date, 'Sentiment': numerical_sentiment})

# Convert article data to DataFrame
df = pd.DataFrame(articledata)

# Group by date and sum the sentiments
df_grouped = df.groupby('Date').sum().reset_index()

# Save DataFrame to CSV
df_grouped.to_csv(f'data/{ticker}_sentiment.csv', index=False)

print("Articles sentiment data combined and saved")

Failed to retrieve the webpage. Status code: 404
Articles sentiment data combined and saved


In [5]:
article = "https://finance.yahoo.com/news/apple-touts-stopping-1-8bn-170000332.html"
articledata = []

Scrape

In [6]:
def scrape_yahoo_finance(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Get header
        header_element = soup.find(id="caas-lead-header-undefined")
        header = header_element.get_text().strip()
        
        # Get body
        article_elements = soup.find_all(class_="caas-body")
        body = ""
        for element in article_elements:
            body += element.get_text().strip() + "\n"

        # Get date
        date_text = soup.find_all(class_="caas-attr-time-style")[0].get_text().strip()
        date = datetime.strptime(date_text[:17], "%a, %b %d, %Y").strftime("%Y-%m-%d")
        articledata.append(date)
        # print(date)
        
        # text = header + "\n" + body
        text = header + ". " + body
        return text
    
    else:
        print("Failed to retrieve the webpage. Status code:", response.status_code)
        return None

text = scrape_yahoo_finance(article)
print(text)


Apple touts stopping $1.8BN in App Store fraud last year in latest pitch to developers. Apple released new data about anti-fraud measures related to its operation of the iOS App Store on Tuesday morning, trumpeting a claim that it stopped over $7 billion in "potentially fraudulent transactions" across the four years between 2020 and 2023.More than $1.8 billion of that total was stopped in 2023, per Apple, which is down from the $2 billion in potentially fraudulent transactions Apple reported preventing in 2022. It also said it blocked over 14 million stolen credit cards and more than 3.3 million accounts from transacting again between 2020 and 2023.As with any self-reported corporate metrics, the aim is to shape a narrative: In Apple's case it's a longstanding claim that its mobile ecosystem sets "the standard for security, reliability, and user experience", as its blog post puts it.It's worth noting that counter-narratives do exist, such as the developer lawsuit Apple settled back in 

Summarize (if necessary)

In [None]:
# # Summarizer 1

# print(text)
# print(type(text))

# # text = f'Provide a summary for this article, keeping it relevant to {ticker}. I want the summary to revolve around {ticker}.' + text

# if len(text.split()) > 300:
#     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#     # print(summarizer(text, max_length=200, min_length=30, do_sample=False))
#     x = summarizer(text, max_length=200, min_length=30, do_sample=False)

#     print(x)

#     text = summarizer(text, max_length=200, min_length=30, do_sample=False)[0]['summary_text']
#     print('Summarized')

# else: print('Not summarized')

In [None]:
# # Summarizer 2

# summarizer = pipeline("summarization", "jordiclive/flan-t5-3b-summarizer", torch_dtype=torch.bfloat16)

# results = summarizer(
#         text,
#         num_beams=5,
#         min_length=5,
#         no_repeat_ngram_size=3,
#         truncation=True,
#         max_length=200,
#     )

# print(results)

In [None]:
# # Summarizer 3

# summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

# summarizer(text)

In [7]:
# Summarizer 4

API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
headers = {f"Authorization": f"Bearer {config.huggingface_apikey}"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": text,
})

text = output[0]['summary_text']
print(output[0]['summary_text'])

 Apple touts stopping $1.8BN in App Store fraud last year in latest pitch to developers . Apple says it blocked over 14 million stolen credit cards and more than 3.3 million accounts from transacting again . In 2023, Apple said it rejected more than 1.7 million app submissions for failing to meet "stringent" standards for privacy, security, and content .


Sentiment

In [8]:
API_URL = "https://api-inference.huggingface.co/models/ProsusAI/finbert"
headers = {"Authorization": f"Bearer {config.huggingface_apikey}"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
	"inputs": text,
})

print(output)

# Extract the list of sentiments
sentiments = output[0]

# Initialize variables to store the most likely sentiment label and score
most_likely_label = None
max_score = -1

# Iterate through the list of sentiments
for sentiment in sentiments:
    label = sentiment['label']
    score = sentiment['score']
    
    # Update most_likely_label and max_score if the current score is higher
    if score > max_score:
        most_likely_label = label
        max_score = score

print("Most likely sentiment:", most_likely_label)

if most_likely_label == 'positive':
    numerical_sentiment = 1
elif most_likely_label == 'neutral':
    numerical_sentiment = 0
elif most_likely_label == 'negative':
    numerical_sentiment = -1

print("Numerical sentiment:", numerical_sentiment)

articledata.append(numerical_sentiment)

[[{'label': 'negative', 'score': 0.9409266710281372}, {'label': 'neutral', 'score': 0.045905645936727524}, {'label': 'positive', 'score': 0.013167675584554672}]]
Most likely sentiment: negative
Numerical sentiment: -1


In [9]:
print(articledata)

['2024-05-14', -1]


# Merge

In [51]:
historical = pd.read_csv(f'{ticker}_daily.csv')
insider = pd.read_csv(f'{ticker}_insider_trades.csv')
news = pd.read_csv(f'{ticker}_sentiment.csv')

newdata = pd.merge(historical, insider, how='outer', left_on='Date', right_on='Trade Date')
newdata = pd.merge(newdata, news, how='outer', on='Date')

newdata.rename(columns={'4. close': 'Close', '5. volume': 'Volume', 'Value': 'Insider_Trades'}, inplace=True)
newdata = newdata.drop(columns=['1. open', '2. high', '3. low', 'Trade Date'])

newdata.to_csv(f'data/{ticker}_merged.csv', index=False)

### Visualize