In [1]:
import requests
import os
import datetime
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm import tqdm

# **Wikipedia: S&P500 Tickers and Sector**s

In [3]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
response = requests.get(wiki_url)
content = response.content
soup = BeautifulSoup(content, 'html.parser')

In [4]:
table = soup.find('table', class_='wikitable sortable')
rows = table.find_all('tr')[1:]

In [5]:
tickers = []
sector_ticker_dict = {}

for row in rows:
    cells = row.find_all('td')
    tickers.append(cells[0].text.strip())
    sector_ticker_dict[cells[0].text.strip()] = cells[2].text.strip()

# **FMP API: Financial Data from the last 10 years**

In [68]:
load_dotenv('.env')
api_key = os.getenv('FMP_KEY')

In [69]:
sample_ticker = 'AAPL'
sample_days = 2520
sample_endpoint = f'https://financialmodelingprep.com/api/v3/historical-price-full/{sample_ticker}?timeseries={sample_days}&apikey={api_key}'

sample_response = requests.get(sample_endpoint)
sample_data = sample_response.json()

In [70]:
columns = list(sample_data['historical'][0].keys()) + ['symbol']

In [71]:
def endpoint_string_10_years(ticker, days):
    return f'https://financialmodelingprep.com/api/v3/historical-price-full/{ticker}?timeseries={days}&apikey={api_key}'

In [72]:
sp500_df = pd.DataFrame(columns=columns)
for ticker in tqdm(tickers, desc='Collecting data'):
    endpoint = endpoint_string_10_years(ticker, 2520)
    response = requests.get(endpoint)
    data = response.json()

    if data:
        df = pd.DataFrame(data['historical'])
        df['symbol'] = ticker
    else:
        print(f'No data for {ticker}.')
    sp500_df = pd.concat([sp500_df, df], ignore_index=True)

Collecting data: 100%|██████████| 503/503 [05:17<00:00,  1.58it/s]


In [74]:
sp500_df[sp500_df['symbol']=='AAPL']

Unnamed: 0,date,open,high,low,close,adjClose,volume,unadjustedVolume,change,changePercent,vwap,label,changeOverTime,symbol
53498,2023-09-28,169.34,172.026,167.62,170.55,170.55,41404049,41404049,1.21,0.714539,170.07,"September 28, 23",0.007145,AAPL
53499,2023-09-27,172.62,173.040,169.05,170.43,170.43,66425563,63936754,-2.19,-1.270000,170.53,"September 27, 23",-0.012700,AAPL
53500,2023-09-26,174.82,175.200,171.66,171.96,171.96,64548945,64588900,-2.86,-1.640000,172.72,"September 26, 23",-0.016400,AAPL
53501,2023-09-25,174.20,176.970,174.15,176.08,176.08,46172740,46172700,1.88,1.080000,175.67,"September 25, 23",0.010800,AAPL
53502,2023-09-22,174.67,177.079,174.05,174.79,174.79,56688985,56663000,0.12,0.068701,175.47,"September 22, 23",0.000687,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54750,2018-10-05,56.99,57.100,55.15,56.07,53.70,134322000,134322000,-0.92,-1.610000,56.11,"October 05, 18",-0.016100,AAPL
54751,2018-10-04,57.70,58.090,56.68,57.00,54.59,128168000,128168000,-0.70,-1.210000,57.26,"October 04, 18",-0.012100,AAPL
54752,2018-10-03,57.51,58.370,57.45,58.02,55.57,114619200,114619200,0.51,0.886800,57.94,"October 03, 18",0.008868,AAPL
54753,2018-10-02,56.81,57.500,56.66,57.32,54.90,99152800,99152800,0.51,0.897730,57.16,"October 02, 18",0.008977,AAPL


In [75]:
sp500_df.isnull().any()

date                False
open                False
high                False
low                 False
close               False
adjClose            False
volume              False
unadjustedVolume    False
change              False
changePercent       False
vwap                False
label               False
changeOverTime      False
symbol              False
dtype: bool

In [76]:
sp500_df.dtypes

date                 object
open                float64
high                float64
low                 float64
close               float64
adjClose            float64
volume               object
unadjustedVolume     object
change              float64
changePercent       float64
vwap                float64
label                object
changeOverTime      float64
symbol               object
dtype: object

In [77]:
sp500_df['date'] = pd.to_datetime(sp500_df['date'])
sp500_df['volume'] = sp500_df['volume'].astype(int)
sp500_df['unadjustedVolume'] = sp500_df['unadjustedVolume'].astype(int)

In [78]:
sp500_df.duplicated().sum()

0

In [80]:
output_file_path = './data/sp500_data.csv'
sp500_df.to_csv(output_file_path, index=False)
print(f'DataFrame has been exported to {output_file_path}')

DataFrame has been exported to ./data/sp500_data.csv


# **FMP API: Hourly Prices and Sentiment Scores**

In [111]:
sample_ticker_sentiment = 'TSLA'

sample_endpoint_sentiment = f'https://financialmodelingprep.com/api/v4/historical/social-sentiment?symbol={sample_ticker_sentiment}&page=0&apikey={api_key}'
sample_response_sentiment = requests.get(sample_endpoint_sentiment)
sample_data_sentiment = sample_response_sentiment.json()

In [112]:
len(sample_data_sentiment)

99

In [114]:
sample_data_sentiment[0]

{'date': '2023-09-28 18:00:00',
 'symbol': 'TSLA',
 'stocktwitsPosts': 96,
 'twitterPosts': 0,
 'stocktwitsComments': 26,
 'twitterComments': 0,
 'stocktwitsLikes': 77,
 'twitterLikes': 0,
 'stocktwitsImpressions': 247257,
 'twitterImpressions': 0,
 'stocktwitsSentiment': 0.485,
 'twitterSentiment': 0}

In [86]:
filtered_sentiment_data = [d['stocktwitsSentiment'] for d in sample_data_sentiment if d.get('date', '').startswith('2023-09')]
len(filtered_sentiment_data)

100

In [87]:
sample_ticker_hour = 'TSLA'
sample_endpoint_hour = f'https://financialmodelingprep.com/api/v3/historical-chart/1hour/{sample_ticker_hour}?apikey={api_key}'

sample_response_hour = requests.get(sample_endpoint_hour)
sample_data_hour = sample_response_hour.json()

In [89]:
pd.DataFrame(sample_data_hour)

Unnamed: 0,date,open,low,high,close,volume
0,2023-09-28 14:00:00,246.0281,244.7400,247.1500,245.0799,9196798
1,2023-09-28 13:00:00,245.5250,244.5889,246.5000,246.0400,12176174
2,2023-09-28 12:00:00,247.3750,244.9600,247.5500,245.5300,15330662
3,2023-09-28 11:00:00,241.6900,241.1201,247.3899,247.3601,20066222
4,2023-09-28 10:00:00,242.6700,238.7800,243.3000,241.7077,21933293
...,...,...,...,...,...,...
397,2023-07-10 16:00:00,269.6100,269.0000,269.8900,269.1200,737716
398,2023-07-10 15:00:00,269.2100,268.4200,269.9500,269.5000,12265901
399,2023-07-10 14:00:00,268.3800,268.0500,270.3600,269.1885,11951668
400,2023-07-10 13:00:00,268.8700,267.7600,269.5500,268.3500,11880400


In [102]:
columns =list(sample_data_hour[0].keys()) + ['symbol']

In [103]:
columns

['date', 'open', 'low', 'high', 'close', 'volume', 'symbol']

In [91]:
hour_df = pd.DataFrame(columns=columns)
for ticker in tqdm(tickers, desc='Collecting data'):
    endpoint = f'https://financialmodelingprep.com/api/v3/historical-chart/1hour/{ticker}?apikey={api_key}'
    response = requests.get(endpoint)
    data = response.json()

    if data:
        df = pd.DataFrame(data)
        df['symbol'] = ticker
    else:
        print(f'No data for {ticker}.')
    hour_df = pd.concat([hour_df, df], ignore_index=True)

Collecting data: 100%|██████████| 503/503 [14:06<00:00,  1.68s/it]


In [101]:
int(len(hour_df[hour_df['symbol']=='MMM']) / 100)

4

In [117]:
sentiment_df = pd.DataFrame(columns=list(sample_data_sentiment[0].keys()))
no_data_list = []
for ticker in tqdm(tickers, desc='Collecting data'):
    for page_num in range(int(len(hour_df[hour_df['symbol']==ticker])/100)):
        endpoint = f'https://financialmodelingprep.com/api/v4/historical/social-sentiment?symbol={ticker}&page={page_num}&apikey={api_key}'
        response = requests.get(endpoint)
        data = response.json()

        if data:
            df = pd.DataFrame(data)
        else:
            no_data_list.append(ticker)
        sentiment_df = pd.concat([sentiment_df, df], ignore_index=True)

Collecting data:   0%|          | 0/503 [00:00<?, ?it/s]

Collecting data: 100%|██████████| 503/503 [13:33<00:00,  1.62s/it]


In [127]:
sentiment_tickers = list(sentiment_df.symbol.unique())

In [128]:
merged_df = pd.merge(hour_df, sentiment_df, on=['symbol', 'date'])    

In [131]:
merged_df.head()

Unnamed: 0,date,open,low,high,close,volume,symbol,stocktwitsPosts,twitterPosts,stocktwitsComments,twitterComments,stocktwitsLikes,twitterLikes,stocktwitsImpressions,twitterImpressions,stocktwitsSentiment,twitterSentiment
0,2023-09-27 13:00:00,95.32,94.98,95.41,95.09,413663,ABT,3,0,0,0,6,0,17562,0,0.5,0
1,2023-09-26 15:00:00,96.26,96.03,96.35,96.22,883574,ABT,1,0,0,0,1,0,23,0,0.5,0
2,2023-09-26 13:00:00,96.47,96.17,96.62,96.28,374200,ABT,1,0,0,0,0,0,3882,0,0.6,0
3,2023-09-25 13:00:00,97.38,97.22,97.6,97.39,351481,ABT,1,0,0,0,0,0,89,0,0.5,0
4,2023-09-22 12:00:00,98.56,98.4,98.61,98.51,271445,ABT,2,0,0,0,2,0,11900,0,0.45,0


In [133]:
final_df = merged_df[['symbol', 'date', 'low', 'high', 'close', 'volume', 'stocktwitsSentiment']]

In [135]:
final_df.dtypes

symbol                  object
date                    object
low                    float64
high                   float64
close                  float64
volume                  object
stocktwitsSentiment     object
dtype: object

In [137]:
final_df['date'] = pd.to_datetime(final_df['date'])
final_df['volume'] = final_df['volume'].astype(int)
final_df['stocktwitsSentiment'] = final_df['stocktwitsSentiment'].astype('float64')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['date'] = pd.to_datetime(final_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['volume'] = final_df['volume'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['stocktwitsSentiment'] = final_df['stocktwitsSentiment'].astype('float64')


In [139]:
output_file_path = './data/sentiment_data.csv'
final_df.to_csv(output_file_path, index=False)
print(f'DataFrame has been exported to {output_file_path}')

DataFrame has been exported to ./data/sentiment_data.csv
