In [1]:
import sys
import os
# Make the sibling ../util directory importable. This must run before the
# local imports below (load_transcripts, sentiment), which presumably live
# there — verify against the repo layout. The relative path is resolved
# against the kernel's current working directory.
sys.path.append(os.path.abspath('../util'))

from tqdm import tqdm
import pandas as pd
import yfinance as yf
from load_transcripts import load_all_transcripts
from sentiment import get_sentiment_features

# Register tqdm's progress_apply on pandas objects; the sentiment pass
# uses it to show a progress bar.
tqdm.pandas()

## Sentiment Feature Extraction

1. All transcript files are loaded into a single DataFrame using load_all_transcripts()

2. Each transcript's text is split into individual paragraphs

3. The get_sentiment_features() function is applied to each list of paragraphs

4. The resulting sentiment feature set is combined with the corresponding transcript metadata (ticker, quarter, and date) to form df_sentiment.

In [2]:
# Read every transcript under ../data/transcripts into a single frame.
df = load_all_transcripts(data_dir='../data/transcripts')

# A blank line (two consecutive newlines) separates paragraphs in the raw text.
df['paragraphs'] = df['text'].map(lambda text: text.split('\n\n'))

# Score each transcript's paragraph list; progress_apply shows a tqdm bar.
# NOTE(review): get_sentiment_features presumably returns one record of
# features per transcript — confirm against ../util/sentiment.py.
sentiment_features = df['paragraphs'].progress_apply(get_sentiment_features)
sentiment_df = pd.DataFrame(sentiment_features.tolist())

# sentiment_df is rebuilt from a plain list, so it carries a fresh 0..n-1
# index; the metadata index is reset to match before the column-wise concat.
df_sentiment = pd.concat(
    [
        df.loc[:, ['ticker', 'quarter', 'date']].reset_index(drop=True),
        sentiment_df,
    ],
    axis=1,
)

100%|██████████| 188/188 [22:27<00:00,  7.17s/it]


In [3]:
# Persist the per-transcript sentiment table, omitting the positional index.
df_sentiment.to_csv(path_or_buf='../data/sentiment.csv', index=False)

## Price Data Collection

Tickers are extracted from df_sentiment and historical daily closing prices are downloaded from Yahoo Finance from 2005-01-01 up to, but not including, 2020-01-01 (i.e. through the last trading day of 2019). The final df_prices DataFrame contains three columns: date, ticker, and close.

In [14]:
# One download request covers every distinct ticker in the sentiment table.
tickers = df_sentiment['ticker'].unique().tolist()

# Download window; yfinance treats `end` as exclusive.
start_date = '2005-01-01'
end_date = '2020-01-01'
price_data = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    progress=True,
)['Close']

# Reshape the closing prices from wide (presumably one column per ticker) to
# long (date, ticker, close) and drop rows that have no closing price.
df_prices = (
    price_data
    .reset_index()
    .melt(id_vars='Date', var_name='ticker', value_name='close')
    .rename(columns={'Date': 'date'})
    .dropna()
)

[*********************100%***********************]  10 of 10 completed


In [15]:
# Persist the long-format price table, omitting the positional index.
df_prices.to_csv(path_or_buf='../data/prices.csv', index=False)