In [1]:
# Lets analyze news headlines to build a trading strategy
!pip install yfinance

Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/1c/19/bf19123baf16a55fd38cbb100b5a49380b9b6db7279987034689d11254c7/yfinance-0.2.32-py2.py3-none-any.whl.metadata
  Downloading yfinance-0.2.32-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.0.tar.gz (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Downloading yfinance-0.2.32-py2.py3-none-any.whl (68 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.0/69.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hB

In [2]:
import spacy
import en_core_web_lg
import pandas as pd
import numpy as np
import nltk
import plotly.express as px
import matplotlib.pyplot as plt
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding
nltk.download('vader_lexicon')
import panel as pn
import warnings; warnings.filterwarnings('ignore')
import yfinance as yf



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
def show_panel(df, top=20):
    """Build a paginated Tabulator widget for the first rows of a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Frame to preview; only ``df.head(top)`` is handed to the widget.
    top : int, default 20
        Number of leading rows to include.

    Returns
    -------
    The ``pn.widgets.Tabulator`` instance, with the index hidden and
    client-side ('local') pagination at 10 rows per page.
    """
    preview = df.head(top)
    table_options = {
        'show_index': False,
        'pagination': 'local',
        'page_size': 10,
    }
    return pn.widgets.Tabulator(preview, **table_options)
        
# Load Panel's 'tabulator' extension so Tabulator widgets can be rendered in the notebook.
pn.extension('tabulator')
# Class-level assignment: sets the default theme on the Tabulator widget class itself.
pn.widgets.Tabulator.theme = 'bootstrap'

In [4]:
# Date window passed to yfinance's `history` for every ticker.
start = '2010-01-01'
end = '2023-12-01'

tickers = ['AAPL','MSFT','AMZN','GOOG','AMD','NVDA','TSLA','YELP','NFLX','ADBE','BA','AIG', 'META']
# JPM, GS, V, MA, AXP, BAC, C, BLK, SPGI, MCO

# Collect one frame per ticker and concatenate once after the loop; calling
# pd.concat inside the loop re-copies everything accumulated so far each time.
ticker_frames = []
for ticker in tickers:
    ticker_yf = yf.Ticker(ticker)
    data_temp = ticker_yf.history(start=start, end=end)
    if data_temp.empty:
        # Contributes no rows either way; report it instead of passing it on silently.
        print(f'No price history returned for {ticker}; skipping')
        continue
    # Tag each row with its symbol so the combined frame can be split per ticker.
    data_temp['ticker'] = ticker
    ticker_frames.append(data_temp)

if not ticker_frames:
    raise RuntimeError('No price history was downloaded for any ticker')

ticker_data = pd.concat(ticker_frames)

print(ticker_data['ticker'].unique())
# Persist the combined history; a later cell reloads it from this CSV.
ticker_data.to_csv(r'ticker_data.csv')


['AAPL' 'MSFT' 'AMZN' 'GOOG' 'AMD' 'NVDA' 'TSLA' 'YELP' 'NFLX' 'ADBE' 'BA'
 'AIG' 'META']


In [5]:
# Reload the Yahoo Finance price history from the CSV written by the download cell,
# using the 'Date' column as the row index.
ticker_data = pd.read_csv(
    filepath_or_buffer='ticker_data.csv',
    index_col='Date',
)
ticker_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04 00:00:00-05:00,6.461438,6.493832,6.42965,6.478997,493729600,0.0,0.0,AAPL
2010-01-05 00:00:00-05:00,6.49686,6.526831,6.455989,6.490199,601904800,0.0,0.0,AAPL
2010-01-06 00:00:00-05:00,6.490199,6.515932,6.380303,6.386964,552160000,0.0,0.0,AAPL
2010-01-07 00:00:00-05:00,6.410578,6.418146,6.328837,6.375156,477131200,0.0,0.0,AAPL
2010-01-08 00:00:00-05:00,6.366682,6.418148,6.329142,6.417542,447610800,0.0,0.0,AAPL


In [6]:
# Show the last rows of the reloaded price history.
ticker_data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-11-24 00:00:00-05:00,340.130005,341.859985,336.769989,338.230011,5467500,0.0,0.0,META
2023-11-27 00:00:00-05:00,336.179993,339.899994,334.200012,334.700012,15684500,0.0,0.0,META
2023-11-28 00:00:00-05:00,333.399994,339.380005,333.399994,338.98999,12637200,0.0,0.0,META
2023-11-29 00:00:00-05:00,339.690002,339.899994,330.779999,332.200012,16024500,0.0,0.0,META
2023-11-30 00:00:00-05:00,331.890015,333.5,322.399994,327.149994,23146400,0.0,0.0,META


In [7]:
# One small line chart of the closing price per ticker, laid out four panels per row.
fig = px.line(
    ticker_data,
    y='Close',
    facet_col='ticker',
    facet_col_wrap=4,
    height=900,
    template='plotly_white',
)
# Same thin blue line in every facet.
fig.update_traces(line=dict(color='#1f77b4', width=0.5))
fig.show(renderer='iframe')


In [8]:
import zipfile
import json

# Peek at one archive member to see the raw payload before writing a parser.
# The context manager closes the archive (the original never closed it), and
# ZipFile.read returns the member's bytes without leaving a member handle open.
with zipfile.ZipFile("/kaggle/input/news-trading/headlines_archive", "r") as z:
    testFile = z.namelist()[10]
    fileData = z.read(testFile)

# Keep the first 5000 characters of the 'content' field. The slice starts at 0:
# starting at 1 dropped the opening '<' of the first tag.
fileDataSample = json.loads(fileData)['content'][:5000]

# HTML markup stored in the JSON 'content' field
fileDataSample

'li class="n-box-item date-title" data-end="1305172799" data-start="1305086400" data-txt="Tuesday, December 17, 2019">Wednesday, May 11, 2011</li><li class="n-box-item sa-box-item" data-id="76179" data-ts="1305149244"><div class="media media-overflow-fix"><div class="media-left"><a class="box-ticker" href="/symbol/CSCO" target="_blank">CSCO</a></div><div class="media-body"><h4 class="media-heading"><a href="/news/76179" sasource="on_the_move_news_fidelity" target="_blank">Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is &quot;off the table.&quot;</a></h4><p>Cisco (NASDAQ:<a href="https://seekingalpha.com/symbol/CSCO" title="Cisco Systems, Inc.">CSCO</a>): Previous annual sales growth guidance of 12-17% is "off the table." </p><div class="tiny-share-widget" data-id="76179" data-linked="Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is &quot;off the table.&quot; " data-tweet="$CSCO - Cisco (NASDAQ:CSCO): Previous annual sales growth guidance o

## HTML Snippet Description

The provided HTML snippet represents a portion of news content related to Cisco Systems, Inc. (NASDAQ:CSCO). Here's a breakdown of the information contained within the HTML snippet:

1. **Date**:
   - Date is represented as an HTML list item with the class "n-box-item date-title."
   - The date is "Wednesday, May 11, 2011."

2. **Ticker Symbol**:
   - The ticker symbol is represented as an HTML link (`<a>`) with the class "box-ticker."
   - The ticker symbol is "CSCO," which corresponds to Cisco Systems, Inc. (NASDAQ:CSCO).

3. **Headline**:
   - The headline is contained within a `<div>` with the class "media-body."
   - The headline text is wrapped within an `<a>` element nested inside an `<h4>` with the class "media-heading."
   - The headline text is: "Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is 'off the table.'"

4. **Additional Information**:
   - There is additional content below the headline, which includes a paragraph (`<p>`) with some duplicated information.
   - This section repeats the information from the headline: "Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is 'off the table.'"

5. **Tiny Share Widget**:
   - There's a section with a class "tiny-share-widget" that seems to be related to social sharing or interaction with the news content.




html_snippet = '''
<li class="n-box-item date-title" data-end="1305172799" data-start="1305086400" data-txt="Tuesday, December 17, 2019">Wednesday, May 11, 2011</li>
<li class="n-box-item sa-box-item" data-id="76179" data-ts="1305149244">
    <div class="media media-overflow-fix">
        <div class="media-left"><a class="box-ticker" href="/symbol/CSCO" target="_blank">CSCO</a></div>
        <div class="media-body">
            <h4 class="media-heading"><a href="/news/76179" sasource="on_the_move_news_fidelity" target="_blank">Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is 'off the table.'</a></h4>
            <p>Cisco (NASDAQ:<a href="https://seekingalpha.com/symbol/CSCO" title="Cisco Systems, Inc.">CSCO</a>): Previous annual sales growth guidance of 12-17% is 'off the table.'</p>
            <div class="tiny-share-widget" data-id="76179" data-linked="Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is 'off the table.'" data-tweet="$CSCO - Cisco (NASDAQ:CSCO): Previous annual sales growth guidance of 12-17% is 'off the table.'">
            </div>
        </div>
    </div>
</li>
'''
html_snippet


In [9]:
from lxml import etree
from io import StringIO
from datetime import date
from tqdm.notebook import tqdm

In [10]:
def jsonParser(json_data):
    """Pull the main ticker and a summary line for every news item in one payload.

    Parameters
    ----------
    json_data : dict
        Must provide 'content' (an HTML fragment, parsed with lxml's HTML
        parser) and 'count' (the number of news items expected in it).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(main_tickers, final_headlines)``. Tickers come from the links in the
        'media-left' blocks with the '/symbol/' prefix removed. Summaries are
        the text of the first ``<li>`` of each ``<ul>`` under a 'media-body'
        block; when no such bullet exists anywhere, the 'media-body' text with
        the headline removed is used, cut at the first non-breaking space.

    Raises
    ------
    AssertionError
        If the number of headlines or of tickers differs from
        ``json_data['count']``.
    """
    markup = json_data['content']
    doc = etree.parse(StringIO(markup), parser=etree.HTMLParser())

    # Headline link text, one entry per news item.
    headlines = doc.xpath("//h4[contains(@class, 'media-heading')]/a/text()")
    assert len(headlines) == json_data['count']

    # The ticker badge links look like '/symbol/CSCO'; keep only the symbol.
    ticker_links = doc.xpath("//div[contains(@class, 'media-left')]//a/@href")
    main_tickers = [link.replace('/symbol/', '') for link in ticker_links]
    assert len(main_tickers) == json_data['count']

    def _joined_text(nodes):
        # Concatenate every text node below each element into a single string.
        return [''.join(node.xpath('.//text()')) for node in nodes]

    # Preferred layout: the summary is the first bullet under each body.
    final_headlines = _joined_text(
        doc.xpath("//div[contains(@class, 'media-body')]/ul/li[1]")
    )
    if final_headlines:
        return main_tickers, final_headlines

    # Fallback layout (no bullets): take the whole body text, remove the
    # headline from it and keep what precedes the first non-breaking space.
    body_texts = _joined_text(doc.xpath("//div[contains(@class, 'media-body')]"))
    final_headlines = [
        body.replace(title, '').split('\xa0')[0].strip()
        for body, title in zip(body_texts, headlines)
    ]
    return main_tickers, final_headlines


In [11]:
data = None
news_frames = []    # one DataFrame per successfully parsed archive member
ret = []            # Create an empty list (unused)
ret_f = []          # Create an empty list (unused)
skipped_files = []  # (member name, error) for members that raised while being processed

# Open the zip file for reading
with zipfile.ZipFile("/kaggle/input/news-trading/headlines_archive", "r") as z:

    # Loop through the list of filenames in the zip archive
    for filename in tqdm(z.namelist()):
        try:
            # Open and read the JSON data from the current file
            with z.open(filename) as f:
                data = f.read()
                json_data = json.loads(data)

            # Only files reporting more than 10 news items are processed
            if json_data.get('count', 0) > 10:

                # Step 1: Parse the news JSON using the jsonParser function
                main_tickers, final_headlines = jsonParser(json_data)

                # Skip this file if the number of summaries doesn't match 'count'
                if len(final_headlines) != json_data['count']:
                    continue

                # Step 2: Derive the publication date from the member name.
                # Slices at [0:4], [5:7], [8:] assume the base name looks like
                # 'YYYY-MM-DD.json' -- TODO confirm against the archive layout.
                file_date = filename.split('/')[-1].replace('.json', '')
                file_date = date(int(file_date[:4]), int(file_date[5:7]), int(file_date[8:]))

                # Step 3: Merge all the data in a data frame
                df_dict = {'ticker': main_tickers,
                           'headline': final_headlines,
                           'date': [file_date] * len(main_tickers)}

                # Create a DataFrame from the dictionary
                df_f = pd.DataFrame(df_dict)

                # Keep it for the single concatenation after the loop
                news_frames.append(df_f)

        except Exception as exc:
            # Best-effort: one bad member must not stop the run, but record it
            # instead of discarding the error. Catching Exception (not a bare
            # `except`) lets KeyboardInterrupt still interrupt the loop.
            skipped_files.append((filename, repr(exc)))

if skipped_files:
    print(f'Skipped {len(skipped_files)} archive member(s) that could not be processed; '
          'see `skipped_files` for details')

# Concatenate all the per-file DataFrames into a single DataFrame
data_df_news = pd.concat(news_frames)

# Display the first few rows of the resulting DataFrame
display(data_df_news.head())

# Print the shape of the DataFrame
print(data_df_news.shape)


  0%|          | 0/3159 [00:00<?, ?it/s]

Unnamed: 0,ticker,headline,date
0,DVA,Davita (NYSE:DVA): EPS of $0.96 beats by $0.01...,2011-05-02
1,CGNX,Cognex (NASDAQ:CGNX): EPS of $0.32 beats by $0...,2011-05-02
2,AEIS,Advanced Energy Industries (NASDAQ:AEIS):,2011-05-02
3,LOCM,Local.com (NASDAQ:LOCM):,2011-05-02
4,CYDEQ,CyberDefender (CYDE):,2011-05-02


(122613, 3)


In [12]:
# Inspect the raw payload still bound to `json_data`, i.e. the last archive member
# the parsing loop decoded with json.loads.
json_data

{'content': '<li class="n-box-item date-title" data-end="1577163599" data-start="1577077200" data-txt="Monday, December 23, 2019">Today - Monday, December 23, 2019</li><li class="n-box-item sa-box-item" data-id="3528295" data-ts="1577141953"><div class="media media-overflow-fix"><div class="media-left"><a class="box-ticker" href="/symbol/AMC" target="_blank">AMC</a></div><div class="media-body"><h4 class="media-heading"><a href="/news/3528295-theater-stocks-fade-on-disappointing-disney-film-debut" sasource="on_the_move_news_fidelity" target="_blank">Theater stocks fade on disappointing Disney film debut</a></h4><ul><li>Movie theater chains AMC Entertainment (<a href=\'https://seekingalpha.com/symbol/AMC\' title=\'AMC Entertainment Holdings, Inc.\'>AMC</a> <font color="red">-5.3%</font>) and Cinemark (<a href=\'https://seekingalpha.com/symbol/CNK\' title=\'Cinemark Holdings, Inc.\'>CNK</a> <font color="red">-3.1%</font>) <a href="https://finance.yahoo.com/news/theater-stocks-fall-box-of