In [1]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd 
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
  
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'} 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
header= ['Name', 'Price']
df = pd.DataFrame(columns = header)

def get_page_content(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:  # Check if the request was successful
            return BeautifulSoup(response.text, 'html.parser')
        else:
            print(f"URL is inaccessible, Status code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None

def extract_price(soup):
    try:
        price_overall = soup.find('div', class_='D(ib) Mend(20px)')
        price_element = price_overall.find('fin-streamer', {'data-field': 'regularMarketPrice'})
        return price_element['value'] if price_element else None
    except AttributeError:
        print("Price element not found")
        return None

def extract_name(soup):
    try:
        name_overall = soup.find('div', class_='D(ib) Mt(-5px) Maw(38%)--tab768 Maw(38%) Mend(10px) Ov(h) smartphone_Maw(85%) smartphone_Mend(0px)')
        name_element = name_overall.find('h1', class_='D(ib) Fz(18px)')
        return name_element.text.strip() if name_element else None
    except AttributeError:
        print("Name element not found")
        return None

In [3]:
top_stock_url = 'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-added/'
top_soup = get_page_content(top_stock_url)
table = top_soup.find('table',class_="cwl-symbols W(100%)")
# Find the header row
header_row = table.find('thead').find('tr')
# Extract header names
top_stock_headers = [th.text for th in header_row.find_all('th')]
top_df = pd.DataFrame(columns = top_stock_headers)

# Extract all row data
rows = table.find('tbody').find_all('tr')
for row in rows:
    top_stock_values = [td.text.strip() for td in row.find_all('td')]
    top_df.loc[len(top_df)] = top_stock_values

top_df

Unnamed: 0,Symbol,Company Name,Last Price,Change,% Change,Market Time,Volume,Avg Vol (3 month),Market Cap
0,AAPL,Apple Inc.,232.98,4.3,+1.88%,4:00 PM EDT,62.31M,69.39M,"3,572.54B"
1,MSFT,Microsoft Corporation,466.25,6.71,+1.46%,4:00 PM EDT,17.76M,18.94M,"3,465.31B"
2,NVDA,NVIDIA Corporation,134.91,3.53,+2.69%,4:00 PM EDT,235.67M,411.94M,"3,318.56B"
3,GOOG,Alphabet Inc.,192.66,2.22,+1.17%,4:00 PM EDT,11.74M,18.66M,"2,370.91B"
4,AMZN,"Amazon.com, Inc.",199.79,0.45,+0.23%,4:00 PM EDT,32.46M,41.85M,"2,079.13B"
5,META,"Meta Platforms, Inc.",534.69,4.69,+0.88%,4:00 PM EDT,10.35M,14.80M,"1,356.26B"
6,TSM,Taiwan Semiconductor Manufacturing Company Lim...,191.05,6.53,+3.54%,4:00 PM EDT,19.16M,14.75M,990.80B
7,LLY,Eli Lilly and Company,939.78,7.28,+0.78%,4:00 PM EDT,2.67M,2.73M,846.18B
8,TSLA,"Tesla, Inc.",263.26,0.93,+0.35%,4:00 PM EDT,127.74M,93.92M,839.59B
9,AVGO,Broadcom Inc.,1744.69,11.38,+0.66%,4:00 PM EDT,3.96M,3.51M,812.13B


In [4]:
# new_df = pd.DataFrame(top_df)

# # Drop any missing values
# new_df = new_df.dropna()

# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("gpt2", pad_token="[PAD]")
# model = AutoModelForCausalLM.from_pretrained("gpt2")

# # Generate text with a specific maximum length
# max_length = 500  # Adjust as needed based on your preference
# input_text = "Summarize the following stock data:\n"
# data_list = new_df.apply(lambda row: f"Name: {row['Company Name']}, Price: {row['Last Price']}", axis=1).tolist()
# inputs = tokenizer(input_text + "\n".join(data_list), return_tensors="pt", padding=True, truncation=True)

# # Generate summary
# with torch.no_grad():
#     outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)

# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("\nGenerated Summary:")
# print(generated_text)

In [5]:
top_10_stocks = top_df["Symbol"].iloc[0:11]
print(top_10_stocks)
urls = []
for index, symbol in top_10_stocks.items():
    urls.append(f"https://sg.finance.yahoo.com/quote/{symbol}")

0     AAPL
1     MSFT
2     NVDA
3     GOOG
4     AMZN
5     META
6      TSM
7      LLY
8     TSLA
9     AVGO
10    COST
Name: Symbol, dtype: object


In [6]:
for url in urls:
    soup = get_page_content(url)
    if soup:
        name = extract_name(soup)
        price = extract_price(soup)
        
        if name and price:
            new_data = {'Name': name, 'Price': price}
            df = df._append(new_data, ignore_index=True)
        else:
            print(f"Failed to extract data from {url}")

# Print the final DataFrame
print("\nFinal DataFrame:")
print(df)


Final DataFrame:
                                                 Name    Price
0                                   Apple Inc. (AAPL)   232.98
1                        Microsoft Corporation (MSFT)   466.25
2                           NVIDIA Corporation (NVDA)   134.91
3                                Alphabet Inc. (GOOG)   192.66
4                             Amazon.com, Inc. (AMZN)   199.79
5                         Meta Platforms, Inc. (META)   534.69
6   Taiwan Semiconductor Manufacturing Company Lim...   191.05
7                         Eli Lilly and Company (LLY)   939.78
8                                  Tesla, Inc. (TSLA)   263.26
9                                Broadcom Inc. (AVGO)  1744.69
10                Costco Wholesale Corporation (COST)   884.31


In [7]:
# Drop rows with missing values and convert Price to float
df = df.dropna()
df['Price'] = df['Price'].astype(float)

# Convert DataFrame to a list of strings (ensure each row is unique)
data_list = df.apply(lambda row: f"Name: {row['Name']}, Price: {row['Price']}", axis=1).tolist()

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2", pad_token="[PAD]") 
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Generate text with a specific maximum length
max_length = 200 # Adjust as needed based on your preference
input_text = "Summarize the following stock data:\n" + "\n".join(data_list)
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nGenerated Summary:")
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Summary:
Summarize the following stock data:
Name: Apple Inc. (AAPL), Price: 232.98
Name: Microsoft Corporation (MSFT), Price: 466.25
Name: NVIDIA Corporation (NVDA), Price: 134.91
Name: Alphabet Inc. (GOOG), Price: 192.66
Name: Amazon.com, Inc. (AMZN), Price: 199.79
Name: Meta Platforms, Inc. (META), Price: 534.69
Name: Taiwan Semiconductor Manufacturing Company Limited (TSM), Price: 191.05
Name: Eli Lilly and Company (LLY), Price: 939.78
Name: Tesla, Inc. (TSLA), Price: 263.26
Name: Broadcom Inc. (AVGO), Price: 1744.69
Name: Costco Wholesale Corporation (COST), Price: 884.31
Name: Microsoft Corporation (


Using Jina to scrap for data

In [8]:
# def jinaai_readerapi_web_scrape_url(url):
#   response = requests.get("https://r.jina.ai/" + url)
#   return response.text

# data = jinaai_readerapi_web_scrape_url(url)
# print(data)

Using yfinance api to scrape for data

In [9]:
# import yfinance as yf

# def get_stock_data(ticker):
#     stock = yf.Ticker(ticker)
#     data = stock.history(period='1mo')
#     return data  # Return the stock data DataFrame

# stock_ticker = 'AAPL'

# # Get stock data
# stock_data = get_stock_data(stock_ticker)

# # Print stock data
# print(f"Stock Data for {stock_ticker}:\n{stock_data}")


In [10]:
# urls = ['https://sg.finance.yahoo.com/quote/NVDA/','https://sg.finance.yahoo.com/quote/AAPL/']
# for url in urls:
#     page = requests.get(url, headers=headers)
#     soup = BeautifulSoup(page.text, 'html.parser')
#     if page.status_code == 200:
#         price_overall = soup.find('div', class_='D(ib) Mend(20px)')
#         price_element = price_overall.find('fin-streamer', {'data-field': 'regularMarketPrice'})
#         if price_element:
#             price = price_element['value']
#             print(f"Price: {price}")
#         else:
#             print("Price element not found")
#             continue
            
#     # Extracting the name (if available)
#         name_overall = soup.find('div', class_ = "D(ib) Mt(-5px) Maw(38%)--tab768 Maw(38%) Mend(10px) Ov(h) smartphone_Maw(85%) smartphone_Mend(0px)")
#         name_element = name_overall.find('h1',class_='D(ib) Fz(18px)')
#         if name_element:
#             name = name_element.text.strip()
#             print(f"Name: {name}")
#         else:
#             print("Name element not found")
#             continue

#         new_df ={'Name': name,'Price': price,}
#         df = df._append(new_df,ignore_index = True)

#     else:
#         print("URL is inaccessible, Status code: {page.status_code}")


# # Print the final DataFrame
# print("\nFinal DataFrame:")
# print(df)

In [11]:
# overall = soup.find('div', class_ = 'D(ib) Mend(20px)')
# print(overall)

In [12]:
#  # Extracting the price
# price_overall = soup.find('div', class_='D(ib) Mend(20px)')
# price_element = price_overall.find('fin-streamer', {'data-field': 'regularMarketPrice'})
# if price_element:
#     price = price_element['value']
#     print(f"Price: {price}")
# else:
#     print("Price element not found")
        
# # Extracting the name (if available)
# name_overall = soup.find('div', class_ = "D(ib) Mt(-5px) Maw(38%)--tab768 Maw(38%) Mend(10px) Ov(h) smartphone_Maw(85%) smartphone_Mend(0px)")
# name_element = name_overall.find('h1',class_='D(ib) Fz(18px)')
# if name_element:
#     name = name_element.text.strip()
#     print(f"Name: {name}")
