In [51]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd 
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
  
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'} 

In [52]:
header= ['Name', 'Price']
df = pd.DataFrame(columns = header)

def get_page_content(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:  # Check if the request was successful
            return BeautifulSoup(response.text, 'html.parser')
        else:
            print(f"URL is inaccessible, Status code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None

def extract_price(soup):
    try:
        price_overall = soup.find('div', class_='D(ib) Mend(20px)')
        price_element = price_overall.find('fin-streamer', {'data-field': 'regularMarketPrice'})
        return price_element['value'] if price_element else None
    except AttributeError:
        print("Price element not found")
        return None

def extract_name(soup):
    try:
        name_overall = soup.find('div', class_='D(ib) Mt(-5px) Maw(38%)--tab768 Maw(38%) Mend(10px) Ov(h) smartphone_Maw(85%) smartphone_Mend(0px)')
        name_element = name_overall.find('h1', class_='D(ib) Fz(18px)')
        return name_element.text.strip() if name_element else None
    except AttributeError:
        print("Name element not found")
        return None

In [53]:
top_stock_url = 'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-added/'
top_soup = get_page_content(top_stock_url)
table = top_soup.find('table',class_="cwl-symbols W(100%)")
# Find the header row
header_row = table.find('thead').find('tr')
# Extract header names
top_stock_headers = [th.text for th in header_row.find_all('th')]
top_df = pd.DataFrame(columns = top_stock_headers)

# Extract all row data
rows = table.find('tbody').find_all('tr')
for row in rows:
    top_stock_values = [td.text.strip() for td in row.find_all('td')]
    top_df.loc[len(top_df)] = top_stock_values

top_df

Unnamed: 0,Symbol,Company Name,Last Price,Change,% Change,Market Time,Volume,Avg Vol (3 month),Market Cap
0,AAPL,Apple Inc.,230.3397,1.66,+0.73%,11:00 AM EDT,18.48M,69.85M,"3,532.05B"
1,MSFT,Microsoft Corporation,460.8,1.26,+0.27%,11:00 AM EDT,4.52M,18.94M,"3,424.81B"
2,NVDA,NVIDIA Corporation,133.98,2.6,+1.98%,11:00 AM EDT,97.49M,414.95M,"3,295.68B"
3,GOOG,Alphabet Inc.,191.29,0.85,+0.45%,11:00 AM EDT,3.03M,18.76M,"2,354.07B"
4,GOOGL,Alphabet Inc.,189.79,0.81,+0.43%,11:00 AM EDT,4.21M,25.41M,"2,353.66B"
5,AMZN,"Amazon.com, Inc.",198.27,-1.07,-0.54%,11:00 AM EDT,10.50M,41.97M,"2,063.32B"
6,META,"Meta Platforms, Inc.",530.2863,0.29,+0.05%,11:00 AM EDT,2.83M,14.79M,"1,345.09B"
7,TSM,Taiwan Semiconductor Manufacturing Company Lim...,188.9749,4.45,+2.41%,11:00 AM EDT,9.49M,14.63M,980.04B
8,LLY,Eli Lilly and Company,936.8,4.3,+0.46%,10:59 AM EDT,703.54k,2.72M,843.50B
9,TSLA,"Tesla, Inc.",261.2,-1.13,-0.43%,11:00 AM EDT,51.34M,93.36M,833.02B


In [59]:
# new_df = pd.DataFrame(top_df)

# # Drop any missing values
# new_df = new_df.dropna()

# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("gpt2", pad_token="[PAD]")
# model = AutoModelForCausalLM.from_pretrained("gpt2")

# # Generate text with a specific maximum length
# max_length = 500  # Adjust as needed based on your preference
# input_text = "Summarize the following stock data:\n"
# data_list = new_df.apply(lambda row: f"Name: {row['Company Name']}, Price: {row['Last Price']}", axis=1).tolist()
# inputs = tokenizer(input_text + "\n".join(data_list), return_tensors="pt", padding=True, truncation=True)

# # Generate summary
# with torch.no_grad():
#     outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)

# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("\nGenerated Summary:")
# print(generated_text)

In [54]:
top_10_stocks = top_df["Symbol"].iloc[0:11]
print(top_10_stocks)
urls = []
for index, symbol in top_10_stocks.items():
    urls.append(f"https://sg.finance.yahoo.com/quote/{symbol}")

0      AAPL
1      MSFT
2      NVDA
3      GOOG
4     GOOGL
5      AMZN
6      META
7       TSM
8       LLY
9      TSLA
10     AVGO
Name: Symbol, dtype: object


In [55]:
for url in urls:
    soup = get_page_content(url)
    if soup:
        name = extract_name(soup)
        price = extract_price(soup)
        
        if name and price:
            new_data = {'Name': name, 'Price': price}
            df = df._append(new_data, ignore_index=True)
        else:
            print(f"Failed to extract data from {url}")

# Print the final DataFrame
print("\nFinal DataFrame:")
print(df)


Final DataFrame:
                                                 Name     Price
0                                   Apple Inc. (AAPL)    230.33
1                        Microsoft Corporation (MSFT)    460.67
2                           NVIDIA Corporation (NVDA)    133.96
3                                Alphabet Inc. (GOOG)    191.28
4                               Alphabet Inc. (GOOGL)    189.76
5                             Amazon.com, Inc. (AMZN)    198.26
6                         Meta Platforms, Inc. (META)  530.1889
7   Taiwan Semiconductor Manufacturing Company Lim...    188.91
8                         Eli Lilly and Company (LLY)     936.8
9                                  Tesla, Inc. (TSLA)  261.0464
10                               Broadcom Inc. (AVGO)   1735.69


In [57]:
# Drop rows with missing values and convert Price to float
df = df.dropna()
df['Price'] = df['Price'].astype(float)

# Convert DataFrame to a list of strings (ensure each row is unique)
data_list = df.apply(lambda row: f"Name: {row['Name']}, Price: {row['Price']}", axis=1).tolist()

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2", pad_token="[PAD]") 
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Generate text with a specific maximum length
max_length = 200 # Adjust as needed based on your preference
input_text = "Summarize the following stock data:\n" + "\n".join(data_list)
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nGenerated Summary:")
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Summary:
Summarize the following stock data:
Name: Apple Inc. (AAPL), Price: 230.33
Name: Microsoft Corporation (MSFT), Price: 460.67
Name: NVIDIA Corporation (NVDA), Price: 133.96
Name: Alphabet Inc. (GOOG), Price: 191.28
Name: Alphabet Inc. (GOOGL), Price: 189.76
Name: Amazon.com, Inc. (AMZN), Price: 198.26
Name: Meta Platforms, Inc. (META), Price: 530.1889
Name: Taiwan Semiconductor Manufacturing Company Limited (TSM), Price: 188.91
Name: Eli Lilly and Company (LLY), Price: 936.8
Name: Tesla, Inc. (TSLA), Price: 261.0464
Name: Broadcom Inc. (AVGO), Price: 1735.69
Name: Intel Corporation (INTC


Using Jina to scrap for data

In [None]:
# def jinaai_readerapi_web_scrape_url(url):
#   response = requests.get("https://r.jina.ai/" + url)
#   return response.text

# data = jinaai_readerapi_web_scrape_url(url)
# print(data)

Using yfinance api to scrape for data

In [None]:
# import yfinance as yf

# def get_stock_data(ticker):
#     stock = yf.Ticker(ticker)
#     data = stock.history(period='1mo')
#     return data  # Return the stock data DataFrame

# stock_ticker = 'AAPL'

# # Get stock data
# stock_data = get_stock_data(stock_ticker)

# # Print stock data
# print(f"Stock Data for {stock_ticker}:\n{stock_data}")


In [None]:
# urls = ['https://sg.finance.yahoo.com/quote/NVDA/','https://sg.finance.yahoo.com/quote/AAPL/']
# for url in urls:
#     page = requests.get(url, headers=headers)
#     soup = BeautifulSoup(page.text, 'html.parser')
#     if page.status_code == 200:
#         price_overall = soup.find('div', class_='D(ib) Mend(20px)')
#         price_element = price_overall.find('fin-streamer', {'data-field': 'regularMarketPrice'})
#         if price_element:
#             price = price_element['value']
#             print(f"Price: {price}")
#         else:
#             print("Price element not found")
#             continue
            
#     # Extracting the name (if available)
#         name_overall = soup.find('div', class_ = "D(ib) Mt(-5px) Maw(38%)--tab768 Maw(38%) Mend(10px) Ov(h) smartphone_Maw(85%) smartphone_Mend(0px)")
#         name_element = name_overall.find('h1',class_='D(ib) Fz(18px)')
#         if name_element:
#             name = name_element.text.strip()
#             print(f"Name: {name}")
#         else:
#             print("Name element not found")
#             continue

#         new_df ={'Name': name,'Price': price,}
#         df = df._append(new_df,ignore_index = True)

#     else:
#         print("URL is inaccessible, Status code: {page.status_code}")


# # Print the final DataFrame
# print("\nFinal DataFrame:")
# print(df)

In [None]:
# overall = soup.find('div', class_ = 'D(ib) Mend(20px)')
# print(overall)

In [None]:
#  # Extracting the price
# price_overall = soup.find('div', class_='D(ib) Mend(20px)')
# price_element = price_overall.find('fin-streamer', {'data-field': 'regularMarketPrice'})
# if price_element:
#     price = price_element['value']
#     print(f"Price: {price}")
# else:
#     print("Price element not found")
        
# # Extracting the name (if available)
# name_overall = soup.find('div', class_ = "D(ib) Mt(-5px) Maw(38%)--tab768 Maw(38%) Mend(10px) Ov(h) smartphone_Maw(85%) smartphone_Mend(0px)")
# name_element = name_overall.find('h1',class_='D(ib) Fz(18px)')
# if name_element:
#     name = name_element.text.strip()
#     print(f"Name: {name}")
