In [16]:
import os
import requests
import yfinance as yf
from textblob import TextBlob
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from urllib.parse import unquote
from scipy.special import softmax
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import BartTokenizer, BartForConditionalGeneration

In [2]:
load_dotenv()
api_token=os.getenv('HUGGINGFACEHUB_API_TOKEN')
os.environ['HF_TOKEN']=api_token
os.environ['HUGGINGFACEHUB_API_TOKEN']=api_token

In [3]:
def get_stock_sector(stock_symbol):
    try:
        stock = yf.Ticker(stock_symbol)
        info = stock.info
        sector = info.get('sector', 'Sector information not available')
        industry=info.get('industry','Industry information not available')        
        return sector,industry
    except Exception as e:
        print("Error:", e)
        return None

In [5]:
stock_symbol = input("Enter Stock Symbol: ")
sector,industry = get_stock_sector(stock_symbol)
if sector and industry:
    print("Stock:",stock_symbol)
    print("Sector:", sector)
    print('Industry:',industry)
elif sector:
    print("Stock:",stock_symbol)
    print("Sector:", sector)
    print('Industry:',"Falied to retrive industry information")
elif industry:
    print("Stock:",stock_symbol)
    print("Sector:", "Failed to retrieve sector information.")
    print('Industry:',industry)   
else:
    print("Failed to retrieve sector information.")

Stock: AAPL
Sector: Technology
Industry: Consumer Electronics


In [6]:
def scraping_article(url):
    headers = {
    'User-Agent': 'Your User Agent String',
    }
    r=requests.get(url,headers=headers)
    soup=BeautifulSoup(r.text,'html.parser')
    paragraphs=soup.find_all('p')
    text= [paragraph.text for paragraph in paragraphs]
    words=' '.join(text).split(' ')
    article = ' '.join(words)
    return article

In [7]:
def find_url(keyword):
    
    root = "https://www.google.com/"
    search_query = keyword.replace(" ", "+")
    link = f"https://www.google.com/search?q={search_query}&tbm=nws"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(link, headers=headers)
    webpage = response.content
    soup = BeautifulSoup(webpage, 'html5lib')
    links = []
    for div_tag in soup.find_all('div', class_='Gx5Zad'):
        a_tag = div_tag.find('a')
        if a_tag:
            if 'href' in a_tag.attrs:
                href = a_tag['href']
                if href.startswith('/url?q='):
                    url = href.split('/url?q=')[1].split('&sa=')[0]
                    links.append(url)
    return links    

In [8]:
def to_chunks(data):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=50
    )
    docs=text_splitter.split_text(data)
    return docs

In [9]:
def load_bart_model(model_name="facebook/bart-large-cnn"):
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model

In [17]:
def find_news_url(keyword, start_date, end_date):
    root = "https://www.google.com/"
    search_query = keyword.replace(" ", "+")
    link = f"{root}search?q={search_query}&tbm=nws&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    news_links = []

    for article in soup.select('div.SoaBEf'):
        link = article.select_one('a')
        if link and 'href' in link.attrs:
            url = link['href']
            if url.startswith('/url?q='):
                url = unquote(url.split('/url?q=')[1].split('&sa=')[0])
            news_links.append(url)

    return news_links

In [11]:
def summarize_text(tokenizer, model, text, max_chunk_length, summary_max_length):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_chunk_length, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=summary_max_length, min_length=200, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [12]:
def summarize_article(url, model_name="facebook/bart-large-cnn"):
    data = scraping_article(url)
    chunks = to_chunks(data)
    # tokenizer, model=load_pegasus_model("google/pegasus-xsum")
    tokenizer, model = load_bart_model(model_name)
    summaries = []
    for chunk in chunks:
        chunk_text = chunk
        summary = summarize_text(tokenizer, model, chunk_text,3000,800)
        summaries.append(summary)
    concatenated_summaries = " ".join(summaries)
    #  Second summarization pass: Summarize the concatenated summaries
    intermediate_chunks = [concatenated_summaries[i:i+3000] for i in range(0, len(concatenated_summaries), 3000)]
    final_summaries = []
    for intermediate_chunk in intermediate_chunks:
        final_summary = summarize_text(tokenizer, model, intermediate_chunk,3000,800)
        final_summaries.append(final_summary)    
    final_summary_text = " ".join(final_summaries)    
    return final_summary_text

In [15]:
# url=find_url('elon musk')[0]
url="https://www.hrkatha.com/global-hr-news/tesla-to-lay-off-601-more-employees-amidst-market-challenges/"
summary = summarize_article(url)
print(summary)

Elon Musk, CEO, Tesla, first disclosed the intention to reduce the company’s workforce by over 10 per cent on 15 April. Since then, multiple rounds of layoffs have been executed, with Musk reportedly aiming for a 20 per cent reduction in headcount. The latest round of layoffs will primarily impact workers at Tesla's Palo Alto and Fremont facilities. The termination process set to commence within a 14-day period starting 20 June, 2024, according to the Worker Adjustment and Retraining Notification (WARN) notice issued by Tesla. Last month, Tesla announced plans to cut 6,020 jobs in California and Texas as part of its broader downsizing strategy. The move also aligns with broader trends in the tech and automotive industries, where companies prioritise cost management and operational efficiency. For Tesla, maintaining market leadership requires a balance between innovation and financial prudence. The company says it is committed to being a leader in the electric-vehicle industry and will 

In [13]:
def senti_model(model_name="cardiffnlp/twitter-roberta-base-sentiment"):
    tokenizer=AutoTokenizer.from_pretrained(model_name)
    model=AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer,model

In [14]:
def find_senti(news_texts):
    tokenizer,model=senti_model()
    encoded=tokenizer(news_texts,return_tensors='pt')
    output=model(**encoded)
    scores=output[0][0].detach().numpy()
    scores=softmax(scores)
    weights = {
        'neg': -1,
        'neu': 0,
        'pos': 1
    }
    probabilities = {
        'neg': scores[0],
        'neu': scores[1],
        'pos': scores[2]
    }
    compound_score = sum(probabilities[label] * weights[label] for label in probabilities)
    senti_dict={
        'neg':scores[0],
        'neu': scores[1],
        'pos': scores[2],
        'polarity':compound_score        
    }
    return senti_dict


In [15]:
def subjectivity_analysis(text):
    blob = TextBlob(text)
    subjectivity = blob.sentiment.subjectivity
    return subjectivity

def relevance_scoring(summary, stock_name, domain_keywords):
    relevance_score = sum(1 for word in domain_keywords if word in summary) + (stock_name in summary)
    return relevance_score

def urgency_scoring(summary):
    # Look for time-sensitive words or phrases in the summary
    urgency_keywords = ['breaking', 'urgent', 'immediate']
    urgency_score = sum(1 for word in urgency_keywords if word in summary)
    return urgency_score

def mention_count(summary, stock_name, domain_keywords):
    # Count how many times the stock name and domain keywords appear in the summary
    count = sum(summary.count(word) for word in domain_keywords) + summary.count(stock_name)
    return count


In [16]:
def extract_features(summary, stock_name, domain_keywords):
    # Sentiment analysis
    sentiment_scores = find_senti(summary)
    subjectivity_score = subjectivity_analysis(summary)
    
    # Financial analysis
    relevance_score = relevance_scoring(summary, stock_name, domain_keywords)
    urgency_score = urgency_scoring(summary)
    mention_count_score = mention_count(summary, stock_name, domain_keywords)
    
    # Construct feature vector
    features = {
        'compound_sentiment_score': sentiment_scores['polarity'],
        'subjectivity_score': subjectivity_score,
        'relevance_score': relevance_score,
        'urgency_score': urgency_score,
        'mention_count': mention_count_score,
        'negative_sentiment_score': sentiment_scores['neg'],
        'neutral_sentiment_score': sentiment_scores['neu'],
        'positive_sentiment_score': sentiment_scores['pos']
    }
    
    return features


In [17]:
scores=extract_features(summary,"tesla","technology")
print(scores)



{'compound_sentiment_score': 0.5478894338011742, 'subjectivity_score': 0.5444444444444444, 'relevance_score': 10, 'urgency_score': 1, 'mention_count': 355, 'negative_sentiment_score': 0.008158185, 'neutral_sentiment_score': 0.43579414, 'positive_sentiment_score': 0.5560476}


In [18]:
print(url)

https://timesofindia.indiatimes.com/technology/tech-news/elon-musk-has-a-clarification-for-go-fk-yourselves-comment/articleshow/111167837.cms


In [22]:
abc=scraping_article("https://www.motor1.com/news/702440/tesla-plaid-vs-dodge-demon-170-drag-race/")

In [23]:
print(abc)


