In [3]:
import certifi
from pymongo import MongoClient
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime, timedelta  # Import timedelta module
import pytz


def load_sentiment_keywords():
    excel_path = r"C:\Users\91908\Documents\Raja\Share market\Analysis\Trendlyne\Data\Scrip\Scrip23012024.xlsx"
    sheet_name = "Sentiment"

    try:
        df = pd.read_excel(excel_path, sheet_name=sheet_name)
        return set(df['Keyword'].str.lower())
    except Exception as e:
        print(f"Error loading sentiment keywords: {e}")
        return set()

def analyze_sentiment(text):
    sentiment_analysis = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
    try:
        result = sentiment_analysis(text)
        if result:
            return result[0]
        else:
            return {'label': '-99 stars', 'score': 0.0}
    except Exception as e:
        print(f"Error analyzing sentiment: {e}")
        return {'label': '-99 stars', 'score': 0.0}

def insert_record_into_mongodb(record, database_name, collection_name):
    ca = certifi.where()
    #uri = "mongodb+srv://ranguchamy:J8ePGYKw7XRdYZBg@stockanalytics.jkcqv2m.mongodb.net/?retryWrites=true&w=majority"
    uri = "mongodb://localhost:27017"

    # Create a new client and connect to the MongoDB server
    #client = MongoClient(uri, tlsCAFile=ca)
    client = MongoClient(uri)

    # Access the specified collection
    collection = client[database_name][collection_name]

    try:
        # Check if the paragraph_content already exists
        if not record_exists_in_mongodb(collection, record["paragraph_content"]):
            # Insert the record into the collection
            collection.insert_one(record)
            print("Record inserted successfully.")
        else:
            print("Record with paragraph_content already exists. Skipping insertion.")
    except Exception as e:
        print(f"Error inserting record into MongoDB: {e}")
    finally:
        client.close()

def record_exists_in_mongodb(collection, paragraph_content):
    # Check if the paragraph_content already exists in the collection
    existing_record = collection.find_one({"paragraph_content": paragraph_content})
    return existing_record is not None

def read_html_data(html_content, sentiment_keywords):
    try:
        soup = BeautifulSoup(html_content, "html.parser")

   
        for li_tag in soup.find_all('li', {'class': 'timeline-item'}):
            article_body_span = li_tag.find('span', {'itemprop': 'articleBody'})
            headline_h3 = li_tag.find('h3', {'itemprop': 'headline'})

            # Use collected_content for both span and h3 tags
            collected_content = []

            if article_body_span and not article_body_span.contents:
                # If span tag is empty, use content from h3 tag
                content = headline_h3.get_text(strip=True)
                collected_content.append(content)
            elif article_body_span:
                # Collect content from the span tag
                collected_content.extend(ptag.get_text(strip=True) for ptag in article_body_span.find_all('p'))
            else:
                continue  # Move to the next iteration if both span and h3 are not found

            # print(f"Paragraph Content: {' '.join(collected_content)}")



            
            paragraph_content = ' '.join(collected_content)

            if any(keyword in paragraph_content.lower() for keyword in sentiment_keywords):

                sentiment_result = analyze_sentiment(paragraph_content)
    
                if sentiment_result:
                    sentiment_data = {
                        "label": sentiment_result["label"],
                        "confidence": sentiment_result["score"]
                    }
    
                    # Use current date and time in IST
                    current_datetime_ist = datetime.now(pytz.timezone("Asia/Kolkata"))
                    #yesterday_datetime_ist = datetime.now(pytz.timezone("Asia/Kolkata")) - timedelta(days=1)
    
    
                    record = {
                        "paragraph_content": paragraph_content,
                        "sentiment": sentiment_data,
                        "created_at": current_datetime_ist,
                        "created_by": "user123",
                        "deleted_at": current_datetime_ist,
                        "deleted_by": "user789",
                        "is_deleted": False,
                        "is_purged": False,
                        "last_scraped": current_datetime_ist,
                        "purge_at": current_datetime_ist,
                        "purged_by": "admin_02",
                        "updated_at": current_datetime_ist,
                        "updated_by": "user456"
                    }
    
                    # Specify the MongoDB database and collection names
                    database_name = "NewsAnalytics"
                    collection_name = "RawNews_Hindu"
                    # Insert the record into MongoDB
                    insert_record_into_mongodb(record, database_name, collection_name)
        print("Completed")
    except Exception as e:
        print(f"Error parsing the HTML: {e}")

def economictimes_headlines():
    print("Fetching headlines from Economic Times...")
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-03-january-2024/article67698291.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-02-january-2024/article67695347.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-markets/share-market-nifty-sensex-live-updates-04-january-2024/article67702215.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-05-january-2024/article67706161.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-08-january-2024/article67715949.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-09-january-2024/article67720535.ece"
    #url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-10-january-2024/article67724321.ece"
    ## url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-11-january-2024/article67727041.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-12-january-2024/article67730865.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-15-january-2024/article67740526.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-16-january-2024/article67743130.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-17-january-2023/article67744658.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-18-january-2024/article67748508.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-19-january-2024/article67752291.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-20-january-2024/article67757092.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-23-january-2024/article67762366.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-24-january-2024/article67768511.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-25-january-2024/article67773490.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-update-29-january-2024/article67786136.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-30-january-2023/article67789893.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-31-january-2023/article67793554.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-1-february-2024/article67796794.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-2-february-2024/article67800793.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-5-february-2024/article67810781.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-6-february-2024/article67813870.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-7-february-2024/article67817593.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-08-february-2024/article67821745.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-9-february-2024/article67825410.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-12-february-2024/article67836096.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-13-february-2024/article67838229.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-14-february-2024/article67842737.ece"

    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-15-february-2024/article67845496.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-16-february-2024/article67849197.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-updates-19-february-2024/article67860834.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-updates-20-february-2024/article67863390.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-21-february-2024/article67867142.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-22-february-2024/article67871508.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-23-february-2024/article67875397.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-26-february-2024/article67885065.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-27-february-2024/article67888298.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-28-february-2024/article67891443.ece"
    # url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-29-february-2024/article67895848.ece"
    # url = "https://www.thehindubusinessline.com/markets/stock-market-highlights-01-march-2024/article67899821.ece"
    url = "https://www.thehindubusinessline.com/markets/share-market-nifty-sensex-live-updates-4-march-2024/article67909983.ece"


    try:
        page_request = requests.get(url)
        page_request.raise_for_status()
        print("Status:", page_request.status_code)
    except requests.exceptions.RequestException as e:
        print(f"Error making the request: {e}")
        return

    try:
        data = page_request.content
        sentiment_keywords = load_sentiment_keywords()
        read_html_data(data, sentiment_keywords)
    except Exception as e:
        print(f"Error parsing the page: {e}")

if __name__ == "__main__":

    sentiment_keywords = load_sentiment_keywords()
    economictimes_headlines()


Fetching headlines from Economic Times...
Status: 200
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record inserted successfully.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.


Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors


Error analyzing sentiment: The size of tensor a (773) must match the size of tensor b (512) at non-singleton dimension 1
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.


Token indices sequence length is longer than the specified maximum sequence length for this model (680 > 512). Running this sequence through the model will result in indexing errors


Error analyzing sentiment: The size of tensor a (680) must match the size of tensor b (512) at non-singleton dimension 1
Record with paragraph_content already exists. Skipping insertion.


Token indices sequence length is longer than the specified maximum sequence length for this model (1982 > 512). Running this sequence through the model will result in indexing errors


Error analyzing sentiment: The size of tensor a (1982) must match the size of tensor b (512) at non-singleton dimension 1
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Record with paragraph_content already exists. Skipping insertion.
Completed


In [4]:
import certifi
from pymongo import MongoClient
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from datetime import datetime
import pytz


# MongoDB connection URI (replace with your actual URI)
uri = "mongodb://localhost:27017"
# uri = "mongodb+srv://ranguchamy:J8ePGYKw7XRdYZBg@stockanalytics.jkcqv2m.mongodb.net/?retryWrites=true&w=majority"

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# def analyze_finBERT_sentiment(text):
#     inputs = tokenizer(text, return_tensors="pt", padding=True)
#     outputs = finbert(**inputs)[0]
#     sentiment_label = np.argmax(outputs.detach().numpy())
#     return sentiment_label

def analyze_finBERT_sentiment(text):
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True)
        outputs = finbert(**inputs)[0]
        sentiment_label = np.argmax(outputs.detach().numpy())
        return sentiment_label
    except Exception as e:
        print(f"Error analyzing FinBERT sentiment: {e}")
        return 0

try:
    with MongoClient(uri) as client:
        # Specify the database and collection
        database_name = "NewsAnalytics"
        collection_name = "RawNews_Hindu"
        # Access the specified collection
        collection = client[database_name][collection_name]

        # Query all records in the collection
        all_records = collection.find()

        # Get the total count of records
        total_records = collection.count_documents({})

        # Iterate over each record
        for record in all_records:
            if "paragraph_content" in record and "FinBertScore" not in record:
                paragraph_content = record["paragraph_content"]

                # Analyze FinBERT sentiment for the paragraph_content
                sentiment_label = analyze_finBERT_sentiment(paragraph_content)
                # Format the date in the desired format
                current_datetime_ist = datetime.now(pytz.timezone("Asia/Kolkata"))

                # Update the record with the new field "FinBertScore" and metadata
                update_data = {
                    "$set": {
                        "FinBertScore": int(sentiment_label),
                        "updated_at": current_datetime_ist,
                        "updated_by": "FIN_BERT_Admin"
                    }
                }
                collection.update_one({"_id": record["_id"]}, update_data)

                # Print statement for successful entry
                print(f"Processed record {record['_id']} - FinBertScore: {int(sentiment_label)}, Updated at: {current_datetime_ist}, Updated by: FIN_BERT_Admin")
        print("Completed")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Processed record 65e561b6cf427725b1f03cfb - FinBertScore: 0, Updated at: 2024-03-04 11:23:54.630522+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561b8cf427725b1f03cfd - FinBertScore: 0, Updated at: 2024-03-04 11:23:54.777523+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561bacf427725b1f03cff - FinBertScore: 0, Updated at: 2024-03-04 11:23:54.972905+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561bccf427725b1f03d01 - FinBertScore: 0, Updated at: 2024-03-04 11:23:55.155208+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561becf427725b1f03d03 - FinBertScore: 0, Updated at: 2024-03-04 11:23:55.507084+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561c0cf427725b1f03d05 - FinBertScore: 1, Updated at: 2024-03-04 11:23:55.860758+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561c4cf427725b1f03d07 - FinBertScore: 0, Updated at: 2024-03-04 11:23:56.271022+05:30, Updated by: FIN_BERT_Admin
Processed record 65e561c7cf427725b1f03d09 - FinBertScore: 0, U