In [58]:
# Assuming the 'published' column represents the date, you can rename it to 'date'
news.rename(columns={'published': 'date'}, inplace=True)

# Now, select only the 'date', 'title', and 'content' columns
news_filtered = news[['date', 'title', 'content']]

# Display the first few rows of the filtered dataframe
news_filtered

Unnamed: 0,date,title,content
0,2024-10-17 08:50:57,Indian Schools To Benefit From Standard Capita...,This website is using a security service to pr...
1,2024-10-17 07:11:13,Vedanta shares down 1.35% as Nifty drops - The...,(What's movingSensexandNiftyTracklatest market...
2,2024-10-17 07:08:08,Vedanta shares down 1.35% as Nifty drops - Eco...,(What's movingSensexandNiftyTracklatest market...
3,2024-10-17 06:47:17,"1:1 Bonus, 1:2 Split, 48 Dividends; Not Vedant...",This website is using a security service to pr...
4,2024-10-17 05:30:00,Volumes soar at CRISIL Ltd counter - Business ...,Reference #18.e055a68.1729273976.7fb466b https...
...,...,...,...
1751,2023-10-23 07:00:00,Vedanta CFO likely to quit months after joinin...,Legal Mastering M&A Deal Making By - Ashwath R...
1752,2023-10-23 07:00:00,Govt may space out stake sale in Hindustan Zin...,Reference #18.a5055a68.1729281894.47741ba7 htt...
1753,2023-10-22 07:00:00,Vedanta-promoted Hindustan Zinc bets on indust...,Reference #18.a5055a68.1729281911.47747830 htt...
1754,2023-10-21 07:00:00,‘Reforming Penal Laws a necessity’: Law Minist...,Bringing you the Best Analytical Legal News BH...


In [59]:
# Function to check length and replace values with NaN if less than 10 characters
def filter_short_text(value):
    if len(str(value)) < 25:
        return np.nan
    return value

# Apply the filter to the 'title' and 'content' columns
news_filtered['title'] = news_filtered['title'].apply(filter_short_text)
news_filtered['content'] = news_filtered['content'].apply(filter_short_text)

news_filtered

Unnamed: 0,date,title,content
0,2024-10-17 08:50:57,Indian Schools To Benefit From Standard Capita...,This website is using a security service to pr...
1,2024-10-17 07:11:13,Vedanta shares down 1.35% as Nifty drops - The...,(What's movingSensexandNiftyTracklatest market...
2,2024-10-17 07:08:08,Vedanta shares down 1.35% as Nifty drops - Eco...,(What's movingSensexandNiftyTracklatest market...
3,2024-10-17 06:47:17,"1:1 Bonus, 1:2 Split, 48 Dividends; Not Vedant...",This website is using a security service to pr...
4,2024-10-17 05:30:00,Volumes soar at CRISIL Ltd counter - Business ...,Reference #18.e055a68.1729273976.7fb466b https...
...,...,...,...
1751,2023-10-23 07:00:00,Vedanta CFO likely to quit months after joinin...,Legal Mastering M&A Deal Making By - Ashwath R...
1752,2023-10-23 07:00:00,Govt may space out stake sale in Hindustan Zin...,Reference #18.a5055a68.1729281894.47741ba7 htt...
1753,2023-10-22 07:00:00,Vedanta-promoted Hindustan Zinc bets on indust...,Reference #18.a5055a68.1729281911.47747830 htt...
1754,2023-10-21 07:00:00,‘Reforming Penal Laws a necessity’: Law Minist...,Bringing you the Best Analytical Legal News BH...


In [60]:
news_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     1756 non-null   object
 1   title    1756 non-null   object
 2   content  1696 non-null   object
dtypes: object(3)
memory usage: 41.3+ KB


In [61]:
news_filtered.describe()

Unnamed: 0,date,title,content
count,1756,1756,1696
unique,383,1532,1423
top,2024-10-04 07:00:00,Vedanta shares up 1.46% as Nifty gains - The E...,"Oct 18, 2024 11:56 PM2 Min Read Oct 18, 2024 8..."
freq,33,4,57


In [62]:
# Function to clean the text and filter out numerical data
def clean_text(text):
    if pd.isna(text):  # Check if the text is NaN
        return None  # Skip NaN values

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    
    # If text is mostly numbers or stock-related info, skip it
    if re.fullmatch(r"[\d.,%\(\)\s-]+", text) or len(re.findall(r"\d", text)) > 10:
        return None  # Return None for non-textual or mostly numerical data
    
    return text

# Function to summarize and analyze sentiment
def summarize_and_get_sentiment(text):
    # Clean the text to remove problematic characters and handle NaN
    text = clean_text(text)
    
    # Skip if the text is None (filtered out or NaN)
    if not text:
        return None  # Skip the entry

    # Adjust max_length dynamically based on input length
    input_length = len(text.split())
    
    # If text is too short, return sentiment directly
    if input_length < 5:
        return classifier(text)[0]['score']
    
    # Ensure that input text does not exceed the model's max length (512 tokens for most models)
    max_input_length = 512
    if input_length > max_input_length:
        text = " ".join(text.split()[:max_input_length])
    
    # Set max_length for summarization based on input length
    max_len = min(512, int(input_length * 0.5))  # Ensure summary is at most 50% of the input length
    
    try:
        # Summarize if input is long enough, otherwise use original text
        if input_length > 50:
            summary = summarizer(text, max_length=max_len, min_length=20, do_sample=False)[0]['summary_text']
        else:
            summary = text  # Use the original text if it's too short for summarization
        
        # Return the sentiment score for the summary
        sentiment = classifier(summary)[0]['score']
        return sentiment
    
    except Exception as e:
        print(f"Error processing text: {text} - {e}")
        return None  # Or some default value

# Remove rows with NaN values in 'title' or 'content'
news_filtered = news_filtered.dropna(subset=['title', 'content'])

# Apply summarization and sentiment analysis
news_filtered['title_sentiment'] = news_filtered['title'].apply(lambda x: summarize_and_get_sentiment(str(x)))
news_filtered['content_sentiment'] = news_filtered['content'].apply(lambda x: summarize_and_get_sentiment(str(x)))

# Display the updated dataframe
print(news_filtered[['date', 'title', 'content', 'title_sentiment', 'content_sentiment']].head())


                  date                                              title  \
0  2024-10-17 08:50:57  Indian Schools To Benefit From Standard Capita...   
1  2024-10-17 07:11:13  Vedanta shares down 1.35% as Nifty drops - The...   
2  2024-10-17 07:08:08  Vedanta shares down 1.35% as Nifty drops - Eco...   
3  2024-10-17 06:47:17  1:1 Bonus, 1:2 Split, 48 Dividends; Not Vedant...   
4  2024-10-17 05:30:00  Volumes soar at CRISIL Ltd counter - Business ...   

                                             content  title_sentiment  \
0  This website is using a security service to pr...         0.968101   
1  (What's movingSensexandNiftyTracklatest market...         0.997807   
2  (What's movingSensexandNiftyTracklatest market...         0.998126   
3  This website is using a security service to pr...         0.995743   
4  Reference #18.e055a68.1729273976.7fb466b https...         0.648902   

   content_sentiment  
0                NaN  
1                NaN  
2                NaN  
3     

In [64]:
# Function to decide 'news_sentiment' based on the condition
def calculate_news_sentiment(row):
    if row['content_sentiment'] >= row['title_sentiment']:
        return row['content_sentiment']
    else:
        return row['title_sentiment']

# Apply the function to create the 'news_sentiment' column
news_filtered['news_sentiment'] = news_filtered.apply(calculate_news_sentiment, axis=1)

news_filtered_df = news_filtered[['date', 'title', 'content', 'news_sentiment']]

news_filtered_df

Unnamed: 0,date,title,content,news_sentiment
0,2024-10-17 08:50:57,Indian Schools To Benefit From Standard Capita...,This website is using a security service to pr...,0.968101
1,2024-10-17 07:11:13,Vedanta shares down 1.35% as Nifty drops - The...,(What's movingSensexandNiftyTracklatest market...,0.997807
2,2024-10-17 07:08:08,Vedanta shares down 1.35% as Nifty drops - Eco...,(What's movingSensexandNiftyTracklatest market...,0.998126
3,2024-10-17 06:47:17,"1:1 Bonus, 1:2 Split, 48 Dividends; Not Vedant...",This website is using a security service to pr...,0.995743
4,2024-10-17 05:30:00,Volumes soar at CRISIL Ltd counter - Business ...,Reference #18.e055a68.1729273976.7fb466b https...,0.648902
...,...,...,...,...
1751,2023-10-23 07:00:00,Vedanta CFO likely to quit months after joinin...,Legal Mastering M&A Deal Making By - Ashwath R...,0.998954
1752,2023-10-23 07:00:00,Govt may space out stake sale in Hindustan Zin...,Reference #18.a5055a68.1729281894.47741ba7 htt...,0.980792
1753,2023-10-22 07:00:00,Vedanta-promoted Hindustan Zinc bets on indust...,Reference #18.a5055a68.1729281911.47747830 htt...,0.935822
1754,2023-10-21 07:00:00,‘Reforming Penal Laws a necessity’: Law Minist...,Bringing you the Best Analytical Legal News BH...,0.588120


In [66]:
news_filtered_df.isna().sum()

date               0
title              0
content            0
news_sentiment    20
dtype: int64

In [67]:
news_filtered_df.fillna(news_filtered_df.mean(numeric_only=True),inplace=True)

In [75]:
news_filtered_df['date'] = pd.to_datetime(news_filtered_df['date'], errors='coerce')

In [76]:
news_filtered_df.isna().sum()

date              0
title             0
content           0
news_sentiment    0
dtype: int64

In [77]:
news_filtered_df.describe()

Unnamed: 0,date,news_sentiment
count,1696,1696.0
mean,2024-05-26 02:50:37.654481152,0.933606
min,2023-10-20 07:00:00,0.500698
25%,2024-03-12 01:00:00,0.932196
50%,2024-06-21 07:00:00,0.982484
75%,2024-08-21 07:00:00,0.995897
max,2024-10-17 08:50:57,0.99987
std,,0.109251


In [78]:
news_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1696 entries, 0 to 1755
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            1696 non-null   datetime64[ns]
 1   title           1696 non-null   object        
 2   content         1696 non-null   object        
 3   news_sentiment  1696 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 66.2+ KB


In [81]:
# Find all rows with duplicate dates
duplicate_dates_df = news_filtered_df[news_filtered_df['date'].duplicated(keep=False)]

# Display the duplicate entries
print(duplicate_dates_df)

                    date                                              title  \
21   2024-10-14 05:46:14  Vedanta declines 0.05% as Sensex climbs - The ...   
22   2024-10-14 05:46:14  Vedanta declines 0.05% as Sensex climbs - Econ...   
54   2024-10-09 07:00:00  Vedanta cancels board meeting on interim divid...   
55   2024-10-09 07:00:00  Interim dividend: Vedanta cancels board meetin...   
56   2024-10-09 07:00:00  Vedanta Cancels Board Meeting To Consider Four...   
...                  ...                                                ...   
1748 2023-10-23 07:00:00  Vedanta business split to address undervaluati...   
1749 2023-10-23 07:00:00  Vedanta Lanjigarh launches ‘Ascend’, a Mentori...   
1750 2023-10-23 07:00:00  Vedanta stock falls 1.6% as report says CFO So...   
1751 2023-10-23 07:00:00  Vedanta CFO likely to quit months after joinin...   
1752 2023-10-23 07:00:00  Govt may space out stake sale in Hindustan Zin...   

                                                con

In [79]:
news_filtered_df.duplicated().sum()

149

In [82]:
# Sort the dataframe by 'date' and 'news_sentiment' to ensure the highest sentiment is at the top for each date
news_filtered_df = news_filtered_df.sort_values(['date', 'news_sentiment'], ascending=[True, False])

# Drop duplicates based on 'date', keeping the row with the maximum sentiment value
news_filtered_df_unique = news_filtered_df.drop_duplicates(subset=['date'], keep='first')

# Display the updated dataframe
print(news_filtered_df_unique)


                    date                                              title  \
1755 2023-10-20 07:00:00  Reliance considers acquisition of Israel’s Tow...   
1754 2023-10-21 07:00:00  ‘Reforming Penal Laws a necessity’: Law Minist...   
1753 2023-10-22 07:00:00  Vedanta-promoted Hindustan Zinc bets on indust...   
1750 2023-10-23 07:00:00  Vedanta stock falls 1.6% as report says CFO So...   
1743 2023-10-24 07:00:00  Vedanta CFO Shrivastava quits, Ajay Goel of By...   
...                  ...                                                ...   
4    2024-10-17 05:30:00  Volumes soar at CRISIL Ltd counter - Business ...   
3    2024-10-17 06:47:17  1:1 Bonus, 1:2 Split, 48 Dividends; Not Vedant...   
2    2024-10-17 07:08:08  Vedanta shares down 1.35% as Nifty drops - Eco...   
1    2024-10-17 07:11:13  Vedanta shares down 1.35% as Nifty drops - The...   
0    2024-10-17 08:50:57  Indian Schools To Benefit From Standard Capita...   

                                                con

In [80]:
# Group by 'date' and aggregate using max on 'news_sentiment' while keeping the first title and content for each date
news_filtered_unique_df = news_filtered_df.groupby('date').agg({
    'title': 'first',            # Keep the first title for each date
    'content': 'first',          # Keep the first content for each date
    'news_sentiment': 'max'      # Take the max sentiment for each date
}).reset_index()  # Reset index to convert 'date' back into a column

# Display the resulting dataframe with unique dates
news_filtered_unique_df


Unnamed: 0,date,title,content,news_sentiment
0,2023-10-20 07:00:00,Reliance considers acquisition of Israel’s Tow...,The latest development comes after Intel scrap...,0.915486
1,2023-10-21 07:00:00,‘Reforming Penal Laws a necessity’: Law Minist...,Bringing you the Best Analytical Legal News BH...,0.588120
2,2023-10-22 07:00:00,Vedanta-promoted Hindustan Zinc bets on indust...,Reference #18.a5055a68.1729281911.47747830 htt...,0.935822
3,2023-10-23 07:00:00,Vedanta business split to address undervaluati...,-494.75 -221.45 -15.00 + 451.00 -233.00 -494.7...,0.999686
4,2023-10-24 07:00:00,Ajay Goel returns to Vedanta as CFO after quit...,To enjoy additional benefits CONNECT WITH US U...,0.998761
...,...,...,...,...
359,2024-10-17 05:30:00,Volumes soar at CRISIL Ltd counter - Business ...,Reference #18.e055a68.1729273976.7fb466b https...,0.648902
360,2024-10-17 06:47:17,"1:1 Bonus, 1:2 Split, 48 Dividends; Not Vedant...",This website is using a security service to pr...,0.995743
361,2024-10-17 07:08:08,Vedanta shares down 1.35% as Nifty drops - Eco...,(What's movingSensexandNiftyTracklatest market...,0.998126
362,2024-10-17 07:11:13,Vedanta shares down 1.35% as Nifty drops - The...,(What's movingSensexandNiftyTracklatest market...,0.997807


In [83]:
pip install requests beautifulsoup4 pandas textblob 






In [84]:
!apt-get update
!apt install -y chromium-chromedriver
!pip install selenium
!pip install webdriver_manager
!pip install feedparser

'apt-get' is not recognized as an internal or external command,
operable program or batch file.
'apt' is not recognized as an internal or external command,
operable program or batch file.

















In [1]:
import feedparser
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import urllib.parse
import time
import requests

# Function to set up Chrome in headless mode for local environment
def setup_selenium_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    return driver

# Function to fetch Google News RSS feed for a search term within a date range with retries
def fetch_news_rss_for_term(date, term, retries=3):
    date_str = date.strftime('%Y-%m-%d')  # Format date as YYYY-MM-DD
    next_day_str = (date + timedelta(days=1)).strftime('%Y-%m-%d')  # Get the next day
    query = f"{term} after:{date_str} before:{next_day_str}"
    
    rss_url = f"https://news.google.com/rss/search?q={urllib.parse.quote(query)}"
    
    attempt = 0
    while attempt < retries:
        try:
            feed = feedparser.parse(rss_url)
            if feed.entries:
                return feed
        except Exception as e:
            print(f"Error fetching RSS for {term} on {date_str}: {e}")
        
        attempt += 1
        time.sleep(2)  # Delay before retrying
    
    return None

# Function to parse the published date
def parse_date(entry):
    return datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %Z')

# Function to fetch content (with fallback terms)
def fetch_news_with_fallbacks(current_date, driver, timeout=20):
    terms = [
        "Vedanta Limited",
        "Cairn Oil & Gas",           # Subsidiary 1
        "Hindustan Zinc",            # Subsidiary 2
        "Balco",                     # Subsidiary 3
        "Nifty Metal",               # Sector news
        "Global Metal Market",       # Global metal market news
        "Indian GDP",                # Macroeconomic news
        "Indian Inflation"           # Inflation news
    ]

    for term in terms:
        feed = fetch_news_rss_for_term(current_date, term)

        if feed and feed.entries:
            entry = feed.entries[0]  # Fetch only the first article
            news_date = parse_date(entry).strftime('%Y-%m-%d')  # Format date for checking
            
            # Fetch article content from the link using Selenium
            content = fetch_news_content(entry.link, driver, timeout)

            # Return the news article found
            return {
                'title': entry.title,
                'published': parse_date(entry),
                'link': entry.link,
                'content': content,
                'searched_term': term
            }
    
    # If no articles found for any term, return None
    return None

# Function to fetch news articles from 01-01-2019 in ascending order
def fetch_news_from_2019(driver, timeout=20):
    all_news = []
    today = datetime.now()
    start_date = datetime(2019, 1, 1)  # Start from 01-01-2019
    current_date = start_date

    processed_dates = set()  # Track dates for which articles are already fetched

    while current_date <= today:
        try:
            if current_date.strftime('%Y-%m-%d') not in processed_dates:
                # Try to fetch news for this date (with fallbacks)
                article = fetch_news_with_fallbacks(current_date, driver, timeout)

                if article:
                    all_news.append(article)
                    processed_dates.add(article['published'].strftime('%Y-%m-%d'))  # Mark the date as processed
                    print(f"Fetching news for {article['published'].strftime('%Y-%m-%d')}: {article['title']} [{article['searched_term']}]")
                else:
                    print(f"No news found for {current_date.strftime('%Y-%m-%d')}. Proceeding to the next day.")

            # Move to the next date
            current_date += timedelta(days=1)
        
        except Exception as e:
            print(f"Error fetching news for {current_date.strftime('%Y-%m-%d')}: {e}")
            # Move to the next date in case of error
            current_date += timedelta(days=1)
            continue

    # Sort the news articles by published date in ascending order
    sorted_news = sorted(all_news, key=lambda x: x['published'])
    return sorted_news

# Fetching news content using Selenium (with a timeout for slow pages)
def fetch_news_content(link, driver, timeout=20):
    try:
        driver.set_page_load_timeout(timeout)
        driver.get(link)
        time.sleep(3)  # Allow time for the page to fully load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Attempt to find the main content of the news article
        paragraphs = soup.find_all('p')
        content = ' '.join([p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 0])

        return content if content else "Content not available."
    
    except Exception as e:
        return f"Failed to fetch content: {e}"

# Function to export news data to a CSV file
def export_to_csv(news_articles, filename='vedanta_news_2019_to_present.csv'):
    import csv
    keys = ['title', 'published', 'link', 'content', 'searched_term']
    
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        
        for article in news_articles:
            writer.writerow({
                'title': article['title'],
                'published': article['published'].strftime('%Y-%m-%d %H:%M:%S'),
                'link': article['link'],
                'content': article['content'],
                'searched_term': article['searched_term']
            })

# Main execution
if __name__ == "__main__": 
    driver = setup_selenium_driver()
    
    try:
        news_articles = fetch_news_from_2019(driver)
        print(f"Fetched {len(news_articles)} news articles.")
        
        # Export the data to a CSV file
        export_to_csv(news_articles, 'vedanta_news_2019_to_present.csv')
        print("News data has been exported to 'vedanta_news_2019_to_present.csv'.")
    
    finally:
        # Close the browser after scraping
        driver.quit()


Fetching news for 2019-01-02: Vedanta plans $300 mn capex on Lanjigarh alumina refinery in FY20 - Business Standard [Vedanta Limited]
Fetching news for 2019-01-04: Vedanta exploration sanction: Is ONGC meeting Tripura’s natural gas needs? - Down To Earth Magazine [Vedanta Limited]
Fetching news for 2019-01-05: Jobs, in perspective - The Indian Express [Indian GDP]
Fetching news for 2019-01-07: Gold, oil and copper: What to watch in commodities in 2019 | Mint - Mint [Global Metal Market]
Fetching news for 2019-01-08: Supreme Court clears reopening of Vedanta's copper smelter - The Times of India [Vedanta Limited]
Fetching news for 2019-01-10: Wedding bells: Vedanta chairman Navin Agarwal's son ties the knot with Harvard graduate - The Economic Times [Vedanta Limited]
Fetching news for 2019-01-11: Report: global mining equipment market expected to reach US$188 billion by 2025 - World Coal [Global Metal Market]
Fetching news for 2019-01-12: Gold Price Falls By Rs. 40 At National Bullion M

In [16]:
pip install --user transformers torch






In [3]:
import pandas as pd
import numpy as np
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from transformers import pipeline

# Ignore warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
news = pd.read_csv('vedanta_news_2019_to_present.csv')
news

Unnamed: 0,title,published,link,content,searched_term
0,Vedanta plans $300 mn capex on Lanjigarh alumi...,2019-01-02 08:00:00,https://news.google.com/rss/articles/CBMi0gFBV...,Reference #18.a5055a68.1729759226.707b524b htt...,Vedanta Limited
1,Vedanta exploration sanction: Is ONGC meeting ...,2019-01-04 08:00:00,https://news.google.com/rss/articles/CBMiuAFBV...,After the Ministry of Petroleum and Natural Ga...,Vedanta Limited
2,"Jobs, in perspective - The Indian Express",2019-01-05 08:00:00,https://news.google.com/rss/articles/CBMi1gFBV...,As and when the NSSO Employment-Unemployment r...,Indian GDP
3,"Gold, oil and copper: What to watch in commodi...",2019-01-07 08:00:00,https://news.google.com/rss/articles/CBMitgFBV...,Commodities took a kicking in 2018 -- with dee...,Global Metal Market
4,Supreme Court clears reopening of Vedanta's co...,2019-01-08 08:00:00,https://news.google.com/rss/articles/CBMi4wFBV...,10 Most Affordable Cities in India to Buy a Ho...,Vedanta Limited
...,...,...,...,...,...
1365,Vedanta shares down 1.35% as Nifty drops - The...,2024-10-17 07:11:13,https://news.google.com/rss/articles/CBMivAFBV...,Stock Trading RSI Trading Techniques: Masterin...,Vedanta Limited
1366,Vedanta announces new investments of Rs. 1 lak...,2024-10-19 03:05:00,https://news.google.com/rss/articles/CBMi0wFBV...,Content not available.,Vedanta Limited
1367,Vedanta to invest over Rs 1 lakh cr in Rajasth...,2024-10-21 01:51:26,https://news.google.com/rss/articles/CBMiuwFBV...,"By commenting, you agree to theProhibited Cont...",Vedanta Limited
1368,Vedanta Limited - Cyber Magazine,2024-10-22 12:17:01,https://news.google.com/rss/articles/CBMiXkFVX...,"Vedanta Aluminium, established in 2003, stands...",Vedanta Limited


In [9]:
# Assuming the 'published' column represents the date, you can rename it to 'date'
news.rename(columns={'published': 'date'}, inplace=True)

# Rearrange columns to make 'date' the first column and change its datatype
news = news[['date', 'title', 'link', 'content', 'searched_term']]

# Change 'date' to datetime and other columns to string (object in pandas)
news['date'] = pd.to_datetime(news['date'])
news['title'] = news['title'].astype(str)
news['link'] = news['link'].astype(str)
news['content'] = news['content'].astype(str)
news['searched_term'] = news['searched_term'].astype(str)

news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1370 entries, 0 to 1369
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1370 non-null   datetime64[ns]
 1   title          1370 non-null   object        
 2   link           1370 non-null   object        
 3   content        1370 non-null   object        
 4   searched_term  1370 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 53.6+ KB


In [14]:
print(news.isnull().sum())
print(news.isna().sum())

date             0
title            0
link             0
content          0
searched_term    0
dtype: int64
date             0
title            0
link             0
content          0
searched_term    0
dtype: int64


In [17]:
news.head()

Unnamed: 0,date,title,link,content,searched_term
0,2019-01-02 08:00:00,Vedanta plans $300 mn capex on Lanjigarh alumi...,https://news.google.com/rss/articles/CBMi0gFBV...,Reference #18.a5055a68.1729759226.707b524b htt...,Vedanta Limited
1,2019-01-04 08:00:00,Vedanta exploration sanction: Is ONGC meeting ...,https://news.google.com/rss/articles/CBMiuAFBV...,After the Ministry of Petroleum and Natural Ga...,Vedanta Limited
2,2019-01-05 08:00:00,"Jobs, in perspective - The Indian Express",https://news.google.com/rss/articles/CBMi1gFBV...,As and when the NSSO Employment-Unemployment r...,Indian GDP
3,2019-01-07 08:00:00,"Gold, oil and copper: What to watch in commodi...",https://news.google.com/rss/articles/CBMitgFBV...,Commodities took a kicking in 2018 -- with dee...,Global Metal Market
4,2019-01-08 08:00:00,Supreme Court clears reopening of Vedanta's co...,https://news.google.com/rss/articles/CBMi4wFBV...,10 Most Affordable Cities in India to Buy a Ho...,Vedanta Limited


In [18]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification

# Load FinBERT model and tokenizer
def load_finbert_model():
    model_name = "yiyanghkust/finbert-tone"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model

# Function to perform sentiment analysis using FinBERT and return a numeric sentiment score
def analyze_sentiment(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract logits (raw predictions) and apply softmax to get probabilities
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=1).squeeze().tolist()  # Convert logits to probabilities
    
    # Sentiment scoring: assign numeric score based on probabilities
    # We can use a weighted score, for example:
    # Negative (0) -> -1, Neutral (1) -> 0, Positive (2) -> +1
    score = -1 * probabilities[0] + 0 * probabilities[1] + 1 * probabilities[2]
    return score

# Load your dataframe (assuming your dataframe is named 'news')
#news = pd.read_csv('path_to_your_news_file.csv')  # Update this with the correct file path

# Initialize FinBERT model and tokenizer
tokenizer, model = load_finbert_model()

# Analyze sentiment for title and content, returning numeric scores
news['title_sentiment_score'] = news['title'].apply(lambda x: analyze_sentiment(x, tokenizer, model))
news['content_sentiment_score'] = news['content'].apply(lambda x: analyze_sentiment(x, tokenizer, model))

# Display the dataframe with sentiment scores
print(news.head())

# Export the updated dataframe to a CSV file with sentiment scores
news.to_csv('news_with_sentiment_scores.csv', index=False)


                 date                                              title  \
0 2019-01-02 08:00:00  Vedanta plans $300 mn capex on Lanjigarh alumi...   
1 2019-01-04 08:00:00  Vedanta exploration sanction: Is ONGC meeting ...   
2 2019-01-05 08:00:00          Jobs, in perspective - The Indian Express   
3 2019-01-07 08:00:00  Gold, oil and copper: What to watch in commodi...   
4 2019-01-08 08:00:00  Supreme Court clears reopening of Vedanta's co...   

                                                link  \
0  https://news.google.com/rss/articles/CBMi0gFBV...   
1  https://news.google.com/rss/articles/CBMiuAFBV...   
2  https://news.google.com/rss/articles/CBMi1gFBV...   
3  https://news.google.com/rss/articles/CBMitgFBV...   
4  https://news.google.com/rss/articles/CBMi4wFBV...   

                                             content        searched_term  \
0  Reference #18.a5055a68.1729759226.707b524b htt...      Vedanta Limited   
1  After the Ministry of Petroleum and Natural Ga...