## Web-scraping-task

In [55]:
#basic imports
import os
import random
import time
import warnings
from datetime import datetime

import pandas as pd
from IPython.display import display, clear_output
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm
from webdriver_manager.chrome import ChromeDriverManager
warnings.filterwarnings('ignore')

In [2]:
#driver for scraping 
def setup_driver():
    """Create the Chrome WebDriver used for scraping.

    Chrome is launched with the command-line flags and experimental options
    configured below, and the chromedriver binary path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    startup_flags = (
        '--disable-gpu',
        '--no-sandbox',
        '--start-maximized',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    )
    user_agent_flag = '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    opts = Options()
    for flag in startup_flags:
        opts.add_argument(flag)
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    opts.add_experimental_option('useAutomationExtension', False)
    opts.add_argument(user_agent_flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=opts)

In [3]:
def wait_for_page_load(driver, timeout=20):
    """Block until the current page reports ``document.readyState == 'complete'``.

    After the ready state is reached the function sleeps a further 3 seconds.
    Any exception (including a wait timeout) is reported as a printed warning
    and not re-raised.

    Args:
        driver: WebDriver whose current page is polled.
        timeout: Maximum number of seconds handed to ``WebDriverWait``.
    """
    def _document_is_complete(d):
        return d.execute_script('return document.readyState') == 'complete'

    try:
        print("Waiting for page to load...", end='')
        waiter = WebDriverWait(driver, timeout)
        waiter.until(_document_is_complete)
        time.sleep(3)
        print(" Done!")
    except Exception as exc:
        print(f"\nWarning during page load: {exc}")

In [4]:
def scroll_page(driver, scrolls=3, pause=2):
    """Scroll to the bottom of the page repeatedly to trigger lazy loading.

    After each scroll the document height is re-read; scrolling stops early as
    soon as the height no longer changes.

    Args:
        driver: WebDriver (or any object exposing ``execute_script``).
        scrolls: Maximum number of scroll steps.
        pause: Seconds to wait after each scroll before measuring the new
            height. Defaults to the previously hard-coded 2 seconds.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    for i in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        print(f"Scroll {i+1}/{scrolls} completed")

        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            # Nothing new was loaded by this scroll; further scrolling is pointless.
            break
        last_height = new_height

In [5]:
def find_article_links(driver):
    """Collect the unique article URLs linked from the currently loaded page.

    The page is scrolled first (via ``scroll_page``), then every ``<a>`` element
    and every element matched by a list of CSS selectors is inspected. A link is
    kept when its ``href`` contains both ``'thehindu.com'`` and ``'/article'``.

    Args:
        driver: WebDriver positioned on the page to inspect.

    Returns:
        list[str]: Unique article URLs in first-seen order.
    """
    def _is_article_url(href):
        return bool(href) and 'thehindu.com' in href and '/article' in href

    def _collect(elements, found):
        # Guard each element on its own so that one failing element (e.g. one
        # that was removed from the DOM meanwhile) does not discard the rest.
        for element in elements:
            try:
                href = element.get_attribute('href')
            except Exception:
                continue
            if _is_article_url(href):
                found[href] = None

    # A dict is used as an ordered set so the returned order is deterministic.
    found = {}
    scroll_page(driver)
    _collect(driver.find_elements(By.TAG_NAME, 'a'), found)

    selectors = [
        'div.story-card a',
        'div.story-card-news a',
        'div.latest-news a',
        'div.other-articles a',
        'div[class*="article"] a',
        '.sticky-article a'
    ]

    for selector in selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
        except Exception:
            # Best effort: a selector that cannot be evaluated is skipped.
            continue
        _collect(elements, found)

    print(f"Found {len(found)} unique article URLs")
    return list(found)


In [6]:
def scrape_article(driver, url):
    """Load one article page and extract its title, date, author and body text.

    Each field is extracted independently; when a field cannot be read its
    placeholder string ("Title not found", "Date not found",
    "Author not specified", "Content not found") is stored instead.

    Args:
        driver: WebDriver used to load the page.
        url: Article URL.

    Returns:
        dict with keys ``url``, ``title``, ``date``, ``author`` and ``content``,
        or ``None`` when loading the page itself failed.
    """
    def _first_text(css, default):
        # Stripped text of the first element matching `css`, else `default`.
        try:
            return driver.find_element(By.CSS_SELECTOR, css).text.strip()
        except Exception:
            return default

    try:
        driver.get(url)
        wait_for_page_load(driver)

        data = {'url': url}
        data['title'] = _first_text('h1.title, h1.article-title, h1', "Title not found")

        # Prefer the machine-readable meta tag; fall back to the visible date.
        try:
            date_element = driver.find_element(By.CSS_SELECTOR, 'meta[property="article:published_time"]')
            data['date'] = date_element.get_attribute('content')
        except Exception:
            data['date'] = _first_text('.published-time', "Date not found")

        data['author'] = _first_text('.author-name, .auth-nm, [class*="author"]', "Author not specified")

        try:
            content_elements = driver.find_elements(By.CSS_SELECTOR, '[id*="content-body"], .article-body p')
            # Read each element's text once instead of twice per element.
            texts = [elem.text.strip() for elem in content_elements]
            # When nothing matches, the list is empty and the content is ''.
            data['content'] = '\n'.join(text for text in texts if text)
        except Exception:
            data['content'] = "Content not found"

        return data

    except Exception as e:
        print(f"\nError scraping article {url}: {str(e)}")
        return None

In [7]:
def scrape_hindu_news():
    """Scrape the articles linked from The Hindu homepage into a DataFrame.

    Opens the homepage, collects article URLs with ``find_article_links``,
    scrapes each one with ``scrape_article`` and writes the result to
    ``hindu_articles_<timestamp>.csv`` in the working directory. The driver is
    always quit on exit.

    Returns:
        pandas.DataFrame with one row per scraped article, or ``None`` when no
        URLs were found, no article could be scraped, or an error occurred.
    """
    driver = None
    try:
        driver = setup_driver()

        # Navigate to homepage
        print("\nNavigating to The Hindu...")
        driver.get('https://www.thehindu.com/')
        wait_for_page_load(driver)

        # Get article URLs
        article_urls = find_article_links(driver)

        if not article_urls:
            print("No article URLs found. Please check the website's structure or try again.")
            return None

        # Scrape articles with progress bar
        print("\nScraping articles...")
        articles = []

        for url in tqdm(article_urls, desc="Scraping articles"):
            article_data = scrape_article(driver, url)
            if article_data:
                articles.append(article_data)
            # Pause after every request, including failed ones, so that a run
            # of failures does not turn into back-to-back requests.
            time.sleep(random.uniform(1, 2))

        if not articles:
            print("No articles were successfully scraped")
            return None

        # Create DataFrame and save results
        df = pd.DataFrame(articles)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'hindu_articles_{timestamp}.csv'
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"\nSuccessfully scraped {len(articles)} articles")
        print(f"Data saved to {filename}")

        # Display sample of results
        print("\nSample of scraped articles:")
        display(df[['title', 'date', 'author']].head())

        return df

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

    finally:
        if driver:
            driver.quit()


In [35]:
# Run the full scrape. `df` is None when no URLs were found, no article could
# be scraped, or an error occurred inside scrape_hindu_news().
df = scrape_hindu_news()


Navigating to The Hindu...
Waiting for page to load... Done!
Scroll 1/3 completed
Found 99 unique article URLs

Scraping articles...


Scraping articles:   0%|          | 0/99 [00:00<?, ?it/s]

Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for page to load... Done!
Waiting for pa

Unnamed: 0,title,date,author
0,"Seven months after BJP comes to power, Odisha ...",2025-01-13T14:12:24.000Z,PTI
1,Net direct tax collections slow down marginall...,2025-01-13T14:09:31.000Z,THE HINDU BUREAU
2,Chennai wakes up to thick smog on Bhogi day; v...,2025-01-13T06:47:46.000Z,THE HINDU BUREAU
3,Erode (East) bypoll: With AIADMK choosing to s...,2025-01-13T06:57:59.000Z,T. RAMAKRISHNAN
4,India summons Bangladesh’s Deputy High Commiss...,2025-01-13T09:54:14.000Z,THE HINDU BUREAU


### Import the articles scraped on two separate days

In [36]:
# Load the two per-day scrape results from CSV files in the working directory.
# NOTE(review): scrape_hindu_news() writes 'hindu_articles_<timestamp>.csv', so
# these files were presumably renamed by hand - confirm.
df_day1 = pd.read_csv('hindu_articles.csv')
df_day2 = pd.read_csv('hindu_articles_day2.csv')

In [37]:
# Drop the stray index column that came in from the CSV. `errors='ignore'` keeps
# the cell re-runnable: a second run no longer raises KeyError once the column
# is already gone.
df_day1 = df_day1.drop(columns=['Unnamed: 0'], errors='ignore')

In [38]:
df_day1

Unnamed: 0,url,title,date,author,content
0,https://www.thehindu.com/news/international/lo...,Los Angeles fires: 24 dead as fire crews try t...,2025-01-13T02:46:29.000Z,AP,"Firefighters scrambled on Sunday (January 12, ..."
1,https://www.thehindu.com/news/national/prime-m...,"PM inaugurates strategic Z-Morh tunnel in J&K,...",2025-01-13T07:22:43.000Z,PEERZADA ASHIQ,"Prime Minister Narendra Modi, who inaugurated ..."
2,https://www.thehindu.com/lit-for-life/the-hind...,The Hindu Lit for Life | Engage with maths and...,2025-01-10T10:44:32.000Z,SANGITA RAJAN,The only true universal language is mathematic...
3,https://www.thehindu.com/opinion/op-ed/the-ref...,The reforms needed in the MEA\nPremium,2025-01-12T20:16:56.000Z,RAJEEV AGARWAL,"I\nndia is on the rise, thanks to its consiste..."
4,https://www.thehindu.com/news/national/jammu-a...,Jammu and Kashmir CM Omar Abdullah seeks resto...,2025-01-13T08:59:50.000Z,PTI,Chief Minister Omar Abdullah on Monday (Januar...
...,...,...,...,...,...
89,https://www.thehindu.com/news/national/pravasi...,Pravasi Bharatiya Divas highlights: India will...,2025-01-09T04:17:14.000Z,THE HINDU BUREAU,"JANUARY 09, 2025 17:17\nGuests from abroad lap..."
90,https://www.thehindu.com/news/national/uttar-p...,Steve Jobs’ wife Laurene Powell offers prayers...,2025-01-12T07:33:14.000Z,THE HINDU BUREAU,"Laurene Powell Jobs, the wife of late Apple co..."
91,https://www.thehindu.com/sci-tech/energy-and-e...,Is groundwater contamination high in India? | ...,2025-01-11T20:40:00.000Z,JACOB KOSHY,The story so far: An assessment of India’s gro...
92,https://www.thehindu.com/news/national/inspire...,"Inspired by the Belgium model, union demands l...",2025-01-13T03:55:07.000Z,SHRABANA CHATTERJEE,"Durbar Mahila Samanway Commitee (DMSC), one of..."


In [39]:
df_day2

Unnamed: 0,url,title,date,author,content
0,https://www.thehindu.com/news/national/prime-m...,"PM inaugurates strategic Z-Morh tunnel in J&K,...",2025-01-13T07:22:43.000Z,PEERZADA ASHIQ,"Prime Minister Narendra Modi, who inaugurated ..."
1,https://www.thehindu.com/lit-for-life/the-hind...,The Hindu Lit for Life | Engage with maths and...,2025-01-10T10:44:32.000Z,SANGITA RAJAN,The only true universal language is mathematic...
2,https://www.thehindu.com/opinion/op-ed/the-ref...,The reforms needed in the MEA\nPremium,2025-01-12T20:16:56.000Z,RAJEEV AGARWAL,"I\nndia is on the rise, thanks to its consiste..."
3,https://www.thehindu.com/hindi/editorial/%E2%8...,राष्ट्रपति पद की वजह से राहत: अमेरिका के नवनिर...,2025-01-14T06:12:30.000Z,,नवनिर्वाचित राष्ट्रपति डोनाल्ड ट्रम्प को न्यूय...
4,https://www.thehindu.com/news/national/tamil-n...,Russia ships nuclear reactor vessel for Kudank...,2025-01-13T23:15:00.000Z,P ANTONY JOSEPH SUDAHAR,"The VVER-1,000 MWe reactor vessel for the sixt..."
...,...,...,...,...,...
87,https://www.thehindu.com/lit-for-life/the-hind...,The Hindu Lit for Life launches mobile library,2025-01-08T06:20:29.000Z,THE HINDU BUREAU,"The Hindu Lit for Life, The Hindu’s annual lit..."
88,https://www.thehindu.com/news/national/uttar-p...,Steve Jobs’ wife Laurene Powell offers prayers...,2025-01-12T07:33:14.000Z,THE HINDU BUREAU,"Laurene Powell Jobs, the wife of late Apple co..."
89,https://www.thehindu.com/sci-tech/energy-and-e...,Is groundwater contamination high in India? | ...,2025-01-11T20:40:00.000Z,JACOB KOSHY,The story so far: An assessment of India’s gro...
90,https://www.thehindu.com/sci-tech/health/docto...,Doctors from 14 nations seek change in India-l...,2025-01-13T14:57:10.000Z,BINDU SHAJAN PERAPPADAN,Dermatologists from the India and 13 other cou...


In [166]:
# Stack the day-1 and day-2 frames row-wise into a single frame.
df_final = pd.concat(objs=[df_day1, df_day2], axis='index')

In [167]:
# Replace the concatenated (repeating) index with a fresh 0..n-1 range.
df_final = df_final.reset_index(drop=True)

In [168]:
df_final

Unnamed: 0,url,title,date,author,content
0,https://www.thehindu.com/news/international/lo...,Los Angeles fires: 24 dead as fire crews try t...,2025-01-13T02:46:29.000Z,AP,"Firefighters scrambled on Sunday (January 12, ..."
1,https://www.thehindu.com/news/national/prime-m...,"PM inaugurates strategic Z-Morh tunnel in J&K,...",2025-01-13T07:22:43.000Z,PEERZADA ASHIQ,"Prime Minister Narendra Modi, who inaugurated ..."
2,https://www.thehindu.com/lit-for-life/the-hind...,The Hindu Lit for Life | Engage with maths and...,2025-01-10T10:44:32.000Z,SANGITA RAJAN,The only true universal language is mathematic...
3,https://www.thehindu.com/opinion/op-ed/the-ref...,The reforms needed in the MEA\nPremium,2025-01-12T20:16:56.000Z,RAJEEV AGARWAL,"I\nndia is on the rise, thanks to its consiste..."
4,https://www.thehindu.com/news/national/jammu-a...,Jammu and Kashmir CM Omar Abdullah seeks resto...,2025-01-13T08:59:50.000Z,PTI,Chief Minister Omar Abdullah on Monday (Januar...
...,...,...,...,...,...
181,https://www.thehindu.com/lit-for-life/the-hind...,The Hindu Lit for Life launches mobile library,2025-01-08T06:20:29.000Z,THE HINDU BUREAU,"The Hindu Lit for Life, The Hindu’s annual lit..."
182,https://www.thehindu.com/news/national/uttar-p...,Steve Jobs’ wife Laurene Powell offers prayers...,2025-01-12T07:33:14.000Z,THE HINDU BUREAU,"Laurene Powell Jobs, the wife of late Apple co..."
183,https://www.thehindu.com/sci-tech/energy-and-e...,Is groundwater contamination high in India? | ...,2025-01-11T20:40:00.000Z,JACOB KOSHY,The story so far: An assessment of India’s gro...
184,https://www.thehindu.com/sci-tech/health/docto...,Doctors from 14 nations seek change in India-l...,2025-01-13T14:57:10.000Z,BINDU SHAJAN PERAPPADAN,Dermatologists from the India and 13 other cou...


In [169]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      186 non-null    object
 1   title    186 non-null    object
 2   date     186 non-null    object
 3   author   172 non-null    object
 4   content  177 non-null    object
dtypes: object(5)
memory usage: 7.4+ KB


### removing duplicate and empty items

In [170]:
# Remove rows that are exact duplicates across all columns, then rows with any
# missing value.
# NOTE(review): an article scraped on both days with any differing field is not
# treated as a duplicate here - confirm whether de-duplicating by 'url' is wanted.
df_final = df_final.drop_duplicates().dropna()

In [171]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 0 to 185
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      141 non-null    object
 1   title    141 non-null    object
 2   date     141 non-null    object
 3   author   141 non-null    object
 4   content  141 non-null    object
dtypes: object(5)
memory usage: 6.6+ KB


### exploring length of news articles

In [172]:
# Number of whitespace-separated tokens in each article body.
df_final['wrd_len'] = df_final['content'].str.split().str.len()

In [173]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 0 to 185
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      141 non-null    object
 1   title    141 non-null    object
 2   date     141 non-null    object
 3   author   141 non-null    object
 4   content  141 non-null    object
 5   wrd_len  141 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 7.7+ KB


In [174]:
# Keep only the articles shorter than 750 words.
is_short_article = df_final['wrd_len'].lt(750)
df_final = df_final[is_short_article]
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135 entries, 0 to 184
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      135 non-null    object
 1   title    135 non-null    object
 2   date     135 non-null    object
 3   author   135 non-null    object
 4   content  135 non-null    object
 5   wrd_len  135 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 7.4+ KB


### sample 100 articles

In [175]:
# Draw 100 articles (or all of them if fewer remain). Asking for a row count
# instead of the fraction 100/135 keeps this correct when the number of rows
# changes, and the fixed seed makes the sample reproducible across runs.
df_final = df_final.sample(n=min(100, len(df_final)), random_state=42)

In [176]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 134 to 126
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      100 non-null    object
 1   title    100 non-null    object
 2   date     100 non-null    object
 3   author   100 non-null    object
 4   content  100 non-null    object
 5   wrd_len  100 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 5.5+ KB


### Gemini API

In [177]:
import google.generativeai as genai
import pandas as pd
from tqdm.notebook import tqdm
import time
import json
from datetime import datetime
import random
from tenacity import retry, stop_after_attempt, wait_exponential

In [178]:
def setup_gemini(api_key, model_name='gemini-pro'):
    """Configure the Gemini client and return a generative model handle.

    Args:
        api_key: API key passed to ``genai.configure``.
        model_name: Name of the model to instantiate. Defaults to
            ``'gemini-pro'``, the value that was previously hard-coded.

    Returns:
        The ``genai.GenerativeModel`` instance.
    """
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(model_name)

In [179]:
# reraise=True makes the last underlying exception propagate after the final
# attempt, so callers log the real API error rather than tenacity's RetryError.
@retry(
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    reraise=True
)
def analyze_article_with_retry(model, article_text, title):
    """Ask the model for a summary, news type and sentiment of one article.

    The call is retried up to 5 times with exponential back-off (4-60 seconds)
    when it raises.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.
        article_text: Article body inserted into the prompt.
        title: Article title inserted into the prompt.

    Returns:
        dict with keys ``summary``, ``type`` and ``sentiment`` as produced by
        ``parse_analysis_response``.
    """
    prompt = f"""
    Title: {title}
    
    Content: {article_text}
    
    Please provide the following analysis:
    1. A concise 2-3 sentence summary focusing on key points and main message.
    2. Classify the type of news (e.g., Politics, Technology, Business, Sports, Entertainment, etc.)
    3. Determine if the news sentiment is positive, negative, or neutral.
    
    Format the response exactly as follows:
    Summary: [your summary]
    Type: [news type]
    Sentiment: [positive/negative/neutral]
    """
    
    response = model.generate_content(prompt)
    return parse_analysis_response(response.text.strip())

def parse_analysis_response(response_text):
    """Extract the summary, news type and sentiment from a model response.

    Expects lines of the form ``Summary: ...``, ``Type: ...`` and
    ``Sentiment: ...``. Parsing is tolerant of common formatting drift:
    leading indentation, list numbering or bullets, markdown emphasis around
    the label, and different letter case. Only the first colon separates label
    from value, so the value may itself contain colons or label words. Lines
    that follow a ``Summary:`` line and carry no recognised label are treated
    as a continuation of the summary.

    Args:
        response_text: Raw text returned by the model; ``None`` or an empty
            string yields empty fields.

    Returns:
        dict with keys ``summary``, ``type`` and ``sentiment``; a field that
        was not found is an empty string.
    """
    result = {
        'summary': '',
        'type': '',
        'sentiment': ''
    }
    current_field = None

    for raw_line in (response_text or '').splitlines():
        line = raw_line.strip()
        if not line:
            continue

        label, colon, value = line.partition(':')
        # Drop bullets, numbering and markdown emphasis surrounding the label.
        field = label.strip(' *#>-.)0123456789').lower()

        if colon and field in result:
            # Remove markdown emphasis left around the value ("**Type:** X").
            result[field] = value.strip().strip('*').strip()
            current_field = field
        elif current_field == 'summary':
            # Unlabelled line right after the summary: the summary wrapped.
            result['summary'] = f"{result['summary']} {line}".strip()

    return result

def analyze_article(model, article_text, title):
    """Analyze one article, substituting placeholder values on failure.

    Delegates to ``analyze_article_with_retry``. If that call raises, the error
    is printed and a dict of fixed placeholder strings is returned instead, so
    the caller can continue with the next article.

    Returns:
        dict with keys ``summary``, ``type`` and ``sentiment``.
    """
    try:
        analysis = analyze_article_with_retry(model, article_text, title)
    except Exception as exc:
        print(f"\nError analyzing article '{title}': {exc}")
        analysis = {
            'summary': "Error generating analysis - API quota exceeded",
            'type': "Error",
            'sentiment': "Error"
        }
    return analysis

def analyze_articles_in_batches(df, api_key, batch_size=5):
    """Run the Gemini analysis over every article in ``df``, batch by batch.

    Adds the columns ``summary``, ``news_type`` and ``sentiment`` to ``df``
    (the frame is modified in place and also returned). After each batch the
    whole frame is written to ``news_analysis_intermediate_<timestamp>.csv``;
    at the end it is written to ``news_analysis_final_<timestamp>.csv``.
    Random pauses are inserted between articles and between batches.

    Args:
        df: DataFrame with ``title`` and ``content`` columns.
        api_key: Gemini API key passed to ``setup_gemini``.
        batch_size: Number of articles per batch.

    Returns:
        The same DataFrame with the three analysis columns filled in.
    """
    model = setup_gemini(api_key)

    # Add new columns for analysis results
    df['summary'] = ''
    df['news_type'] = ''
    df['sentiment'] = ''

    # Rows are addressed by position so that a non-unique or shuffled index
    # (e.g. after concat or sample) cannot make a lookup return several rows.
    col_pos = {
        name: df.columns.get_loc(name)
        for name in ('title', 'content', 'summary', 'news_type', 'sentiment')
    }

    # Defined up front so the final file name exists even when df is empty and
    # the loop below never runs.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    for start_idx in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        end_idx = min(start_idx + batch_size, len(df))

        print(f"\nProcessing batch {start_idx//batch_size + 1}")

        # Process each article in the batch
        for pos in range(start_idx, end_idx):
            title = df.iat[pos, col_pos['title']]
            content = df.iat[pos, col_pos['content']]

            # str() guards against non-string titles (e.g. NaN) when slicing.
            print(f"Processing: {str(title)[:50]}...")
            analysis = analyze_article(model, content, title)

            # Update all analysis fields
            df.iat[pos, col_pos['summary']] = analysis['summary']
            df.iat[pos, col_pos['news_type']] = analysis['type']
            df.iat[pos, col_pos['sentiment']] = analysis['sentiment']

            # Random delay between articles within batch
            time.sleep(random.uniform(2, 4))

        # Save intermediate results before pausing, so an interruption during
        # the pause does not lose the batch that just finished.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        intermediate_file = f'news_analysis_intermediate_{timestamp}.csv'
        df.to_csv(intermediate_file, index=False)
        print(f"Intermediate results saved to {intermediate_file}")

        if end_idx < len(df):
            pause_time = random.uniform(10, 15)
            print(f"\nPausing for {pause_time:.1f} seconds before next batch...")
            time.sleep(pause_time)

    # Save final results
    final_file = f'news_analysis_final_{timestamp}.csv'
    df.to_csv(final_file, index=False)
    print(f"\nAll analysis completed and saved to {final_file}")

    return df

In [181]:
# The API key is deliberately not stored in the notebook. It is read from the
# GEMINI_API_KEY environment variable and is an empty string when unset.
api_key = os.environ.get("GEMINI_API_KEY", "")
df_with_summaries = analyze_articles_in_batches(df_final,api_key)

Processing batches:   0%|          | 0/20 [00:00<?, ?it/s]


Processing batch 1
Processing: Need tech-savvy force to protect India's digital i...
Processing: Kumbhvani radio keeps pilgrims at Kumbh informed, ...
Processing: 15 Indian fishermen detained near Diego Garcia isl...
Processing: Uttarakhand bus accident: 6 dead, 22 injured as bu...
Processing: Here are some activities you should attend at The ...

Pausing for 12.1 seconds before next batch...
Intermediate results saved to news_analysis_intermediate_20250114_143820.csv

Processing batch 2
Processing: Gang of seven arrested for cheating businessman of...
Processing: The Hindu Lit for Life | Build self-worth and inne...
Processing: What led to the Azerbaijan Airlines jet crash? | E...
Processing: Bamboo Biryani tickles the taste buds of tourists ...
Processing: BBMP’s new portal allows those without khata to ge...

Pausing for 11.8 seconds before next batch...
Intermediate results saved to news_analysis_intermediate_20250114_143856.csv

Processing batch 3
Processing: The Hindu Lit for Li

Processing: ‘Way you dragged your feet raises doubts’: Delhi H...
Processing: Maha Kumbh begins in Prayagraj, 15 million pilgrim...
Processing: Sensex, Nifty tumble amid weak global peers, forei...
Processing: Rupee sinks to record low, settles 58 paise down a...

Pausing for 11.9 seconds before next batch...
Intermediate results saved to news_analysis_intermediate_20250114_144906.csv

Processing batch 19
Processing: Billion-pound lawsuit against Apple over App Store...
Processing: Why is deciphering the Indus script important? | E...
Processing: Accused in Bengaluru cow horror arrested...
Processing: Lebanon to name a prime minister as new phase begi...
Processing: Ajith Kumar secures third spot at Dubai 24H race d...

Pausing for 13.7 seconds before next batch...
Intermediate results saved to news_analysis_intermediate_20250114_144942.csv

Processing batch 20
Processing: Novak Djokovic, Andy Murray chat mid-match thanks ...
Processing: National Conference mobilises local people to at

In [190]:
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column that shows up as 'Unnamed: 0' when the CSV is read back.
df_with_summaries.to_csv('final_article_scraping.csv', index=False)

In [183]:
# Renumber the rows 0..n-1 in place (the sampled frame carries shuffled labels).
df_with_summaries.index = pd.RangeIndex(len(df_with_summaries))

In [184]:
df_with_summaries

Unnamed: 0,url,title,date,author,content,wrd_len,summary,news_type,sentiment
0,https://www.thehindu.com/news/national/need-te...,Need tech-savvy force to protect India's digit...,2025-01-13T21:45:00.000Z,PTI,Union Finance Minister Nirmala Sitharaman on M...,77,Union Finance Minister Nirmala Sitharaman stre...,Politics/Technology,Negative
1,https://www.thehindu.com/news/national/uttar-p...,Kumbhvani radio keeps pilgrims at Kumbh inform...,2025-01-13T15:48:36.000Z,THE HINDU BUREAU,Devotional songs interspersed with discussions...,128,"Kumbhvani radio, launched for the Kumbh Mela, ...",Religion/Entertainment,Positive
2,https://www.thehindu.com/news/national/tamil-n...,15 Indian fishermen detained near Diego Garcia...,2025-01-13T10:03:58.000Z,THE HINDU BUREAU,As many as 15 fishermen who ventured into the ...,86,Fifteen Indian fishermen were detained near Di...,Politics,Negative
3,https://www.thehindu.com/news/national/uttarak...,"Uttarakhand bus accident: 6 dead, 22 injured a...",2025-01-13T02:42:39.000Z,PTI,“Six people died and 22 others were injured wh...,57,A bus accident in Uttarakhand’s Pauri Garhwal ...,Local News,Negative
4,https://www.thehindu.com/lit-for-life/here-are...,Here are some activities you should attend at ...,2025-01-10T12:27:43.000Z,AMARJOT KAUR,The Hindu Lit for Life is presented by KIA Ind...,482,The Hindu Lit For Life festival will feature v...,Entertainment,Positive
...,...,...,...,...,...,...,...,...,...
95,https://www.thehindu.com/sport/tennis/novak-dj...,"Novak Djokovic, Andy Murray chat mid-match tha...",2025-01-14T05:06:03.000Z,AP,Novak Djokovic stood with his hands on his hip...,101,"During a match at the Australian Open, Andy Mu...",Sports,Positive
96,https://www.thehindu.com/news/national/jammu-a...,National Conference mobilises local people to ...,2025-01-13T16:25:30.000Z,PEERZADA ASHIQ,Unlike the past trend of ferrying government e...,116,Locals in Kashmir attended Prime Minister Modi...,Politics,Positive
97,https://www.thehindu.com/entertainment/music/k...,Know more about the element of Carnatic music ...,2025-01-13T10:17:04.000Z,"S. ADITHYANARAYANAN,ARCHANA MURALI",At the Music Academy’s morning lec dem session...,62,At the Music Academy's morning lec dem session...,Entertainment,Positive
98,https://www.thehindu.com/news/national/prime-m...,"PM inaugurates strategic Z-Morh tunnel in J&K,...",2025-01-13T07:22:43.000Z,PEERZADA ASHIQ,"Prime Minister Narendra Modi, who inaugurated ...",94,Prime Minister Narendra Modi opened the strate...,Politics,Neutral


## Note: If the graphs aren't visible, please check the attachments

In [196]:
# Distribution of article word counts for each news type.
import plotly.express as px

fig = px.box(
    df_with_summaries,
    x='news_type',
    y='wrd_len',
    title='Boxplot of Word Length per News Type',
    labels=dict(wrd_len='Word Length', news_type='News Type'),
)

# Dark theme: black backgrounds with white text.
fig.update_layout(
    font={'color': 'white'},
    title={'font': {'color': 'white'}},
    plot_bgcolor='black',
    paper_bgcolor='black',
)

fig.show()

In [203]:
# Save the current figure as a standalone HTML file.
# NOTE(review): this cell was executed as In [203], i.e. after the count plot
# cell (In [202]). When the notebook is run top to bottom, `fig` at this point
# is the news-type box plot, so the file name would not match its contents.
fig.write_html("news_type_countplot.html")

In [194]:
# Distribution of article word counts for each sentiment class.
# The title typo is fixed and the axis label now targets the 'sentiment' column
# that is actually plotted on the x axis.
fig = px.box(df_with_summaries, x='sentiment', y='wrd_len', title='Boxplot of Word Length per Sentiment', labels={'wrd_len': 'Word Length', 'sentiment': 'Sentiment'})
fig.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',  
    font=dict(color='white'),
    title=dict(font=dict(color='white')))

fig.show()

In [202]:
# Count the articles per (news type, sentiment) pair before plotting. px.bar
# draws one bar per input row, and in 'group' mode rows sharing the same news
# type and sentiment are not stacked, so un-aggregated rows do not show counts.
sentiment_counts = (
    df_with_summaries
    .groupby(['news_type', 'sentiment'])
    .size()
    .reset_index(name='count')
)

fig = px.bar(
    sentiment_counts, 
    x='news_type', 
    y='count',
    color='sentiment', 
    title='Countplot of News Type and Sentiment', 
    labels={'news_type': 'News Type', 'count': 'Count'},
    barmode='group'  # Grouped bar chart
)

# Customize layout
fig.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font=dict(color='white'),
    title=dict(font=dict(color='white'))
)

fig.show()

# Save the count plot from the cell that builds it, so the file always holds
# this figure regardless of cell execution order.
fig.write_html("news_type_countplot.html")