# City Chronicals

## Project Partners 
Ayush Panchal - P24DS013\
Pooja Dave - P24DS012

In [3]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


In [4]:
# Scrape data and create DataFrame without summaries
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

# Setup for scraping (same as previous)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
service = Service(executable_path=r'C:\Users\Ayush\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe')

city_name = "ahmedabad"
base_url = f"https://english.gujaratsamachar.com/city/{city_name}"
headers = {"User-Agent": "Mozilla/5.0"}
max_articles = 15

# Date range (last 7 days)
end_date = datetime.today()
start_date = end_date - timedelta(days=7)
date_range = pd.date_range(start=start_date, end=end_date)

# Initialize lists for storing data
titles, article_links, dates, months, years, days = [], [], [], [], [], []
articles, locations = [], []

# Loop to scrape articles
for current_date in tqdm(date_range, desc="Scraping by date"):
    if len(titles) >= max_articles:
        break

    formatted_date = current_date.strftime('%Y-%m-%d')
    page = 1

    while True:
        if len(titles) >= max_articles:
            break

        url = f"{base_url}/{page}?date={formatted_date}"
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all('a')
        found_any = False

        for link in links:
            if len(titles) >= max_articles:
                break

            title = link.get('title')
            href = link.get('href')
            if title and href and title not in titles:
                full_link = f"https://english.gujaratsamachar.com{href}"

                # Fetch article page using Selenium (no sleep)
                driver = webdriver.Chrome(service=service, options=chrome_options)
                driver.get(full_link)
                html = driver.page_source
                driver.quit()

                article_soup = BeautifulSoup(html, 'html.parser')
                article_div = article_soup.find('div', class_='article')
                article_text = article_div.get_text(separator=' ', strip=True) if article_div else ""

                if not article_text:
                    continue

                # Extract date from article
                updated_date = formatted_date
                date_p = article_soup.find('p', class_='text-muted mb-0')
                if date_p:
                    date_text = date_p.get_text(strip=True)
                    date_match = re.search(r'Updated:\s+([A-Za-z]+)\s+(\d{1,2})[a-z]{2},\s+(\d{4})', date_text)
                    if date_match:
                        month_str = date_match.group(1)
                        day = int(date_match.group(2))
                        year = int(date_match.group(3))
                        month_num = datetime.strptime(month_str, "%b").month
                        updated_date = f"{year}-{month_num:02d}-{day:02d}"

                # Final formatting
                date_obj = datetime.strptime(updated_date, "%Y-%m-%d")

                titles.append(title)
                article_links.append(full_link)
                dates.append(updated_date)
                months.append(f"{date_obj.month:02d}")
                years.append(date_obj.year)
                days.append(date_obj.day)
                articles.append(article_text)
                locations.append(city_name)

                found_any = True

        if not found_any:
            break

        page += 1

# Create the DataFrame without summaries
df = pd.DataFrame({
    "Title": titles,
    "Article Link": article_links,
    "Date": dates,
    "Day": days,
    "Month": months,
    "Year": years,
    "Article": articles,
    "Location": locations
})

# Save the DataFrame to CSV (this will be uploaded to Kaggle)
df.to_csv("ahmedabad_news_articles.csv", index=False)

# Preview the dataframe
print(df.head())


Scraping by date:  38%|███▊      | 3/8 [01:30<02:30, 30.12s/it]

                                               Title  \
0  Over half of liver ailments in ‘dry state’ Guj...   
1  Jamnagar man sent  to three-day police remand ...   
2  Court operations delayed in Ahmedabad as judge...   
3     Local vendor knifed in broad daylight in Ranip   
4  One worker dead, two injured in knife attack a...   

                                        Article Link        Date  Day Month  \
0  https://english.gujaratsamachar.com/news/healt...  2025-04-13   13    04   
1  https://english.gujaratsamachar.com/news/gujar...  2025-04-13   13    04   
2  https://english.gujaratsamachar.com/news/gujar...  2025-04-13   13    04   
3  https://english.gujaratsamachar.com/news/gujar...  2025-04-14   14    04   
4  https://english.gujaratsamachar.com/news/gujar...  2025-04-14   14    04   

   Year                                            Article   Location  
0  2025  Despite Gujarat being a dry state, alcohol con...  ahmedabad  
1  2025  Ahmedabad rural court have sent May




In [5]:
df.head()

Unnamed: 0,Title,Article Link,Date,Day,Month,Year,Article,Location
0,Over half of liver ailments in ‘dry state’ Guj...,https://english.gujaratsamachar.com/news/healt...,2025-04-13,13,4,2025,"Despite Gujarat being a dry state, alcohol con...",ahmedabad
1,Jamnagar man sent to three-day police remand ...,https://english.gujaratsamachar.com/news/gujar...,2025-04-13,13,4,2025,Ahmedabad rural court have sent Mayank Sanghan...,ahmedabad
2,Court operations delayed in Ahmedabad as judge...,https://english.gujaratsamachar.com/news/gujar...,2025-04-13,13,4,2025,"In Ahmedabad, functioning of nearly 20 courts ...",ahmedabad
3,Local vendor knifed in broad daylight in Ranip,https://english.gujaratsamachar.com/news/gujar...,2025-04-14,14,4,2025,A 36-year-old vegetable vendor was seriously i...,ahmedabad
4,"One worker dead, two injured in knife attack a...",https://english.gujaratsamachar.com/news/gujar...,2025-04-14,14,4,2025,In a disturbing case of violence stemming from...,ahmedabad


In [7]:
from transformers import pipeline
import torch
import pandas as pd

# Load your DataFrame
df = pd.read_csv("ahmedabad_news_articles.csv")

# Use GPU if available
device = 0 if torch.cuda.is_available() else -1

# Summarizer pipeline
summarizer = pipeline("summarization", model="facebook/bart-large", device=device)

# Zero-shot tag classification
tagger = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
candidate_labels = ['Politics', 'Business', 'Health', 'Sports', 'Technology', 'Entertainment', 'Environment', 'Education', 'Crime', 'Weather']

# Sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", device=device)

# --- Helpers ---

def chunk_text(text, max_tokens=1024):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i + max_tokens])

def summarize_article(text, max_chunk_tokens=1024):
    chunks = list(chunk_text(text, max_chunk_tokens))
    summaries = []
    for chunk in chunks:
        if chunk.strip():
            summary = summarizer(chunk, max_length=80, min_length=20, do_sample=False)
            summaries.append(summary[0]['summary_text'])
    return " ".join(summaries)

def classify_tags(text):
    result = tagger(text, candidate_labels, multi_label=False)
    labels = [label for label, score in zip(result['labels'], result['scores']) if score > 0.3]
    return ', '.join(labels) if labels else 'General'

def analyze_sentiment(text):
    result = sentiment_analyzer(text[:512])[0]
    label = result['label']
    if "1" in label or "2" in label:
        return "Negative"
    elif "3" in label:
        return "Neutral"
    else:
        return "Positive"

# --- Main loop ---

summaries = []
tags_list = []
sentiments = []

for article in df['Article']:
    try:
        if isinstance(article, str) and article.strip():
            summary = summarize_article(article)
            tags = classify_tags(article)
            sentiment = analyze_sentiment(article)
        else:
            summary, tags, sentiment = "No content", "General", "Neutral"
    except Exception as e:
        print(f"Error processing article: {e}")
        summary, tags, sentiment = "Error", "General", "Neutral"

    summaries.append(summary)
    tags_list.append(tags)
    sentiments.append(sentiment)

# Add to DataFrame
df['Summary'] = summaries
df['Tags'] = tags_list
df['Sentiment'] = sentiments

# Save updated file
df.to_csv("news_with_summaries_tags_sentiment.csv", index=False)
print("✔ Saved updated file with summaries, tags, and sentiment!")


Device set to use cpu
Device set to use cpu
Device set to use cpu


✔ Saved updated file with summaries, tags, and sentiment!


In [8]:
df.head()

Unnamed: 0,Title,Article Link,Date,Day,Month,Year,Article,Location,Summary,Tags,Sentiment
0,Over half of liver ailments in ‘dry state’ Guj...,https://english.gujaratsamachar.com/news/healt...,2025-04-13,13,4,2025,"Despite Gujarat being a dry state, alcohol con...",ahmedabad,"Despite Gujarat being a dry state, alcohol con...",General,Negative
1,Jamnagar man sent to three-day police remand ...,https://english.gujaratsamachar.com/news/gujar...,2025-04-13,13,4,2025,Ahmedabad rural court have sent Mayank Sanghan...,ahmedabad,Ahmedabad court remands absconding convict May...,Crime,Negative
2,Court operations delayed in Ahmedabad as judge...,https://english.gujaratsamachar.com/news/gujar...,2025-04-13,13,4,2025,"In Ahmedabad, functioning of nearly 20 courts ...",ahmedabad,"In Ahmedabad, functioning of nearly 20 courts ...",General,Negative
3,Local vendor knifed in broad daylight in Ranip,https://english.gujaratsamachar.com/news/gujar...,2025-04-14,14,4,2025,A 36-year-old vegetable vendor was seriously i...,ahmedabad,A 36-year-old vegetable vendor was seriously i...,Crime,Negative
4,"One worker dead, two injured in knife attack a...",https://english.gujaratsamachar.com/news/gujar...,2025-04-14,14,4,2025,In a disturbing case of violence stemming from...,ahmedabad,In a disturbing case of violence stemming from...,Crime,Negative


In [None]:
print(df["Summary"][13])

In a brazen act of burglary in Ahmedabad’s Jamalpur locality, an unknown miscreant is reported to have broken into a residential flat and made away with gold, silver ornaments and cash amounting to an estimated ₹5,89 lakh. The incident took place on the morning of April 11 while the occupants of the house were away on a religious visit


In [9]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.5.tar.gz (317.2 MB)
     ---------------------------------------- 0.0/317.2 MB ? eta -:--:--
     --------------------------------------- 1.3/317.2 MB 11.3 MB/s eta 0:00:29
      -------------------------------------- 4.7/317.2 MB 14.3 MB/s eta 0:00:22
      -------------------------------------- 7.3/317.2 MB 13.3 MB/s eta 0:00:24
     - ------------------------------------- 8.9/317.2 MB 13.2 MB/s eta 0:00:24
     - ------------------------------------- 9.2/317.2 MB 11.7 MB/s eta 0:00:27
     - ------------------------------------- 10.7/317.2 MB 9.2 MB/s eta 0:00:34
     - ------------------------------------- 12.1/317.2 MB 8.7 MB/s eta 0:00:36
     - ------------------------------------- 14.4/317.2 MB 8.9 MB/s eta 0:00:35
     -- ------------------------------------ 16.5/317.2 MB 9.0 MB/s eta 0:00:34
     -- ------------------------------------ 19.1/317.2 MB 9.2 MB/s eta 0:00:33
     -- ------------------------------------ 20.7/317.2 MB 9.3