# City Chronicles

## Project Partners 
Ayush Panchal - P24DS013\
Pooja Dave - P24DS012

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import re
from tqdm import tqdm 

In [11]:
# Scrape target: the city slug picks the listing that is crawled; the
# User-Agent header is sent with every HTTP request made by this notebook.
city_name = "ahmedabad"
headers = {"User-Agent": "Mozilla/5.0"}
base_url = f"https://english.gujaratsamachar.com/city/{city_name}"

In [12]:
# Scraping window: the 365 days ending at the moment this cell is executed.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)

In [13]:
# Parallel accumulators: index i across all six lists describes one article.
titles, article_links, dates = [], [], []
months, years, days = [], [], []

In [14]:
# One datetime per day from start_date through end_date, both ends included.
date_range = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

In [15]:
# Upper bound (seconds) on any single HTTP request. requests has no default
# timeout, so without this one stalled connection would hang the whole crawl.
REQUEST_TIMEOUT_SECONDS = 15

# Prefix applied to site-relative hrefs found on the listing pages.
SITE_ROOT = "https://english.gujaratsamachar.com"

# Matches text shaped like "Updated: <Month> <day><2-letter suffix>, <yyyy>".
UPDATED_DATE_PATTERN = re.compile(
    r'Updated:\s+([A-Za-z]+)\s+(\d{1,2})[a-z]{2},\s+(\d{4})'
)


def _parse_updated_date(date_text):
    """Extract the "Updated:" date from an article's date paragraph.

    Args:
        date_text: Text of the paragraph expected to contain the date.

    Returns:
        A ``datetime`` (midnight) for the matched date, or ``None`` when the
        text does not match the pattern, the month name is not recognised,
        or the day/month/year do not form a real calendar date.
    """
    match = UPDATED_DATE_PATTERN.search(date_text)
    if match is None:
        return None

    month_str, day_str, year_str = match.groups()
    # The pattern accepts any alphabetic month token, so try the abbreviated
    # form ("%b") first and the full month name ("%B") second.
    for month_format in ("%b", "%B"):
        try:
            month_num = datetime.strptime(month_str, month_format).month
        except ValueError:
            continue
        try:
            # Building the datetime validates the date (rejects e.g. Feb 31).
            return datetime(int(year_str), month_num, int(day_str))
        except ValueError:
            return None
    return None


# Titles already collected, kept as a set so the duplicate check is O(1).
# Seeded from `titles` so re-running this cell does not re-add known articles.
seen_titles = set(titles)

for current_date in tqdm(date_range, desc="Scraping by date"):
    formatted_date = current_date.strftime('%Y-%m-%d')
    page = 1

    # Walk the paginated listing for this date until a page fails to load or
    # contributes no new article.
    while True:
        url = f"{base_url}/{page}?date={formatted_date}"
        try:
            response = requests.get(
                url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
            )
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.text, "html.parser")

            found_any = False
            for link in soup.find_all('a'):
                title = link.get('title')
                href = link.get('href')
                if not title or not href or title in seen_titles:
                    continue

                # Only site-relative hrefs get the site prefix; absolute URLs
                # are used as-is instead of being glued onto the host name.
                if href.startswith(("http://", "https://")):
                    full_link = href
                else:
                    full_link = f"{SITE_ROOT}{href}"

                # Best effort: read the article's own "Updated:" date. Any
                # failure is logged and the listing date is used instead.
                article_date = None
                try:
                    article_response = requests.get(
                        full_link,
                        headers=headers,
                        timeout=REQUEST_TIMEOUT_SECONDS,
                    )
                    article_response.raise_for_status()
                    article_soup = BeautifulSoup(article_response.text, "html.parser")

                    date_p = article_soup.find('p', class_='text-muted mb-0')
                    if date_p:
                        article_date = _parse_updated_date(
                            date_p.get_text(strip=True)
                        )
                except Exception as e:
                    print(f"Failed to fetch article date: {full_link} | Error: {e}")

                if article_date is None:
                    article_date = current_date

                seen_titles.add(title)
                titles.append(title)
                article_links.append(full_link)
                dates.append(article_date.strftime("%Y-%m-%d"))
                months.append(f"{article_date.month:02d}")
                years.append(article_date.year)
                days.append(article_date.day)

                found_any = True

            if not found_any:
                break

            page += 1

        except Exception as e:
            print(f"Error on {formatted_date} page {page}: {e}")
            break

Scraping by date:  12%|█▏        | 43/366 [00:40<05:01,  1.07it/s]


KeyboardInterrupt: 

In [None]:
# Assemble the parallel result lists into a single table, one row per article.
df = pd.DataFrame(
    dict(
        zip(
            ["Title", "Article Link", "Date", "Month", "Year", "Day"],
            [titles, article_links, dates, months, years, days],
        )
    )
)

In [None]:
# Write the table to a CSV named after the city (row index omitted), then
# report how many articles it holds.
csv_path = f"{city_name}_articles_last_year.csv"
df.to_csv(csv_path, index=False)
print(f"✅ Total articles collected: {len(df)}")

✅ Total articles collected: 1252


In [None]:
# Preview the first five rows as a sanity check on the collected data.
df.head(n=5)

Unnamed: 0,Title,Article Link,Date,Month,Year,Day
0,Six of ten deceased in A’bad-Vadodara expressw...,https://english.gujaratsamachar.com/news/gujar...,2024-04-17,4,2024,17
1,Ahmedabad cybercrime branch nabs two in ayurve...,https://english.gujaratsamachar.com/news/gujar...,2024-04-17,4,2024,17
2,Accident on Ahmedabad Vadodara Expressway kill...,https://english.gujaratsamachar.com/news/gujar...,2024-04-17,4,2024,17
3,Gujarat Titans seek home advantage against Del...,https://english.gujaratsamachar.com/news/sport...,2024-04-17,4,2024,17
4,Scorching heat hits Amdavadis: 200 heat-relate...,https://english.gujaratsamachar.com/news/gujar...,2024-04-17,4,2024,17
