# Web Scraping
The script iteratively scrapes articles from the culture section of the "Le Figaro" website, extracting each article's title and content, and then compiles this information into a CSV file.

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Function: get_soup(url)
Parameter: `url`, the address of the page to fetch.
Fetches the HTML content from the given URL and returns a BeautifulSoup object for parsing.

In [15]:
# Function to get soup object from URL
def get_soup(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection, which would freeze the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server replies with an unsuccessful
            (4xx/5xx) status code.
        requests.RequestException: For other request failures such as a
            timeout or connection error.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were content.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

## Function: scrape_article(article_url)
Parameter: the URL of the specific article that needs to be scraped.
Extracts the title and main content of an article from its URL.

In [16]:
# Function to scrape an individual article
def scrape_article(article_url):
    """Extract the title and body text of a single article page.

    Args:
        article_url: Absolute URL of the article to scrape.

    Returns:
        A ``(title, content)`` tuple of strings. ``title`` is the text of the
        page's ``<title>`` element and ``content`` is the whitespace-normalised
        text of the ``div.fig-content-body`` element. When either element is
        missing, a "... not found" placeholder string is returned in its place.

    Raises:
        Whatever ``get_soup`` raises when the page cannot be fetched.
    """
    soup = get_soup(article_url)

    # A page without a <title> element would otherwise raise AttributeError
    # on ``None.text``; fall back to a placeholder like the content branch does.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else "Title not found"

    article_content = soup.find('div', class_='fig-content-body')
    if article_content is not None:
        # Splitting and re-joining collapses all runs of whitespace (including
        # the newline separators) into single spaces.
        raw_text = article_content.get_text(separator='\n', strip=True)
        content = ' '.join(raw_text.split())
    else:
        content = "Content not found"

    return title, content

The script then loops over the first seven listing pages of the culture section, scrapes every article linked from them, and saves the collected data to `articles.csv`.

In [17]:
from urllib.parse import urljoin

base_url = 'https://www.lefigaro.fr/culture'

data = []  # One dict per successfully scraped article

# Listing pages 1..7: page 1 is the bare section URL, later pages use ?page=N.
for page_num in range(1, 8):
    page_url = f"{base_url}?page={page_num}" if page_num > 1 else base_url
    try:
        soup = get_soup(page_url)
    except requests.RequestException as exc:
        # One unreachable listing page should not discard the other pages.
        print(f"Skipping listing page {page_url}: {exc}")
        continue

    articles = soup.find_all(
        'article',
        class_='fig-ranking-profile-container fig-ranking-profile-small-picture',
    )
    for article in articles:
        a_tag = article.find('a')
        if a_tag is None or not a_tag.get('href'):
            continue

        # urljoin leaves absolute links untouched and resolves relative or
        # protocol-relative ones against the site, unlike plain concatenation.
        article_url = urljoin(base_url, a_tag['href'])

        try:
            title, content = scrape_article(article_url)
        except requests.RequestException as exc:
            # A single broken article must not abort the run before the CSV
            # is written and lose everything collected so far.
            print(f"Skipping article {article_url}: {exc}")
            continue

        data.append({'articles': content, 'highlights': title})

# Pin the columns so the CSV keeps its header even when nothing was scraped.
df = pd.DataFrame(data, columns=['articles', 'highlights'])

# Sequential 1-based identifier for each scraped article.
df['id'] = range(1, len(df) + 1)

df.to_csv('articles.csv', index=False)

print("Data saved to 'articles.csv'")

Data saved to 'articles.csv'
