In [1]:
!pip install retrying

import concurrent.futures
import logging
import threading

import pandas as pd
import requests
from bs4 import BeautifulSoup
from retrying import retry

# Root logger: emit INFO and above, each record prefixed with a timestamp and its level
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Running count of successfully scraped articles; incremented by scrape_url_with_retry
c = 0

# Input table; its 'URL' column supplies the pages to scrape
url_df = pd.read_csv('df_hobby.csv')

# Serialises updates to the global article counter `c`: the scraper runs on
# several worker threads and `c += 1` followed by reading `c` is not atomic.
_counter_lock = threading.Lock()

# `class` attribute values of the <div> that may hold the article body,
# tried in this order; the first match wins.
_ARTICLE_DIV_CLASSES = ('_s30J clearfix', 'Normal')


def _is_transient_error(exc):
    """Return True if *exc* is a request failure that a retry might fix."""
    if isinstance(exc, requests.exceptions.HTTPError):
        response = exc.response
        # Most 4xx responses are permanent (bad URL, forbidden, ...);
        # 429 and 5xx may succeed on a later attempt.
        return response is None or response.status_code == 429 or response.status_code >= 500
    return isinstance(
        exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)
    )


@retry(wait_fixed=2000, stop_max_attempt_number=3, retry_on_exception=_is_transient_error)
def _fetch_page(url, timeout):
    """Download *url* and return the response, raising on any failure.

    Exceptions are deliberately NOT caught here so that the ``@retry``
    decorator can see them and re-attempt transient failures.
    """
    try:
        page = requests.get(url, verify=True, timeout=timeout)
    except requests.exceptions.SSLError:
        # SECURITY: falling back to an unverified connection exposes the
        # request to man-in-the-middle tampering. Kept as a best-effort
        # fallback, but made visible in the log.
        logging.warning(f"SSL verification failed for {url}; retrying without verification")
        page = requests.get(url, verify=False, timeout=timeout)
    page.raise_for_status()  # Turn HTTP 4xx/5xx into an exception
    return page


# Function to scrape content from a URL with retries
def scrape_url_with_retry(url, timeout=10):
    """Return the article text found at *url*, or '' if it cannot be scraped.

    The page is fetched with up to three attempts (two seconds apart) for
    transient failures. The text of the first <div> whose class matches one
    of ``_ARTICLE_DIV_CLASSES`` is returned.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up on an
            attempt; without it a stalled connection would block a worker
            thread forever.

    Returns:
        The text of the matching <div>, or an empty string when the request
        fails or no matching <div> exists.
    """
    global c  # Shared article counter defined at module level

    try:
        page = _fetch_page(url, timeout)
    except requests.exceptions.RequestException as e:
        # All attempts failed (or the error was not retryable): report it and
        # return '' so one bad URL does not abort the whole batch.
        logging.error(f"Error accessing URL: {url}. Error: {e}")
        return ''

    soup = BeautifulSoup(page.content, 'html.parser')
    for css_class in _ARTICLE_DIV_CLASSES:
        container = soup.find('div', class_=css_class)
        if container is None:
            continue
        # Increment and read under the lock so every article gets a unique number.
        with _counter_lock:
            c += 1
            article_number = c
        logging.info(f"Scraped article {article_number}")
        return container.text

    return ''
    
# Scrape every URL on a thread pool; map() yields the texts in input order
with concurrent.futures.ThreadPoolExecutor() as pool:
    results = list(pool.map(scrape_url_with_retry, url_df['URL']))

# Store the scraped text and a constant label on the DataFrame,
# then export only those two columns
url_df['text'] = results
url_df['target'] = 'hobbies and interest'
result_df = url_df.loc[:, ['target', 'text']]
result_df.to_csv('scraped_hobby.csv', index=False)





[2023-08-13 16:41:38] [INFO] Scraped article 1
[2023-08-13 16:41:39] [INFO] Scraped article 2
[2023-08-13 16:41:39] [INFO] Scraped article 3
[2023-08-13 16:41:39] [INFO] Scraped article 4
[2023-08-13 16:41:39] [INFO] Scraped article 5
[2023-08-13 16:41:39] [INFO] Scraped article 6
[2023-08-13 16:41:39] [INFO] Scraped article 7
[2023-08-13 16:41:39] [INFO] Scraped article 8
[2023-08-13 16:41:39] [INFO] Scraped article 9
[2023-08-13 16:41:39] [INFO] Scraped article 10
[2023-08-13 16:41:39] [INFO] Scraped article 11
[2023-08-13 16:41:39] [INFO] Scraped article 12
[2023-08-13 16:41:40] [INFO] Scraped article 13
[2023-08-13 16:41:40] [INFO] Scraped article 14
[2023-08-13 16:41:40] [INFO] Scraped article 15
[2023-08-13 16:41:40] [INFO] Scraped article 16
[2023-08-13 16:41:40] [INFO] Scraped article 17
[2023-08-13 16:41:40] [INFO] Scraped article 18
[2023-08-13 16:41:40] [INFO] Scraped article 19
[2023-08-13 16:41:40] [INFO] Scraped article 20
[2023-08-13 16:41:41] [INFO] Scraped article 21
[