In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
import csv

# Root folder that receives everything this script writes (images + CSV).
# Change this to a location that exists on your machine.
base_directory = r"E:\NYX\imgur"

# Labels that are only used to build the output folder / file names below.
category = 'Conceptual art'
subcategory = ' '
image_folder_name = '{}_{}_downloaded_images'.format(category, subcategory)
csv_file_name = '{}_{}_data.csv'.format(category, subcategory)

# Images live in their own sub-folder; the CSV sits directly in the root
# (point csv_directory elsewhere if you want the CSV stored separately).
csv_directory = base_directory
image_directory = os.path.join(base_directory, image_folder_name)

# Create the output directories if they don't exist yet.
os.makedirs(image_directory, exist_ok=True)
os.makedirs(csv_directory, exist_ok=True)

# NOTE: from here on csv_file_name is the full path of the CSV file,
# no longer just its bare file name.
csv_file_name = os.path.join(csv_directory, csv_file_name)

# HTTP headers sent with the listing-page request (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/121.0.0.0 Safari/537.36'
    ),
}

# Function to fetch and process data from each page
def fetch_data_from_page(page_number):
    """Scrape one DeviantArt concept-art listing page.

    Fetches the listing page for *page_number*, locates the embedded JSON
    that carries ``search -> gallery -> assets``, downloads each asset's
    thumbnail into ``image_directory`` (as ``<asset id>.jpg``) and appends one
    row per asset to the CSV at ``csv_file_name``.

    Relies on the module-level names ``headers``, ``image_directory`` and
    ``csv_file_name``.

    Args:
        page_number: Page index substituted into the listing URL's ``page``
            query parameter.

    Returns:
        None. Failures (network error, missing data, failed download) are
        reported with ``print`` and the affected page/asset is skipped; no
        exception is raised for them.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    timeout_seconds = 30

    # The URL of the page to scrape, adjusted for the current page number
    url = f'https://www.deviantart.com/?topic=concept-art&page={page_number}'

    # Fetch the webpage; treat transport errors and HTTP error statuses alike.
    try:
        page_response = requests.get(url, headers=headers, timeout=timeout_seconds)
        page_response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch page {page_number}: {e}")
        return

    # Parse the HTML content
    soup = BeautifulSoup(page_response.text, 'html.parser')

    # Collect every <script type="application/json"> tag on the page
    script_tags = soup.find_all('script', type='application/json')
    if not script_tags:
        print(f"No script tag found on page {page_number}")
        return

    # Search all candidate tags instead of assuming the data is always the
    # second one: a fixed index breaks as soon as the page layout shifts.
    assets = _find_gallery_assets(script_tags)
    if assets is None:
        print(f"No asset data found on page {page_number}")
        return

    # Write the header only when the CSV is new/empty. Keying this on
    # page_number == 1 duplicated the header on every re-run (append mode)
    # and omitted it when a run started on a later page.
    needs_header = (
        not os.path.exists(csv_file_name)
        or os.path.getsize(csv_file_name) == 0
    )

    with open(csv_file_name, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        if needs_header:
            writer.writerow(['ID', 'Image URL', 'Upload Date', 'Title'])

        # Loop through each asset and process it
        for asset in assets:
            # presumably every asset is a dict with these four keys; verify
            # against the live payload. Malformed entries are skipped so one
            # bad asset cannot abort the whole page.
            if not isinstance(asset, dict):
                print(f"Skipping malformed asset entry on page {page_number}")
                continue

            img_url = asset.get('thumbUrl')
            asset_id = asset.get('id')
            if not img_url or asset_id is None:
                print(f"Skipping asset without id/thumbUrl on page {page_number}")
                continue
            upload_date = asset.get('dateSubmitted', '')
            title = asset.get('title', '')

            # Download image (same headers as the page request, bounded wait)
            try:
                image_response = requests.get(
                    img_url, headers=headers, timeout=timeout_seconds
                )
            except requests.RequestException as e:
                print(f"Failed to download image for asset ID {asset_id}: {e}")
            else:
                if image_response.status_code == 200:
                    # NOTE(review): the file is always named .jpg regardless
                    # of the actual image format — confirm this is intended.
                    image_path = os.path.join(image_directory, f"{asset_id}.jpg")
                    with open(image_path, 'wb') as img_file:
                        img_file.write(image_response.content)
                    print(f"Downloaded {image_path}")
                else:
                    print(f"Failed to download image for asset ID {asset_id}")

            # Write data to CSV (recorded even when the download failed)
            writer.writerow([asset_id, img_url, upload_date, title])


def _find_gallery_assets(script_tags):
    """Return the first ``search -> gallery -> assets`` list in *script_tags*.

    Each tag's ``.string`` is parsed as JSON, in document order. Tags that are
    empty, are not valid JSON, or lack the expected nesting are skipped.
    Returns ``None`` when no tag carries such a list.
    """
    for tag in script_tags:
        raw_json = tag.string
        if not raw_json:
            continue
        try:
            payload = json.loads(raw_json)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(payload, dict):
            continue
        search = payload.get('search')
        gallery = search.get('gallery') if isinstance(search, dict) else None
        if isinstance(gallery, dict) and isinstance(gallery.get('assets'), list):
            return gallery['assets']
    return None

# Scrape listing pages 1 through end_page (inclusive), one call per page.
end_page = 1
for i in range(1, end_page + 1):
    fetch_data_from_page(i)

# Printed once all requested pages have been attempted.
print("Data download and CSV generation complete.")


No script tag found on page 1
Data download and CSV generation complete.
