# In [3]:
import os
import sys
import django
import csv
import re

# Setup Django environment
def setup_django(project_path):
    """Make the project importable and initialise Django.

    Appends ``project_path`` to ``sys.path``, sets the
    ``DJANGO_SETTINGS_MODULE`` environment variable to
    ``library_api.settings`` unless it is already set, and then calls
    ``django.setup()``.
    """
    # setdefault keeps any value already present in the environment.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "library_api.settings")
    sys.path.append(project_path)
    django.setup()

# Tunable limits for the extraction and insertion run
CHUNK_SIZE = 250      # Number of authors handed to the insert step per batch
MAX_AUTHORS = 500     # Filtering stops once this many authors have passed validation
TOTAL_AUTHORS = 1000  # Total number of authors to process (not referenced in the code shown here)

def is_english(s):
    """Return True when ``s`` is non-empty and consists solely of ASCII characters."""
    ascii_only = re.match(r'^[\x00-\x7F]+$', s)
    return ascii_only is not None

def has_valid_photo(image_url):
    """Tell whether ``image_url`` looks like a real author photo.

    The URL is rejected when it contains the substring ``nophoto``; otherwise
    it must end in ``.jpg``, ``.jpeg``, ``.png`` or ``.gif`` (lower-case, with
    at least one character before the dot).
    """
    if "nophoto" in image_url:
        return False
    extension_match = re.match(r'.+\.(jpg|jpeg|png|gif)$', image_url)
    return extension_match is not None

def extract_books(about):
    """Pull ``written <title> (<yyyy>)`` mentions out of an author's 'about' text.

    Returns a list of dicts with the keys ``title`` and ``published_date``
    (the four-digit year as a string), in the order the mentions appear.
    The list is empty when the text has no such mention.
    """
    if "written" not in about:
        return []

    return [
        {"title": raw_title.strip(), "published_date": raw_year.strip()}
        for raw_title, raw_year in re.findall(r'written (.*?)\s*\((\d{4})\)', about)
    ]

def filter_authors(input_file, output_file):
    """Filter authors based on defined criteria and save to a new CSV file.

    A row is kept when its ``name``, ``image_url`` and ``about`` columns are
    all non-empty, the name passes ``is_english``, the image URL passes
    ``has_valid_photo``, and ``extract_books`` finds at least one book in the
    tag-stripped ``about`` text. Reading stops once ``MAX_AUTHORS`` rows have
    been kept.

    The output CSV has the columns ``name``, ``image_url``, ``about`` and
    ``books_written``; the last one is a ``"; "``-joined list of
    ``Title (year)`` entries. No file is written when no row qualifies.

    Errors are reported on stdout instead of being raised.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to write.
    """
    filtered_authors = []

    try:
        # newline='' lets the csv module handle line endings inside quoted fields.
        with open(input_file, mode='r', encoding='utf-8', newline='') as file:
            for row in csv.DictReader(file):
                if len(filtered_authors) >= MAX_AUTHORS:
                    break  # Stop if we have reached the maximum valid authors

                # DictReader fills columns missing from a short row with None,
                # so the default given to .get() is not enough; `or ""` keeps
                # one malformed row from aborting the whole run.
                name = (row.get("name") or "").strip()
                image_url = (row.get("image_url") or "").strip()
                about = (row.get("about") or "").strip()

                if not (name and image_url and about):
                    continue  # Skip rows with a missing field
                if not is_english(name):
                    continue  # Skip non-English names
                if not has_valid_photo(image_url):
                    continue  # Skip invalid photo URLs

                # Drop HTML tags before looking for book mentions.
                about_clean = re.sub(r'<.*?>', '', about)
                books_written = extract_books(about_clean)
                if not books_written:
                    continue  # Skip authors with no recognisable books

                filtered_authors.append({
                    "name": name,
                    "image_url": image_url,
                    "about": about_clean,
                    "books_written": books_written
                })

        # Write filtered authors to a new CSV file if we found valid authors
        if filtered_authors:
            with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
                fieldnames = ["name", "image_url", "about", "books_written"]
                writer = csv.DictWriter(outfile, fieldnames=fieldnames)
                writer.writeheader()
                for author in filtered_authors:
                    books_info = "; ".join(
                        f"{book['title']} ({book['published_date']})"
                        for book in author['books_written']
                    )
                    # Write a copy so the collected records keep their book lists.
                    writer.writerow({**author, "books_written": books_info})

            print(f"Filtered authors saved to '{output_file}'.")
            print(f"Successfully validated {len(filtered_authors)} authors.")
        else:
            print("No valid authors found.")

    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except csv.Error as e:
        print(f"Error reading CSV file: {e}")
    except Exception as e:
        # Script-level boundary: report and return rather than crash the run.
        print(f"An unexpected error occurred: {e}")

def insert_data_in_chunks(authors_data):
    """Walk ``authors_data`` in ``CHUNK_SIZE`` slices and report progress.

    No database write happens here yet: each slice is only counted, a
    progress line is printed per slice, and a summary line is printed at
    the end.
    """
    running_total = 0
    batch_starts = range(0, len(authors_data), CHUNK_SIZE)

    for batch_no, start in enumerate(batch_starts, start=1):
        batch = authors_data[start:start + CHUNK_SIZE]
        # Insert chunk into the database (implement your logic here)

        # Simulate insertion for demonstration purposes
        batch_len = len(batch)
        running_total += batch_len
        print(f"Inserting chunk {batch_no}: {batch_len} authors inserted. Total inserted so far: {running_total}.")

    print(f"All {running_total} authors have been inserted successfully.")

if __name__ == "__main__":
    project_path = '/home/code/library_api'  # Adjust the path as necessary
    setup_django(project_path)

    # Define input and output CSV files
    input_csv_file = "authors.csv"
    output_csv_file = "authors_filtered.csv"

    # Filter authors and save to output CSV
    filter_authors(input_csv_file, output_csv_file)

    # filter_authors writes no file when nothing qualifies or when the input
    # is missing, so a missing output file means "nothing to insert", not a
    # crash.
    # NOTE: a file left over from an earlier run would still be picked up.
    try:
        # newline='' lets the csv module handle line endings inside quoted fields.
        with open(output_csv_file, mode='r', encoding='utf-8', newline='') as file:
            authors_data = list(csv.DictReader(file))
    except FileNotFoundError:
        authors_data = []

    if authors_data:
        print(f"Ready to insert {len(authors_data)} authors.")
        insert_data_in_chunks(authors_data)
    else:
        print("No authors to insert. Aborting.")


# Output:
# Filtered authors saved to 'authors_filtered.csv'.
# Successfully validated 500 authors.
# Ready to insert 500 authors.
# Inserting chunk 1: 250 authors inserted. Total inserted so far: 250.
# Inserting chunk 2: 250 authors inserted. Total inserted so far: 500.
# All 500 authors have been inserted successfully.
