In [45]:
!pip install faker
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from faker import Faker

# Shared Faker instance; the table generators below call it for book
# titles and for author, member and staff names.
faker = Faker()

# Number of samples
# NOTE(review): the generation calls visible at the bottom of this file pass
# explicit sizes and do not read `n` — confirm whether it is still needed.
n = 1000

# --------------------
# Generate Books Table
# --------------------
def create_books_table(n):
    """Build a synthetic books catalogue with ``n`` rows.

    Args:
        n: Number of book rows to generate.

    Returns:
        A DataFrame with the columns ISBN, Title, Author, Genre, Language,
        Publication_Year, Total_Copies, Available_Copies, Publisher and
        Book_Condition. ISBN values are distinct within the table, and
        roughly 15% of the Available_Copies values are NaN.
    """
    genres = ['Fiction', 'Non-fiction', 'Science', 'History', 'Biography']
    languages = ['English', 'Spanish', 'French', 'German', 'Japanese']
    publishers = ['Penguin', 'HarperCollins', 'Macmillan', 'Random House', 'Simon & Schuster']
    conditions = ['New', 'Good', 'Worn', 'Damaged']

    # ISBN identifies a book, so duplicates are rejected explicitly instead
    # of relying on random collisions being unlikely.
    seen_codes = set()
    isbns = []
    while len(isbns) < n:
        # dtype is spelled out because the upper bound does not fit in a
        # 32-bit integer, which is NumPy's default integer on some platforms.
        code = int(np.random.randint(1000000000, 9999999999, dtype=np.int64))
        if code not in seen_codes:
            seen_codes.add(code)
            isbns.append(f"978-{code}")

    titles = [faker.catch_phrase() for _ in range(n)]  # Realistic book titles
    authors = [faker.name() for _ in range(n)]  # Realistic author names
    genres_data = np.random.choice(genres, size=n)
    languages_data = np.random.choice(languages, size=n)
    publication_years = np.random.randint(1900, 2023, n)  # 1900..2022 (upper bound exclusive)
    total_copies = np.random.randint(5, 50, n)  # 5..49 copies in inventory
    available_copies = np.where(
        np.random.rand(n) < 0.85,
        total_copies - np.random.randint(0, 5, n),  # 0..4 copies currently out
        np.nan  # value deliberately missing for the remaining ~15%
    )
    publishers_data = np.random.choice(publishers, size=n)
    book_conditions = np.random.choice(conditions, size=n)

    # Assemble the table
    books_df = pd.DataFrame({
        'ISBN': isbns,
        'Title': titles,
        'Author': authors,
        'Genre': genres_data,
        'Language': languages_data,
        'Publication_Year': publication_years,
        'Total_Copies': total_copies,
        'Available_Copies': available_copies,
        'Publisher': publishers_data,
        'Book_Condition': book_conditions,
    })

    return books_df

# --------------------
# Generate Members Table
# --------------------
def create_members_table(n):
    """Build a synthetic library-members table with ``n`` rows.

    Args:
        n: Number of member rows to generate.

    Returns:
        A DataFrame with the columns Member_ID, Name, Member_Type, Join_Date
        and Fine_Outstanding. Roughly 20% of the Fine_Outstanding values
        are NaN.
    """
    # One reference timestamp for the whole table, so every join date is
    # offset from the same "today" even if generation straddles midnight.
    today = datetime.now()

    member_ids = [f"M-{i + 1:05d}" for i in range(n)]
    names = [faker.name() for _ in range(n)]  # Realistic member names
    member_types = ['Regular', 'Premium']
    member_type_data = np.random.choice(member_types, size=n, p=[0.7, 0.3])  # More Regular members
    # Join dates fall 1000..4999 days before today (upper bound exclusive).
    join_dates = [
        (today - timedelta(days=np.random.randint(1000, 5000))).strftime('%Y-%m-%d')
        for _ in range(n)
    ]
    fines_outstanding = np.where(
        np.random.rand(n) < 0.8,
        np.random.uniform(0, 100, size=n).round(2),  # ~80% get a fine value in [0, 100]
        np.nan  # the rest are left missing
    )

    # Assemble the table
    members_df = pd.DataFrame({
        'Member_ID': member_ids,
        'Name': names,
        'Member_Type': member_type_data,
        'Join_Date': join_dates,
        'Fine_Outstanding': fines_outstanding,
    })

    return members_df


# --------------------
# Generate Staff Table
# --------------------
def create_staff_table(n):
    """Build a synthetic library-staff table with ``n`` rows.

    Args:
        n: Number of staff rows to generate.

    Returns:
        A DataFrame with the columns Staff_ID, Name, Role, Hire_Date and
        Weekly_Hours.
    """
    # One reference timestamp for the whole table, so every hire date is
    # offset from the same "today" even if generation straddles midnight.
    today = datetime.now()

    staff_ids = [f"S-{i + 1:04d}" for i in range(n)]
    names = [faker.name() for _ in range(n)]  # Realistic staff names
    roles = ['Librarian', 'Assistant']
    role_data = np.random.choice(roles, size=n, p=[0.5, 0.5])  # Equal split
    # Hire dates fall 1000..4999 days before today (upper bound exclusive).
    hire_dates = [
        (today - timedelta(days=np.random.randint(1000, 5000))).strftime('%Y-%m-%d')
        for _ in range(n)
    ]
    weekly_hours = np.random.randint(20, 40, n)  # 20..39 hours (upper bound exclusive)

    # Assemble the table
    staff_df = pd.DataFrame({
        'Staff_ID': staff_ids,
        'Name': names,
        'Role': role_data,
        'Hire_Date': hire_dates,
        'Weekly_Hours': weekly_hours,
    })

    return staff_df


# ------------------------
# Generate Borrow Records Table
# ------------------------
def create_borrow_records(n, members_df, books_df):
    """Build a synthetic borrow-transactions table with ``n`` rows.

    Args:
        n: Number of transactions to generate.
        members_df: DataFrame with a 'Member_ID' column to sample borrowers from.
        books_df: DataFrame with an 'ISBN' column to sample borrowed books from.

    Returns:
        A DataFrame with the columns Transaction_ID, Member_ID, ISBN,
        Borrow_Date, Due_Date, Return_Date and Fine_Amount. Dates are
        'YYYY-MM-DD' strings; Return_Date is None for the roughly 20% of
        loans that have not been returned.
    """
    date_format = '%Y-%m-%d'
    # One reference timestamp for the whole table, so every borrow date is
    # offset from the same "today" even if generation straddles midnight.
    today = datetime.now()

    transaction_ids = [f"T-{i + 1:05d}" for i in range(n)]
    member_ids = np.random.choice(members_df['Member_ID'], size=n)
    isbns = np.random.choice(books_df['ISBN'], size=n)

    # Keep real datetimes until the final formatting step instead of
    # formatting to text and parsing the text back for each derived date.
    borrowed_on = [today - timedelta(days=np.random.randint(5, 365)) for _ in range(n)]
    borrow_dates = [moment.strftime(date_format) for moment in borrowed_on]
    due_dates = [(moment + timedelta(days=30)).strftime(date_format) for moment in borrowed_on]

    # The returned/not-returned mask is kept and reused for the fines below,
    # rather than being recovered afterwards by comparing the dates to None.
    is_returned = np.random.rand(n) < 0.8
    days_kept = [np.random.randint(5, 60) for _ in range(n)]  # 5..59 days on loan
    return_dates = [
        (moment + timedelta(days=kept)).strftime(date_format) if returned else None
        for moment, kept, returned in zip(borrowed_on, days_kept, is_returned)
    ]

    # Only loans that are still out carry a fine; returned loans are 0.
    # NOTE(review): fines ignore Due_Date, so a loan returned late has no fine
    # and an unreturned loan that is not yet due can have one — confirm this
    # is intended for the synthetic data.
    fines = np.where(is_returned, 0, np.random.uniform(0, 50, size=n).round(2))

    # Assemble the table
    borrow_df = pd.DataFrame({
        'Transaction_ID': transaction_ids,
        'Member_ID': member_ids,
        'ISBN': isbns,
        'Borrow_Date': borrow_dates,
        'Due_Date': due_dates,
        'Return_Date': return_dates,
        'Fine_Amount': fines,
    })

    return borrow_df


# --------------------
# Generate and Save Data
# --------------------
# Build the four tables. Borrow records sample their Member_ID and ISBN
# values from the members and books tables, so those are created first.
books_df = create_books_table(1000)
members_df = create_members_table(300)
staff_df = create_staff_table(50)
borrow_df = create_borrow_records(2000, members_df, books_df)

# Write every table to its own CSV file, leaving out the index column.
for csv_name, table in (
    ('Books.csv', books_df),
    ('Members.csv', members_df),
    ('Staff.csv', staff_df),
    ('Borrow_Records.csv', borrow_df),
):
    table.to_csv(csv_name, index=False)

print("Data generation complete! Files saved as CSV.")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Data generation complete! Files saved as CSV.
