In [15]:
# List the packages installed in the environment the running kernel uses.
%pip list

Package                                  Version
---------------------------------------- -------------------
absl-py                                  1.0.0
aiohttp                                  3.9.5
aiosignal                                1.3.1
altair                                   5.3.0
annotated-types                          0.6.0
anyio                                    3.5.0
argon2-cffi                              21.3.0
argon2-cffi-bindings                     21.2.0
asgiref                                  3.8.1
asttokens                                2.0.5
astunparse                               1.6.3
async-timeout                            4.0.3
asyncio                                  3.4.3
attrs                                    21.4.0
Babel                                    2.9.1
backcall                                 0.2.0
backoff                                  2.2.1
bcrypt                                   4.1.3
beautifulsoup4                           

In [1]:
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import random

# Room types used to populate Dim_CensusRoomType.
# Each entry is a (room type name, room type group) pair; two of the names
# share the "Memory Care" group.
room_types = [
    ("Private Assisted Living Room",      "Private"),
    ("Semi-Private Assisted Living Room", "Semi-Private"),
    ("Shared Assisted Living Room",       "Shared"),
    ("Memory Care Unit",                  "Memory Care"),
    ("Skilled Nursing Room",              "Skilled Nursing"),
    ("Rehabilitation Room",               "Rehabilitation"),
    ("Palliative Care Unit",              "Palliative Care"),
    ("Alzheimer's Care Unit",             "Memory Care"),
]

# Function to generate a random timestamp between two datetime objects.
def random_timestamp(start, end):
    """Return a random datetime between ``start`` and ``end``.

    The result is ``start`` plus a whole number of seconds drawn with
    ``random.randint`` from 0 up to the (truncated) number of seconds
    separating the two datetimes, both bounds included.
    """
    span_seconds = int((end - start).total_seconds())
    return start + timedelta(seconds=random.randint(0, span_seconds))

# Set the date range for RecordIngestedOn.
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 3, 31)

# Create one row per room type. Keys are 1-based and follow the order of
# room_types; every row gets its own random ingestion timestamp.
data = [
    {
        "RoomTypeKey": i,
        "RoomTypeId": i,
        "RoomTypeCode": f"RT-{i:03d}",  # e.g., RT-001, RT-002, etc.
        "RoomTypeName": room_name,
        "RoomTypeGroup": room_group,
        "RecordIngestedOn": random_timestamp(start_date, end_date).strftime("%Y-%m-%d %H:%M:%S"),
    }
    for i, (room_name, room_group) in enumerate(room_types, start=1)
]

# Convert the list of dictionaries into a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusRoomType.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created with {len(df)} records.")

# Store the same rows in the SQLite database, replacing the table if it exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusRoomType", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusRoomType'.")


CSV file 'Dim_CensusRoomType.csv' created with 8 records.
Data stored in SQLite database 'census.db' in table 'Dim_CensusRoomType'.


In [2]:
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import random

# Function to generate a random timestamp between two datetime objects.
def random_timestamp(start, end):
    """Pick a random datetime in the window from ``start`` to ``end``.

    A whole-second offset is drawn with ``random.randint`` (both ends
    included) and added to ``start``; fractional seconds of the window
    length are dropped before drawing.
    """
    window = end - start
    max_offset = int(window.total_seconds())
    offset = timedelta(seconds=random.randint(0, max_offset))
    return start + offset

# Census status lookup rows, one tuple per status, laid out as
# (CensusStatusKey, CensusStatusCode, CensusStatusName).
statuses = [
    (1, "A", "Active"), (2, "I", "Inactive"), (3, "P", "Pending"),
    (4, "D", "Discharged"), (5, "O", "On Hold"),
]

# Set the date range for RecordIngestedOn.
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 3, 31)

# Create one row per status record, each with its own random ingestion timestamp.
data = [
    {
        "CensusStatusKey": key,
        "CensusStatusCode": code,
        "CensusStatusName": name,
        "RecordIngestedOn": random_timestamp(start_date, end_date).strftime("%Y-%m-%d %H:%M:%S"),
    }
    for key, code, name in statuses
]

# Convert the list to a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusStatus.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created.")

# Create a SQLite database (or connect if it exists) and write the rows to the
# table "Dim_CensusStatus", replacing it if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusStatus", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusStatus'.")


CSV file 'Dim_CensusStatus.csv' created.
Data stored in SQLite database 'census.db' in table 'Dim_CensusStatus'.


In [10]:
import pandas as pd
import sqlite3
import calendar
from datetime import datetime, timedelta

# Define the date range: from 2021-01-01 to 2024-12-31 (both days included).
start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 12, 31)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Every date-valued column is stored as a string with a fixed midnight time part.
_MIDNIGHT_FORMAT = "%Y-%m-%d 00:00:00"
_QUARTER_NAMES = {1: "First", 2: "Second", 3: "Third", 4: "Fourth"}
# Simple fixed-date holidays, keyed by (month, day).
_FIXED_HOLIDAYS = {
    (1, 1): "New Year's Day",
    (7, 4): "Independence Day",
    (12, 25): "Christmas Day",
}


def _census_date_row(dt_date):
    """Build the Dim_CensusDate row for a single calendar day.

    Args:
        dt_date: ``datetime`` of the day to describe (the caller passes one
            midnight value per day of the configured range).

    Returns:
        dict mapping every Dim_CensusDate column name to its value, in
        column order.
    """
    year, month, day = dt_date.year, dt_date.month, dt_date.day

    # Calendar quarter and its first/last day.
    quarter = (month - 1) // 3 + 1
    first_month_of_quarter = (quarter - 1) * 3 + 1
    last_month_of_quarter = first_month_of_quarter + 2
    first_day_of_quarter = datetime(year, first_month_of_quarter, 1)
    last_day_of_quarter = datetime(
        year,
        last_month_of_quarter,
        calendar.monthrange(year, last_month_of_quarter)[1],
    )
    days_into_quarter = (dt_date - first_day_of_quarter).days

    # Day-of-week number in a system where Sunday=1, Monday=2, ..., Saturday=7.
    dow = dt_date.weekday()  # Monday=0 ... Sunday=6
    day_of_week = dow + 2 if dow < 6 else 1

    # Occurrence count of this weekday within the month (1st, 2nd, ...).
    # Same weekdays are exactly 7 days apart, so counting them up to `day`
    # reduces to this closed form.
    day_of_week_in_month = (day - 1) // 7 + 1

    holiday = _FIXED_HOLIDAYS.get((month, day), "")
    is_weekday = dow < 5  # Monday-Friday

    last_day_of_month = dt_date.replace(day=calendar.monthrange(year, month)[1])

    # Fiscal calculations (assume fiscal year starts on October 1).
    # October-December belong to the fiscal year named after the NEXT calendar
    # year; January-September belong to the one named after the current year.
    in_new_fiscal_year = month >= 10
    fiscal_start = datetime(year if in_new_fiscal_year else year - 1, 10, 1)
    fiscal_year_val = year + 1 if in_new_fiscal_year else year
    # Fiscal year ends on September 30 of the fiscal year.
    fiscal_year_end = datetime(fiscal_year_val, 9, 30)
    # Fiscal month: October becomes month 1, November 2, December 3, January 4, etc.
    fiscal_month = month - 9 if in_new_fiscal_year else month + 3
    fiscal_quarter = (fiscal_month - 1) // 3 + 1
    days_into_fiscal_year = (dt_date - fiscal_start).days

    # CensusDateKey: YYYYMMDD as integer; the fiscal date key reuses it.
    key = int(dt_date.strftime("%Y%m%d"))
    month_abbr = dt_date.strftime("%b")

    return {
        "CensusDateKey": key,
        "CensusDateDate": dt_date.strftime(_MIDNIGHT_FORMAT),
        "CensusDateDayInQuarter": days_into_quarter + 1,
        "CensusDateDayName": dt_date.strftime("%A"),
        "CensusDateDayNameAbbrevation": dt_date.strftime("%a"),
        "CensusDateDayOfMonth": day,
        "CensusDateDayOfWeek": day_of_week,
        "CensusDateDayOfWeekInMonth": day_of_week_in_month,
        "CensusDateDayOfYear": dt_date.timetuple().tm_yday,
        "CensusDateFirstDayOfMonth": dt_date.replace(day=1).strftime(_MIDNIGHT_FORMAT),
        "CensusDateFirstDayOfQuarter": first_day_of_quarter.strftime(_MIDNIGHT_FORMAT),
        "CensusDateFirstDayofYear": datetime(year, 1, 1).strftime(_MIDNIGHT_FORMAT),
        "CensusDateHoliday": holiday,
        "CensusDateIsHoliday": bool(holiday),
        "CensusDateIsWeekday": is_weekday,
        "CensusDateIsWeekend": not is_weekday,
        "CensusDateLastDayOfQuarter": last_day_of_quarter.strftime(_MIDNIGHT_FORMAT),
        "CensusDateLastDayofMonth": last_day_of_month.strftime(_MIDNIGHT_FORMAT),
        "CensusDateLastDayofYear": datetime(year, 12, 31).strftime(_MIDNIGHT_FORMAT),
        "CensusDateMonth": month,
        "CensusDateMonthAbbrevation": month_abbr,
        "CensusDateMonthName": dt_date.strftime("%B"),
        "CensusDateMonthOfQuarter": month - first_month_of_quarter + 1,
        "CensusDateQuarter": quarter,
        "CensusDateQuarterName": _QUARTER_NAMES[quarter],
        "CensusDateQuarterShortName": f"Q{quarter}",
        # Week of month: rough calculation (7-day buckets counted from day 1).
        "CensusDateWeekOfMonth": (day - 1) // 7 + 1,
        # Week of quarter: 7-day buckets counted from the first day of the quarter.
        "CensusDateWeekOfQuarter": days_into_quarter // 7 + 1,
        # ISO calendar week: early-January days can report week 52/53 of the
        # previous ISO year.
        "CensusDateWeekOfYear": dt_date.isocalendar()[1],
        "CensusDateYYYYMM": dt_date.strftime("%Y/%m"),
        "CensusDateYear": year,
        "CensusDateYearAndQuarter": f"{year}/Q{quarter}",
        "CensusDateYearMonth": f"{year}/{month_abbr}",
        "CensusDateYearName": f"CY {year}",
        "CensusDateFirstDayOfFiscalYear": fiscal_start.strftime(_MIDNIGHT_FORMAT),
        "CensusDateFiscalDateKey": key,
        "CensusDateFiscalDayOfYear": days_into_fiscal_year + 1,
        "CensusDateFiscalMonth": fiscal_month,
        "CensusDateFiscalQuarter": fiscal_quarter,
        "CensusDateFiscalQuarterName": f"Q{fiscal_quarter}",
        "CensusDateFiscalWeekOfYear": days_into_fiscal_year // 7 + 1,
        "CensusDateFiscalYear": f"FY{fiscal_year_val}",
        "CensusDateIsFirstDayOfFiscalYear": int(dt_date.date() == fiscal_start.date()),
        "CensusDateIsLastOfFiscalYear": int(dt_date.date() == fiscal_year_end.date()),
        "CensusDateLastDayOfFiscalYear": fiscal_year_end.strftime(_MIDNIGHT_FORMAT),
    }


# One row per day; pandas Timestamps are converted to plain Python datetimes first.
data = [_census_date_row(dt.to_pydatetime()) for dt in date_range]

# Convert the list of dictionaries into a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusDate.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created with {len(df)} records.")

# Create (or connect to) the SQLite database and store the data, replacing the
# table if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusDate", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusDate'.")


CSV file 'Dim_CensusDate.csv' created with 1461 records.
Data stored in SQLite database 'census.db' in table 'Dim_CensusDate'.


In [5]:
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import random
from faker import Faker

# Initialize Faker for generating fake names.
# NOTE(review): no seed is passed or set in this cell, so the generated names
# presumably differ from run to run — confirm whether reproducibility matters.
fake = Faker()

# Helper function to generate a random timestamp between two datetime objects.
def random_timestamp(start, end):
    """Return ``start`` shifted forward by a random whole number of seconds.

    The shift is drawn with ``random.randint`` between 0 and the truncated
    number of seconds from ``start`` to ``end`` (both included), so the
    result never falls outside the two datetimes.
    """
    total_seconds = int((end - start).total_seconds())
    shift = random.randint(0, total_seconds)
    return start + timedelta(seconds=shift)

# Helper function to generate a random date between two dates.
def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The number of days is drawn with ``random.randint`` between 0 and
    ``(end - start).days`` (both included); the time-of-day of ``start`` is
    carried over unchanged.
    """
    day_offset = random.randint(0, (end - start).days)
    return start + timedelta(days=day_offset)

# Define ingestion timestamp range (for RecordIngestedOn)
ingest_start = datetime(2025, 1, 1)
ingest_end = datetime(2025, 3, 31)

# Define date of birth range. The latest birth date is 1954-12-31, so every
# resident is at least 70 years old by the start of the 2025 ingestion window.
dob_start = datetime(1935, 1, 1)
dob_end = datetime(1954, 12, 31)

num_residents = 10000

# One row per resident; ResidentKey runs from 1 to num_residents.
data = [
    {
        "ResidentKey": i,
        "ResidentName": fake.name(),
        "ResidentDateOfBirth": random_date(dob_start, dob_end).strftime("%Y-%m-%d"),
        "RecordIngestedOn": random_timestamp(ingest_start, ingest_end).strftime("%Y-%m-%d %H:%M:%S"),
    }
    for i in range(1, num_residents + 1)
]

# Create a DataFrame from the data.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusResident.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created with {num_residents} records.")

# Create (or connect to) the SQLite database and store the data in the
# Dim_CensusResident table, replacing it if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusResident", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusResident'.")


CSV file 'Dim_CensusResident.csv' created with 10000 records.
Data stored in SQLite database 'census.db' in table 'Dim_CensusResident'.


In [6]:
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import random

# County names, one per facility branch; list order fixes the 1-based keys
# assigned when the facility rows are built.
counties = [
    "Los Angeles", "San Francisco", "Orange",
    "Harris", "Dallas", "Travis",
    "Kings", "Queens", "Miami-Dade", "Broward",
]

# Function to generate a random timestamp between two datetime objects.
def random_timestamp(start, end):
    """Return a random datetime no earlier than ``start`` and no later than ``end``.

    Works at one-second resolution: an integer offset in seconds is drawn
    with ``random.randint`` (bounds included) and added to ``start``.
    """
    return start + timedelta(
        seconds=random.randint(0, int((end - start).total_seconds()))
    )

# Set the date range for RecordIngestedOn.
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 3, 31)

# Create one facility row per county name; keys are 1-based and follow the
# order of the counties list.
data = [
    {
        "FacilityKey": i,
        "FacilityCode": f"{i:02d}",  # Two-digit facility code, e.g., "01", "02", etc.
        "FacilityName": f"Reliant care at {county}",
        "RecordIngestedOn": random_timestamp(start_date, end_date).strftime("%Y-%m-%d %H:%M:%S"),
    }
    for i, county in enumerate(counties, start=1)
]

# Convert the list of dictionaries into a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusFacility.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created with {len(df)} records.")

# Create (or connect to) the SQLite database and store the data in the
# Dim_CensusFacility table, replacing it if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusFacility", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusFacility'.")


CSV file 'Dim_CensusFacility.csv' created with 10 records.
Data stored in SQLite database 'census.db' in table 'Dim_CensusFacility'.


In [7]:
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import random

# County names reused as location names (same list as the facility example);
# list order fixes the 1-based keys assigned when the location rows are built.
counties = [
    "Los Angeles",
    "San Francisco",
    "Orange",
    "Harris",
    "Dallas",
    "Travis",
    "Kings",
    "Queens",
    "Miami-Dade",
    "Broward",
]

# Function to generate a random timestamp between two datetime objects.
def random_timestamp(start, end):
    """Draw a random datetime from the range bounded by ``start`` and ``end``.

    The range length is truncated to whole seconds, an offset is drawn with
    ``random.randint`` (bounds included), and ``start`` plus that offset is
    returned.
    """
    seconds_available = int((end - start).total_seconds())
    chosen = random.randint(0, seconds_available)
    return start + timedelta(seconds=chosen)

# Set the date range for RecordIngestedOn.
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 3, 31)

# Create one location row per county name; keys are 1-based and follow the
# order of the counties list.
data = [
    {
        "LocationKey": i,
        "LocationId": str(i),
        "LocationCode": str(i),  # Using the row number as a string for the code.
        "LocationName": county,
        "RecordIngestedOn": random_timestamp(start_date, end_date).strftime("%Y-%m-%d %H:%M:%S"),
    }
    for i, county in enumerate(counties, start=1)
]

# Convert the list of dictionaries into a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusLocation.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created with {len(df)} records.")

# Create (or connect to) the SQLite database and store the data in the
# Dim_CensusLocation table, replacing it if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusLocation", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusLocation'.")


CSV file 'Dim_CensusLocation.csv' created with 10 records.
Data stored in SQLite database 'census.db' in table 'Dim_CensusLocation'.


In [8]:
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import random

# Ten senior-living unit names; list order fixes the 1-based keys assigned
# when the unit rows are built.
unit_names = [
    "Evergreen Way", "Sunset Manor",
    "Rosewood Gardens", "Meadowbrook Place",
    "Maple Grove", "Willow Creek",
    "Cedar Ridge", "Oak Haven",
    "Pineview Terrace", "Silver Springs",
]

# Function to generate a random timestamp between two datetime objects.
def random_timestamp(start, end):
    """Return a random whole-second datetime offset from ``start`` toward ``end``.

    The offset is chosen with ``random.randint`` from 0 to the truncated
    number of seconds between the two datetimes, both bounds included.
    """
    elapsed = end - start
    upper_bound = int(elapsed.total_seconds())
    return start + timedelta(seconds=random.randint(0, upper_bound))

# Set the date range for RecordIngestedOn.
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 3, 31)

# Create one row per unit name; keys are 1-based and follow the order of the
# unit_names list.
data = [
    {
        "UnitKey": i,
        "UnitId": str(i),
        "UnitCode": str(i),  # Using the unit number as the code.
        "UnitName": unit_name,
        "RecordIngestedOn": random_timestamp(start_date, end_date).strftime("%Y-%m-%d %H:%M:%S"),
    }
    for i, unit_name in enumerate(unit_names, start=1)
]

# Convert the list of dictionaries into a DataFrame.
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file.
csv_filename = "Dim_CensusUnit.csv"
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created with {len(df)} records.")

# Create (or connect to) the SQLite database and store the data in the
# Dim_CensusUnit table, replacing it if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df.to_sql("Dim_CensusUnit", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Dim_CensusUnit'.")


CSV file 'Dim_CensusUnit.csv' created with 10 records.
Data stored in SQLite database 'census.db' in table 'Dim_CensusUnit'.


In [18]:
# import pandas as pd
# import sqlite3
# import random
# from datetime import datetime, timedelta

# # Define the range for random date generation (for CensusDateKey and ReportDateKey)
# date_start = datetime(2021, 1, 1)
# date_end = datetime(2024, 12, 31)

# def random_date_key(start, end):
#     """Generate a random date between start and end, then return it in YYYYMMDD integer format."""
#     delta = end - start
#     random_days = random.randint(0, delta.days)
#     date_val = start + timedelta(days=random_days)
#     return int(date_val.strftime("%Y%m%d"))

# # Number of fact records to generate
# num_records = 20000

# fact_data = []
# for i in range(1, num_records + 1):
#     record = {
#         "ResidentKey": random.randint(1, 500),         # From Dim_CensusResident (500 records)
#         "FacilityKey": random.randint(1, 10),            # From Dim_CensusFacility (10 records)
#         "UnitKey": random.randint(1, 10),                # From Dim_CensusUnit (10 records)
#         "LocationKey": random.randint(1, 10),            # From Dim_CensusLocation (10 records)
#         "RoomTypeKey": random.randint(1, 8),             # From Dim_CensusRoomType (8 records)
#         "CensusStatusKey": random.randint(1, 5),         # From Dim_CensusStatus (5 records)
#         "CensusDateKey": random_date_key(date_start, date_end),   # Date key from Dim_CensusDate
#         "ReportDateKey": random_date_key(date_start, date_end),   # Report date key from Dim_CensusDate
#         "CensusId": str(i),                              # Sequential identifier as string
#         "CensusFactId": "CF" + str(i),                   # Unique fact identifier with prefix
#         "Id": str(i),                                    # Unique identifier for data modelling
#         "RecordStatus": "Active"                         # Status of the record
#     }
#     fact_data.append(record)

# # Create a DataFrame for the fact table.
# df_fact = pd.DataFrame(fact_data)

# # Save the DataFrame to a CSV file.
# csv_filename = "Fact_Census.csv"
# df_fact.to_csv(csv_filename, index=False)
# print(f"CSV file '{csv_filename}' created with {num_records} records.")

# # Connect to the SQLite database and store the fact table.
# db_filename = "census.db"
# conn = sqlite3.connect(db_filename)
# df_fact.to_sql("Fact_Census", conn, if_exists='replace', index=False)
# conn.commit()
# conn.close()
# print(f"Data stored in SQLite database '{db_filename}' in table 'Fact_Census'.")


CSV file 'Fact_Census.csv' created with 20000 records.
Data stored in SQLite database 'census.db' in table 'Fact_Census'.


In [9]:
import pandas as pd
import sqlite3
import random
from datetime import datetime, timedelta

# Define your date range
date_start = datetime(2021, 1, 1)
date_end = datetime(2024, 12, 31)

# Create a list of dates (daily frequency)
date_range = pd.date_range(start=date_start, end=date_end, freq='D')

fact_data = []

# For each day and for each location (assumed to be 1 to 10)
for dt in date_range:
    # Format the date once per day; it is reused for the key and every record id.
    date_str = dt.strftime("%Y%m%d")
    # Convert the date into an integer key in YYYYMMDD format
    census_date_key = int(date_str)
    # For each location (LocationKey from 1 to 10)
    for location in range(1, 11):
        # For each day at this location, generate a random number of records between 30 and 50
        num_records_for_day = random.randint(30, 50)
        for i in range(num_records_for_day):
            # Identifier shared by CensusId / Id: <date>_<location>_<1-based sequence>.
            record_id = f"{date_str}_{location}_{i+1}"
            record = {
                # Only resident keys 1-500 are drawn, although Dim_CensusResident
                # is generated with 10,000 rows in this notebook.
                "ResidentKey": random.randint(1, 500),
                "FacilityKey": random.randint(1, 10),          # Random facility
                "UnitKey": random.randint(1, 10),              # Random unit
                "LocationKey": location,                       # Set the current location
                "RoomTypeKey": random.randint(1, 8),           # Random room type
                "CensusStatusKey": random.randint(1, 5),       # Random census status
                "CensusDateKey": census_date_key,              # Use the date as CensusDateKey
                "ReportDateKey": census_date_key,              # Use the same date for ReportDateKey
                "CensusId": record_id,
                "CensusFactId": f"CF_{record_id}",
                "Id": record_id,
                "RecordStatus": "Active"
            }
            fact_data.append(record)

# Convert the collected records to a DataFrame.
df_fact = pd.DataFrame(fact_data)

# Optional: Print summary information
print(f"Total records generated: {len(df_fact)}")

# Save the DataFrame to a CSV file.
csv_filename = "Fact_Census.csv"
df_fact.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created.")

# Store the data in the SQLite database, replacing the table if it already exists.
db_filename = "census.db"
conn = sqlite3.connect(db_filename)
try:
    df_fact.to_sql("Fact_Census", conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even if the write fails, so it is never left open.
    conn.close()
print(f"Data stored in SQLite database '{db_filename}' in table 'Fact_Census'.")


Total records generated: 583378
CSV file 'Fact_Census.csv' created.
Data stored in SQLite database 'census.db' in table 'Fact_Census'.


In [None]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
db_filename = "census.db"
conn = sqlite3.connect(db_filename)

# Define the query: average number of census records per day for the Dallas
# location in 2024, i.e. total matching records divided by the number of
# distinct census dates that have records. The CAST forces real-number
# division; when nothing matches, SQLite returns NULL for the division by zero.
query = """
SELECT CAST(COUNT(ResidentKey) AS REAL) / COUNT(DISTINCT CensusDateKey) AS DailyAvgResidents
FROM Fact_Census
WHERE LocationKey = (SELECT LocationKey FROM Dim_CensusLocation WHERE LocationName = 'Dallas')
AND CensusDateKey IN (SELECT CensusDateKey FROM Dim_CensusDate WHERE CensusDateYear = 2024);
"""

try:
    # Execute the query and load the result into a DataFrame.
    df_result = pd.read_sql_query(query, conn)
finally:
    # Close the database connection even when the query fails (e.g. a missing
    # table), so a failed run does not leave the connection open.
    conn.close()

# Print the query result.
print(df_result)


DatabaseError: Execution failed on sql '
SELECT ResidentName
FROM Dim_CensusResident
WHERE ResidentKey IN (
    SELECT ResidentKey
    FROM Fact_Census
    WHERE LocationKey = (SELECT LocationKey FROM Dim_CensusLocation WHERE LocationName = 'Dallas')
    AND ResidentDateOfBirth <= '1939-02-29'
    AND ResidentDateOfBirth >= '1939-02-01'
);
': no such table: Dim_CensusResident

In [20]:
# Open the database named by db_filename to inspect its schema.
# NOTE(review): the connection is not closed in this cell — confirm whether a
# later cell relies on `conn` before adding conn.close().
conn = sqlite3.connect(db_filename)

# Debug: ask sqlite_master for the name of every table in the database.
cursor = conn.cursor()
tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
print("Available tables in DB:", tables)

Available tables in DB: [('Dim_CensusRoomType',), ('Dim_CensusStatus',), ('Dim_CensusDate',), ('Dim_CensusResident',), ('Dim_CensusFacility',), ('Dim_CensusLocation',), ('Dim_CensusUnit',), ('Fact_Census',)]
