In [1]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Read the decision table
decision_df = pd.read_csv('data_prep/DECISION_TABLE.csv')

# Pre-defined lists for random data
titles = ['Herr', 'Frau', 'Dr.', 'Prof.', 'Dipl.-Ing.']
forenames = ['Anna', 'Hans', 'Maria', 'Peter', 'Klaus', 'Sabine', 'Thomas', 'Julia', 'Michael', 'Petra']
surnames = ['Müller', 'Schmidt', 'Schneider', 'Fischer', 'Weber', 'Meyer', 'Wagner', 'Becker', 'Schulz', 'Hoffmann']
streets = ['Hauptstraße', 'Bahnhofstraße', 'Kirchweg', 'Gartenstraße', 'Schulstraße', 'Bergstraße', 'Waldweg', 'Dorfstraße', 'Lindenallee', 'Marktplatz']
streetnos = ['1', '2', '5', '10', '15', '20', '25', '30', '42', '50', '1a', '3b', '12c']

# Get unique EQUNRs
num_rows = len(decision_df)

# Add random columns
decision_df['TITLE'] = np.random.choice(titles, size=num_rows)
decision_df['FORENAME'] = np.random.choice(forenames, size=num_rows)
decision_df['SURNAME'] = np.random.choice(surnames, size=num_rows)
decision_df['STREET'] = np.random.choice(streets, size=num_rows)
decision_df['STREETNO'] = np.random.choice(streetnos, size=num_rows)
decision_df["EMAIL"] = (decision_df["FORENAME"].str.lower() + "." + decision_df["SURNAME"].str.lower() + "@example.com")

decision_df = decision_df[["EQUNR", "TITLE", "FORENAME", "SURNAME", "STREET", "STREETNO", "CITY1", "CITY2", "POST_CODE1", "EMAIL"]].copy()
decision_df = decision_df.drop_duplicates(subset=["EQUNR"]).reset_index(drop=True)
display(decision_df)

# Save as NET_MASTER.csv
decision_df.to_csv('data_prep/NET_MASTER.csv', index=False)

num_empty = int(len(decision_df) * 0.1)
empty_indices = np.random.choice(decision_df.index, size=num_empty, replace=False)
decision_df = decision_df.drop(index=empty_indices)
display(decision_df)

# Save as SALES_MASTER.csv
decision_df.to_csv('data_prep/SALES_MASTER.csv', index=False)

print(f"Created SALES_MASTER.csv with {len(decision_df)} rows")
print(f"Columns: {list(decision_df.columns)}")

Unnamed: 0,EQUNR,TITLE,FORENAME,SURNAME,STREET,STREETNO,CITY1,CITY2,POST_CODE1,EMAIL


Unnamed: 0,EQUNR,TITLE,FORENAME,SURNAME,STREET,STREETNO,CITY1,CITY2,POST_CODE1,EMAIL


Created SALES_MASTER.csv with 0 rows
Columns: ['EQUNR', 'TITLE', 'FORENAME', 'SURNAME', 'STREET', 'STREETNO', 'CITY1', 'CITY2', 'POST_CODE1', 'EMAIL']


In [2]:
import pandas as pd
import sqlite3
from pathlib import Path

# Database path
DB_PATH = "database.db"

# Data folder path
data_folder = Path("data_prep")

# Dictionary to store DataFrames
dataframes = {}


# Load all CSV files from data folder
csv_files = list(data_folder.glob("*.csv"))
print(f"Found {len(csv_files)} CSV files in data folder:\n")

for csv_file in csv_files:
    table_name = csv_file.stem.lower()  # Use filename without extension as table name
    print(f"Loading {csv_file.name}...")
    df = pd.read_csv(csv_file).fillna("")  # Read all columns as strings TODO: check if it is okay without str
    if table_name.startswith("decision_table"):
        df["HOUSE_NUM1"] = df["HOUSE_NUM1"].apply(lambda x: "1111" if x == "" else x).astype(int)
        df = df.drop(columns=["ENTSCHEIDUNG", "REGEL"]).copy()
        display(df)
    dataframes[table_name] = df
    print(f"  - Shape: {df.shape}")
    print(f"  - Columns: {list(df.columns)[:5]}{'...' if len(df.columns) > 5 else ''}\n")

# Write all DataFrames to SQLite database
print(f"\nWriting tables to {DB_PATH}:")
conn = sqlite3.connect(DB_PATH)

for table_name, df in dataframes.items():
    print(f"  - Writing table '{table_name}'...")
    df.to_sql(table_name, conn, if_exists='replace', index=False)
    
conn.close()

print("\n✓ All tables successfully written to database!")
print(f"Tables created: {list(dataframes.keys())}")

Found 6 CSV files in data folder:

Loading NET_MASTER.csv...
  - Shape: (0, 10)
  - Columns: ['EQUNR', 'TITLE', 'FORENAME', 'SURNAME', 'STREET']...

Loading SALES_MASTER.csv...
  - Shape: (0, 10)
  - Columns: ['EQUNR', 'TITLE', 'FORENAME', 'SURNAME', 'STREET']...

Loading DECISION_TABLE.csv...


Unnamed: 0,HAUS,ANLAGE,ABLEINH,TOUR,ME_MA_ID,CITY1,CITY2,HOUSE_NUM1,POST_CODE1,EQUNR


  - Shape: (0, 10)
  - Columns: ['HAUS', 'ANLAGE', 'ABLEINH', 'TOUR', 'ME_MA_ID']...

Loading EABL.csv...
  - Shape: (0, 11)
  - Columns: ['Meter Reading (MR) Doc. No.', 'EQUNR', 'Geplante Ableseart', 'MR type', 'MR TYPE TEXT']...

Loading EABLG.csv...
  - Shape: (0, 7)
  - Columns: ['Meter Reading (MR) Doc. No.', 'Installation', 'Meter Reading reason', 'MR Reason - Text', 'Scheduled MR Date']...

Loading EANL.csv...
  - Shape: (0, 14)
  - Columns: ['Installation', 'Installation type', 'Record created on', 'Object changed on', 'SPARTE_TEXT']...


Writing tables to database.db:
  - Writing table 'net_master'...
  - Writing table 'sales_master'...
  - Writing table 'decision_table'...
  - Writing table 'eabl'...
  - Writing table 'eablg'...
  - Writing table 'eanl'...

✓ All tables successfully written to database!
Tables created: ['net_master', 'sales_master', 'decision_table', 'eabl', 'eablg', 'eanl']


In [3]:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Get all table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("Database Schema:\n" + "="*80 + "\n")

for table in tables:
    table_name = table[0]
    print(f"Table: {table_name}")
    print("-" * 80)
    
    # Get table info (column names and types)
    cursor.execute(f"PRAGMA table_info({table_name});")
    columns = cursor.fetchall()
    
    for col in columns:
        col_id, col_name, col_type, not_null, default_val, is_pk = col
        pk_marker = " (PRIMARY KEY)" if is_pk else ""
        not_null_marker = " NOT NULL" if not_null else ""
        print(f"  {col_name}: {col_type}{not_null_marker}{pk_marker}")
    
    print("\n")

conn.close()

Database Schema:

Table: decision_talbe
--------------------------------------------------------------------------------
  HAUS: TEXT
  ANLAGE: TEXT
  ABLEINH: TEXT
  TOUR: TEXT
  ME_MA_ID: TEXT
  CITY1: TEXT
  CITY2: TEXT
  HOUSE_NUM1: INTEGER
  POST_CODE1: INTEGER
  EQUNR: TEXT


Table: J09A_step5_net
--------------------------------------------------------------------------------
  HAUS: TEXT
  ANLAGE: TEXT
  ABLEINH: TEXT
  TOUR: TEXT
  ME_MA_ID: TEXT
  CITY1: TEXT
  CITY2: TEXT
  HOUSE_NUM1: INT
  POST_CODE1: INT
  EQUNR: TEXT
  ENTSCHEIDUNG: TEXT
  Meter Reading (MR) Doc. No.: TEXT
  Meter Reading reason: INT
  MR Reason - Text: TEXT
  Scheduled MR Date: TEXT
  Meter Reading unit: TEXT
  NET_TITLE: TEXT
  NET_FORENAME: TEXT
  NET_SURNAME: TEXT
  NET_STREET: TEXT
  NET_STREETNO: TEXT
  NET_CITY1: TEXT
  NET_CITY2: TEXT
  NET_POSTCODE1: INT
  NET_EMAIL: TEXT


Table: J09A_step5_full
--------------------------------------------------------------------------------
  HAUS: TEXT
  ANLA