In [15]:
import pandas as pd # for data manipulation and data handling
import numpy as ny #numeric operations array,maths
from sqlalchemy import create_engine , text #build database conn and sql exection
import random #random offset data
from sqlalchemy.types import String,INTEGER,Numeric #define columns types when writing to sql   
import time 
import matplotlib as plt



In [16]:
# Notebook-level engine pointing at the "project1" database on localhost.
# NOTE(review): the password is embedded in the URI — consider reading it from the environment.
engine = create_engine ("postgresql://postgres:123456@localhost:5432/project1") # module-level engine

In [18]:
# Cell value that counts as a positive answer in the condition columns
# (enrich_gold_layer counts the cells equal to this value).
yes = 1
# Default number of rows read from the silver table per run.
SAMPLE_SIZE = 200000
# Source (silver) and destination (gold) schema/table names.
SILVER_SCHEMA = 'silver1'
SILVER_TABLE = 'cleaned'
GOLD_SCHEMA = 'gold1'
GOLD_TABLE = 'presentation'

# Columns selected from the silver table; write_sample_to_gold keeps this order
# when it writes the gold table.
Gold_Columns = [
     'state','survey_month','survey_date','month','day','year',
    'disposition_code','sequence_number','primary_sampling_unit',
    'telephone_number','private_residence','state_residence','cell_phone',
    'num_of_adults','num_of_men','num_of_women',
    'general_health','physical_health_days','mental_health_days','poor_health_days',
    'has_health_plan','has_personal_doctor','medical_cost_issue','last_checkup',
    'high_blood_pressure','high_cholesterol','cholesterol_check','diagnosed_diabetes',
    'had_heart_attack','had_coronary_heart_disease','had_stroke','has_asthma',
    'had_skin_cancer','had_other_cancer','has_copd','has_arthritis',
    'has_depression','had_kidney_disease'
]


# Columns whose positive answers are summed per row to build the enrichment flags.
# NOTE(review): 'cholesterol_check' looks like a screening item rather than a
# diagnosis — confirm that it should be counted here.
disease = [
    'high_blood_pressure', 'high_cholesterol', 'cholesterol_check', 'diagnosed_diabetes',
    'had_heart_attack', 'had_coronary_heart_disease', 'had_stroke', 'has_asthma',
    'had_skin_cancer', 'had_other_cancer', 'has_copd', 'has_arthritis', 'has_depression', 'had_kidney_disease'
]

#Database Connect
def get_engine(uri: str):
    """Build a SQLAlchemy engine for a connection URI.

    Args:
        uri: Database URL handed unchanged to ``create_engine``.

    Returns:
        The engine object produced by ``create_engine(uri)``.
    """
    db_engine = create_engine(uri)
    return db_engine

def table_rowcount(engine, schema: str, table: str) -> int:
    """Return the number of rows in ``schema.table``.

    Args:
        engine: Object exposing ``connect()`` (a SQLAlchemy engine).
        schema: Schema name, interpolated into the SQL text as-is.
        table: Table name, interpolated into the SQL text as-is.

    Returns:
        The single value produced by ``SELECT COUNT(*)``.
    """
    # The identifiers are formatted straight into the statement, so they must
    # come from trusted configuration rather than user input.
    count_sql = f"SELECT COUNT(*) FROM {schema}.{table};"
    with engine.connect() as conn:
        result = conn.exec_driver_sql(count_sql)
        return result.scalar()

def ensure_schemas(engine):
    """Create the silver and gold schemas when they are missing.

    Both ``CREATE SCHEMA IF NOT EXISTS`` statements run inside one
    ``engine.begin()`` block, silver first and gold second.

    Args:
        engine: SQLAlchemy engine used to open the transaction.
    """
    with engine.begin() as conn:
        for schema_name in (SILVER_SCHEMA, GOLD_SCHEMA):
            conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {schema_name};"))


def get_table_columns(engine, schema: str, table: str):
    """Return the column names of ``schema.table`` as a list.

    Args:
        engine: Connectable passed to ``pd.read_sql_query``.
        schema: Schema name, interpolated into the SQL text as-is.
        table: Table name, interpolated into the SQL text as-is.

    Returns:
        The column labels of the frame read from the table, as a list.
    """
    # A one-row probe is enough: only the frame's column labels are used.
    probe_sql = text(f"SELECT * FROM {schema}.{table} LIMIT 1")
    probe = pd.read_sql_query(probe_sql, con=engine)
    return probe.columns.tolist()

def fetch_random_sample(engine, sample_size=SAMPLE_SIZE) -> pd.DataFrame:
    """Read a randomly positioned window of rows from the silver table and enrich it.

    The function counts the rows in the silver table, picks a random starting
    offset such that ``offset + sample_size`` does not run past the end of the
    table, reads the gold columns for that window and passes the result
    through ``enrich_gold_layer``.

    Args:
        engine: Connectable passed to ``pd.read_sql_query``.
        sample_size: Maximum number of rows to read; coerced to ``int``.

    Returns:
        The sampled rows with the enrichment columns added.

    Raises:
        ValueError: If ``sample_size`` is negative.
        RuntimeError: If the silver table contains no rows.
    """
    # Coerce and validate up front: the value is formatted into the SQL text
    # below, and a negative LIMIT would otherwise only fail inside the database.
    sample_size = int(sample_size)
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    count_df = pd.read_sql_query(
        sql=text(f"SELECT COUNT(*) AS c FROM {SILVER_SCHEMA}.{SILVER_TABLE}"),
        con=engine,
    )
    total = int(count_df['c'].iloc[0])
    if total == 0:
        raise RuntimeError(f"{SILVER_SCHEMA}.{SILVER_TABLE} is empty.")

    # Largest offset that still leaves `sample_size` rows after it. Example:
    # 10 rows with a sample of 4 gives offsets 0..6, and offset 6 reads rows
    # 7-10. When the table is smaller than the sample the only offset is 0.
    max_offset = max(0, total - sample_size)
    offset = random.randint(0, max_offset)

    # NOTE: the rows form one contiguous OFFSET/LIMIT window rather than an
    # independent per-row sample, and without ORDER BY the database does not
    # guarantee which rows fall inside that window.
    query = f"""
        SELECT {', '.join(Gold_Columns)}
        FROM {SILVER_SCHEMA}.{SILVER_TABLE}
        OFFSET {offset}
        LIMIT {sample_size}
    """
    df = pd.read_sql_query(sql=text(query), con=engine)
    return enrich_gold_layer(df)

def enrich_gold_layer(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-row disease counts and derived flags to ``df``.

    The columns are added to the frame that was passed in, and that same
    frame is returned.

    Added columns:
        total_disease: number of ``disease`` columns equal to ``yes``.
        has_any_disease: 1 when ``total_disease`` is at least 1, else 0.
        multimorbidity_flag: 1 when ``total_disease`` is at least 2, else 0.
        severe_multimorbidity_flag: 1 when ``total_disease`` is at least 3, else 0.

    Args:
        df: Frame containing every column listed in ``disease``.

    Returns:
        ``df`` with the four columns above attached.
    """
    positive_answers = df[disease] == yes
    df['total_disease'] = positive_answers.sum(axis=1).astype('int')

    # (output column, minimum disease count, stored dtype)
    flag_specs = (
        ('has_any_disease', 1, 'int8'),
        ('multimorbidity_flag', 2, 'int'),
        ('severe_multimorbidity_flag', 3, 'int'),
    )
    for flag_name, minimum, flag_dtype in flag_specs:
        df[flag_name] = (df['total_disease'] >= minimum).astype(flag_dtype)
    return df

def write_sample_to_gold(engine, df: pd.DataFrame) -> int:
    """Write the enriched sample to the gold table, replacing any existing table.

    Args:
        engine: SQLAlchemy engine used to open the write transaction.
        df: Enriched sample; it must contain all four enrichment columns.

    Returns:
        Number of rows in the frame that was written.

    Raises:
        KeyError: If an enrichment column is missing from ``df``.
    """
    enriched_cols = ['total_disease', 'has_any_disease', 'multimorbidity_flag',
                     'severe_multimorbidity_flag']

    # Report the first enrichment column that is absent, if any.
    first_missing = next(
        (col for col in enriched_cols if col not in df.columns), None
    )
    if first_missing is not None:
        raise KeyError(f"Missing expected enriched column: {first_missing}")

    # Gold columns present in the frame keep their configured order; gold
    # columns that are absent from the frame are left out.
    present_gold = [col for col in Gold_Columns if col in df.columns]
    output = df[present_gold + enriched_cols]

    with engine.begin() as conn:
        output.to_sql(
            name=GOLD_TABLE,
            con=conn,
            schema=GOLD_SCHEMA,
            if_exists='replace',
            index=False,
            method='multi',
            chunksize=50_000,
        )
    return len(output)


def run_gold_pipeline(engine):
    """Run the silver-to-gold load: ensure schemas, sample, enrich and write.

    Progress messages and the total elapsed time are printed to stdout. The
    sample size and the table names shown in the messages are taken from the
    module constants, so the output stays accurate if the configuration changes.

    Args:
        engine: SQLAlchemy engine used for every database step.

    Raises:
        RuntimeError: If the fetched sample contains no rows.
        Exception: Any failure is printed with its traceback and re-raised.
    """
    source_name = f"{SILVER_SCHEMA}.{SILVER_TABLE}"
    target_name = f"{GOLD_SCHEMA}.{GOLD_TABLE}"
    start_time = time.perf_counter()

    try:
        ensure_schemas(engine)

        print(f"\n🔹 Sampling {SAMPLE_SIZE:,} rows from {source_name} ...")
        df = fetch_random_sample(engine, SAMPLE_SIZE)
        print(f"✅ Sample fetched: {len(df):,} rows")

        if df.empty:
            raise RuntimeError("Sample is empty")

        print(f"\n🔹 Writing sampled rows to {target_name} ...")
        written = write_sample_to_gold(engine, df)
        print(f"✅ Written to {target_name}: {written:,} rows")

        # Elapsed time covers schema creation, sampling and the write.
        elapsed_min = (time.perf_counter() - start_time) / 60
        print(f"\n🔹 Total time taken in writing data: {elapsed_min:.2f} Minutes")

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        import traceback
        print(" ERROR in pipeline:", e)
        traceback.print_exc()
        raise


# Script entry point: build an engine from the hard-coded URI and run the pipeline.
if __name__ == "__main__":
    # NOTE(review): the password is embedded in the URI, and the database name
    # here is "Project1" while the engine created near the top of the file
    # uses "project1" — confirm which one is intended.
    ENGINE_URI = "postgresql+psycopg2://postgres:123456@localhost:5432/Project1"
    engine = get_engine(ENGINE_URI)  # rebinds the module-level `engine`
    run_gold_pipeline(engine)


🔹 Sampling 200,000 rows from silver1.cleaned ...
✅ Sample fetched: 200,000 rows

🔹 Writing sampled rows to gold1.presentation ...
✅ Written to gold1.presentation: 200,000 rows

🔹 Total time taken in writing data: 5.06 Minutes
