In [5]:
!pip install pandas 



In [None]:
import sqlite3
import pandas as pd
from datetime import datetime
from pathlib import Path
from IPython.display import display

# List of CSV files
csv_files = [
    Path(r"D:\BITS_SEM2\DMML_Assignment\Task4_DataPreparation\processed_data\clean_hf.csv"),
    Path(r"D:\BITS_SEM2\DMML_Assignment\Task4_DataPreparation\processed_data\clean_kaggle.csv")
]

# Columns that exist in the SQLite feature store table
table_cols = [
    "customerID", "tenure", "MonthlyCharges", "TotalCharges",
    "Contract_OneYear", "Contract_TwoYear",
    "PaymentMethod_CreditCard", "PaymentMethod_ElectronicCheck",
    "PaymentMethod_MailedCheck", "Churn"
]

# Use context manager to safely open/close the DB
with sqlite3.connect("feature_store.db") as conn:
    cursor = conn.cursor()

    # -------- Step 1: Create metadata table if not exists --------
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS feature_metadata (
        feature_name TEXT PRIMARY KEY,
        description TEXT,
        source TEXT,
        version TEXT,
        created_at TEXT
    )
    ''')

    # -------- Step 2: Create feature store table if not exists --------
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS feature_store (
        customerID TEXT PRIMARY KEY,
        tenure INTEGER,
        MonthlyCharges REAL,
        TotalCharges REAL,
        Contract_OneYear INTEGER,
        Contract_TwoYear INTEGER,
        PaymentMethod_CreditCard INTEGER,
        PaymentMethod_ElectronicCheck INTEGER,
        PaymentMethod_MailedCheck INTEGER,
        Churn INTEGER
    )
    ''')

    # -------- Step 3: Loop through CSV files --------
    for csv_file in csv_files:
        print(f"Processing: {csv_file.name}")
        df = pd.read_csv(csv_file)
        print(f"  Rows in file: {len(df)}")

        # Standardize column names: strip spaces, lowercase
        df.columns = df.columns.str.strip().str.replace(" ", "").str.lower()
        table_cols_lower = [col.lower() for col in table_cols]

        # Keep only columns that exist in both CSV and table
        df_to_insert = df[[col for col in table_cols_lower if col in df.columns]]

        # Rename columns to match SQLite table
        rename_map = dict(zip(df_to_insert.columns, [col for col in table_cols if col.lower() in df_to_insert.columns]))
        df_to_insert = df_to_insert.rename(columns=rename_map)

        # Ensure all table columns exist
        for col in table_cols:
            if col not in df_to_insert.columns:
                if col == "customerID":
                    df_to_insert[col] = ""
                elif col == "Churn":
                    df_to_insert[col] = 0
                else:
                    df_to_insert[col] = 0

        # Reorder columns to match table exactly
        df_to_insert = df_to_insert[table_cols]

        # Insert metadata dynamically
        features_meta = []
        for col in df_to_insert.columns:
            if col == "Churn":
                source = "Label"
                description = "Whether customer churned (0/1)"
            elif col.startswith("Contract") or col.startswith("PaymentMethod"):
                source = "Engineered"
                description = f"Feature: {col}"
            elif col == "customerID":
                source = "ID"
                description = "Unique customer identifier"
            else:
                source = "Telco DB"
                description = f"Feature: {col}"
            features_meta.append((col, description, source, "v1", str(datetime.now())))

        # Insert metadata safely
        try:
            cursor.executemany('''
            INSERT OR IGNORE INTO feature_metadata 
            (feature_name, description, source, version, created_at)
            VALUES (?, ?, ?, ?, ?)
            ''', features_meta)
        except sqlite3.OperationalError as e:
            print("Could not insert metadata:", e)

        # Insert data into the feature store
        records = df_to_insert.to_records(index=False)
        cursor.executemany('''
        INSERT OR REPLACE INTO feature_store VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', records)

    # -------- Step 4: Retrieve and display data --------
    features = pd.read_sql("SELECT * FROM feature_store", conn)
    print("Feature Store Data:")
    display(features)

    metadata = pd.read_sql("SELECT * FROM feature_metadata", conn)
    print("Feature Metadata:")
    display(metadata)


Processing: clean_hf.csv
  Rows in file: 1409


OperationalError: database is locked