In [57]:
pip install --upgrade snowflake-connector-python

Note: you may need to restart the kernel to use updated packages.


In [58]:
import snowflake.connector

In [59]:
conn = snowflake.connector.connect(
    user= 'mnonog',
    password = 'KayaNatinToL0RD!',
    account='cejmwpt-djb91267'
    )

#https://cejmwpt-djb91267.snowflakecomputing.com

In [60]:
with conn.cursor() as cs:
    # warehouse + db/schema setup
    cs.execute("CREATE WAREHOUSE IF NOT EXISTS my_first_warehouse")
    cs.execute("CREATE DATABASE IF NOT EXISTS testdb")
    cs.execute("USE DATABASE testdb")          # 👈 set current DB
    cs.execute("CREATE SCHEMA IF NOT EXISTS testschema")
    cs.execute( "CREATE OR REPLACE TABLE "    
             "test_table(col1 integer, col2 string)")
    cs.execute("INSERT INTO test_table(col1, col2) "
    		"VALUES (123, 'test string1'), (456, 'test string2')")
    
    #  stages 
    cs.execute("CREATE STAGE IF NOT EXISTS purchases_stage")
    cs.execute("CREATE STAGE IF NOT EXISTS invoices_stage")
    cs.execute("CREATE STAGE IF NOT EXISTS supplier_stage")
    
    # File formats
    cs.execute("""
        CREATE OR REPLACE FILE FORMAT csv_ff 
        TYPE = CSV 
        FIELD_OPTIONALLY_ENCLOSED_BY = '"'
        SKIP_HEADER = 1 
        NULL_IF = ('', 'NULL')
    """)
    
    cs.execute("""
        CREATE OR REPLACE FILE FORMAT xml_ff 
        TYPE = XML 
        STRIP_OUTER_ELEMENT = TRUE
    """)
    # cs.execute('SELECT * FROM test_table')
    # print(cs.fetchmany(2))
conn.close()



In [61]:
%pip install psycopg2-binary pandas lxml python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [64]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import snowflake.connector as sf


WH, DB, SC = "ETL_WH", "ETL_DB", "ETL_SCHEMA"
BASE_DIR = Path(r"/home/jovyan/Desktop/SQL/SQL_FINAL_PROJECT/Data-5/Monthly PO Data").resolve()
OUT_CSV  = BASE_DIR / "combined_purchases.csv"   # 统一输出文件

# 2) 需要保留并标准化到 Snowflake 的列（目标列名）
TARGET_COLS = [
    "PurchaseOrderID",
    "PurchaseOrderLineID",
    "ReceivedOuters",
    "ExpectedUnitPricePerOuter",
    "OrderDate",
    "SupplierID",
]

# 3) 列名别名表（尽量覆盖你文件里的各种写法；不够的话再加几种）
ALIASES = {
    "PurchaseOrderID":           ["purchaseorderid", "purchase_order_id", "poid", "orderid"],
    "PurchaseOrderLineID":       ["purchaseorderlineid", "purchase_order_line_id", "polineid", "orderlineid"],
    "ReceivedOuters":            ["receivedouters", "received_outers", "receivedoutersqty", "receivedoutersquantity", "received_qty"],
    "ExpectedUnitPricePerOuter": ["expectedunitpriceperouter", "expected_unit_price_per_outer", "unitpriceperouter", "expectedprice", "unitprice"],
    "OrderDate":                 ["orderdate", "order_date", "date", "order_dt"],
    "SupplierID":                ["supplierid", "supplier_id", "vendorid", "vendor_id"],
}

def norm(name: str) -> str:
    """规范化列名：只保留字母数字并小写，便于匹配"""
    return "".join(ch.lower() for ch in name if ch.isalnum())

def pick_and_rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """从原始 df 里按别名表挑出需要的列，并重命名为标准列名"""
    orig_to_norm = {c: norm(c) for c in df.columns}
    norm_to_orig = {v: k for k, v in orig_to_norm.items()}  # 取第一个出现的映射即可

    selected = {}
    missing  = []
    for tgt, alias_list in ALIASES.items():
        found = None
        for alias in alias_list:
            if alias in norm_to_orig:
                found = norm_to_orig[alias]
                break
        if found is None:
            missing.append(tgt)
        else:
            selected[tgt] = found

    if missing:
        raise ValueError(f"缺少必须列：{missing}; 文件列={list(df.columns)}")


    out = df[[selected[c] for c in TARGET_COLS]].copy()
    out.columns = TARGET_COLS
    return out

frames = []
csv_files = sorted(BASE_DIR.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"在 {BASE_DIR} 下没有找到 .csv 文件")

for p in csv_files:
    df = pd.read_csv(p, dtype=str, keep_default_na=False, na_values=["", "NULL"])
    df_std = pick_and_rename_columns(df)


    for c in ["ReceivedOuters", "ExpectedUnitPricePerOuter"]:
        df_std[c] = pd.to_numeric(df_std[c], errors="coerce")

    df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)


    df_std = df_std.dropna(subset=["PurchaseOrderID", "PurchaseOrderLineID", "OrderDate"])

    df_std["ReceivedOuters"] = df_std["ReceivedOuters"].fillna(0)
    df_std["ExpectedUnitPricePerOuter"] = df_std["ExpectedUnitPricePerOuter"].fillna(0)

    frames.append(df_std)

combined = pd.concat(frames, ignore_index=True)

combined.to_csv(OUT_CSV, index=False)
print(f"combine to 1：{OUT_CSV} line={len(combined):,}")

conn = sf.connect(
    user=os.getenv("SNOW_USER", "ETHANAN2000"),
    password=os.getenv("SNOW_PASSWORD", "An67087833@123"),
    account=os.getenv("SNOW_ACCOUNT", "svogymj-bxb71103") 
)
cs = conn.cursor()

WH, DB, SC = "ETL_WH", "ETL_DB", "ETL_SCHEMA"
cs.execute(f"CREATE WAREHOUSE IF NOT EXISTS {WH} WAREHOUSE_SIZE=SMALL AUTO_SUSPEND=60 AUTO_RESUME=TRUE")
cs.execute(f"CREATE DATABASE  IF NOT EXISTS {DB}")
cs.execute(f"CREATE SCHEMA    IF NOT EXISTS {DB}.{SC}")
cs.execute(f"USE WAREHOUSE {WH}")
cs.execute(f"USE DATABASE {DB}")
cs.execute(f"USE SCHEMA {SC}")

cs.execute("CREATE STAGE IF NOT EXISTS purchases_stage")
cs.execute("""
CREATE OR REPLACE FILE FORMAT csv_ff
  TYPE = CSV
  FIELD_OPTIONALLY_ENCLOSED_BY = '\"'
  PARSE_HEADER = TRUE
  NULL_IF = ('','NULL')
  TRIM_SPACE = TRUE
""")

cs.execute("""
CREATE OR REPLACE TABLE purchases_detail (
  PurchaseOrderID           STRING,
  PurchaseOrderLineID       STRING,
  ReceivedOuters            NUMBER(18,4),
  ExpectedUnitPricePerOuter NUMBER(18,4),
  OrderDate                 DATE,
  SupplierID                STRING
)
""")

# See what's in the stage (helps confirm the .gz name)
cs.execute("LIST @purchases_stage")
print(cs.fetchall())

# Re-upload the merged file (overwrites same name only)
cs.execute(f"PUT 'file:///{OUT_CSV.as_posix()}' @purchases_stage AUTO_COMPRESS=TRUE OVERWRITE=TRUE")

# Load ONLY the merged file
cs.execute("""
COPY INTO purchases_detail
FROM @purchases_stage
FILE_FORMAT = (FORMAT_NAME = csv_ff)
MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
PATTERN = '.*combined_purchases\\.csv\\.gz'
ON_ERROR = 'ABORT_STATEMENT'
""")


# 10) 校验
cs.execute("SELECT COUNT(*) FROM purchases_detail")
print("Snowflake 行数:", cs.fetchone()[0])
cs.execute("SELECT * FROM purchases_detail ORDER BY OrderDate, PurchaseOrderID LIMIT 5")
for row in cs.fetchall():
    print(row)

cs.close(); conn.close()


  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDate"], errors="coerce", infer_datetime_format=True)
  df_std["OrderDate"] = pd.to_datetime(df_std["OrderDat

combine to 1：/home/jovyan/Desktop/SQL/SQL_FINAL_PROJECT/Data-5/Monthly PO Data/combined_purchases.csv line=49,098
[('purchases_stage/2019-1.csv.gz', 6784, 'f983bbb141781bcf2a7be159f7a285dd', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-10.csv.gz', 3328, '6c32fccfb404358be0857b6582f6c3ec', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-11.csv.gz', 3056, 'cb143ac537c9bded77a1ea95a40b73bb', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-12.csv.gz', 2992, 'cfa1e94167f572f7c5125f51bd4937c6', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-2.csv.gz', 2384, 'c6d1126a97402fe7e31ac5d7fcddb3c5', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-3.csv.gz', 2848, 'fc3764367922a620bb821fadd6128829', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-4.csv.gz', 3088, 'b2b4d839e5980b278e3576d9ba584e04', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchases_stage/2019-5.csv.gz', 3200, '1750b6702598a947d517b4b5d26ce157', 'Tue, 9 Sep 2025 07:06:11 GMT'), ('purchase