In [None]:
pip install --upgrade snowflake-connector-python

Note: you may need to restart the kernel to use updated packages.


In [None]:

import getpass
import os
from pathlib import Path

import numpy as np
import pandas as pd
import snowflake.connector as sf

# Directory holding the monthly purchase-order CSV files.
# Override it per machine with the PO_DATA_DIR environment variable; the
# fallback below is the original author's local path.
BASE_DIR = Path(
    os.getenv(
        "PO_DATA_DIR",
        r"/home/jovyan/MGTA SQL/final project/SQL_FINAL_PROJECT/Data-5/Monthly PO Data",
    )
).resolve()
# Combined, normalized output file. It is written into BASE_DIR itself.
OUT_CSV  = BASE_DIR / "combined_purchases.csv"

# ---- 1) Combine the monthly CSV files into one normalized CSV ----

# Canonical output columns, in the order they are written to OUT_CSV.
TARGET_COLS = ["PurchaseOrderID","PurchaseOrderLineID","ReceivedOuters",
               "ExpectedUnitPricePerOuter","OrderDate","SupplierID"]

# Accepted source-header spellings for each canonical column, in priority order.
# Both headers and aliases are compared after norm(), so case, underscores and
# spaces do not matter.
ALIASES = {
    "PurchaseOrderID":           ["purchaseorderid","purchase_order_id","poid","orderid"],
    "PurchaseOrderLineID":       ["purchaseorderlineid","purchase_order_line_id","polineid","orderlineid"],
    "ReceivedOuters":            ["receivedouters","received_outers","receivedoutersqty","receivedoutersquantity","received_qty"],
    "ExpectedUnitPricePerOuter": ["expectedunitpriceperouter","expected_unit_price_per_outer","unitpriceperouter","expectedprice","unitprice"],
    "OrderDate":                 ["orderdate","order_date","date","order_dt"],
    "SupplierID":                ["supplierid","supplier_id","vendorid","vendor_id"],
}


def norm(s):
    """Return ``s`` lower-cased with every non-alphanumeric character removed."""
    return "".join(ch.lower() for ch in s if ch.isalnum())


def pick_and_rename(df):
    """Select the six target columns from ``df`` and normalize their values.

    Parameters:
        df: DataFrame read from one monthly CSV; headers may use any spelling
            listed in ALIASES (compared after norm()).

    Returns:
        A new DataFrame with exactly TARGET_COLS, where
        - ReceivedOuters / ExpectedUnitPricePerOuter are numeric, with
          unparseable or missing values replaced by 0;
        - OrderDate is parsed with the format ``%m/%d/%Y``;
        - rows missing PurchaseOrderID, PurchaseOrderLineID or a parseable
          OrderDate are dropped.

    Raises:
        ValueError: if any target column has no matching header in ``df``.
    """
    # Normalized header -> original header; the first header wins on collision.
    rev = {}
    for col in df.columns:
        rev.setdefault(norm(col), col)

    sel = {}
    miss = []
    for tgt, aliases in ALIASES.items():
        # Normalize the alias the same way as the headers. Without this, aliases
        # such as "received_qty" or "order_dt" could never match, because
        # normalized headers contain no underscores.
        found = next((rev[norm(a)] for a in aliases if norm(a) in rev), None)
        if found is None:
            miss.append(tgt)
        else:
            sel[tgt] = found
    if miss: raise ValueError(f"缺少必须列: {miss}; 文件列={list(df.columns)}")

    out = df[[sel[c] for c in TARGET_COLS]].copy()
    out.columns = TARGET_COLS

    # Non-numeric text becomes NaN and is then treated as 0.
    for c in ["ReceivedOuters","ExpectedUnitPricePerOuter"]:
        out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0)

    # Dates that do not match %m/%d/%Y become NaT and the row is dropped below.
    out["OrderDate"] = pd.to_datetime(out["OrderDate"], errors="coerce", format="%m/%d/%Y")
    return out.dropna(subset=["PurchaseOrderID","PurchaseOrderLineID","OrderDate"])


# The combined file acts as a cache: it is only rebuilt when it does not exist.
if not OUT_CSV.exists():
    # OUT_CSV is built from BASE_DIR, the directory being globbed, so exclude it
    # defensively to guarantee it can never be read back in as an input.
    files = sorted(p for p in BASE_DIR.glob("*.csv") if p != OUT_CSV)
    if not files: raise FileNotFoundError(f"{BASE_DIR} 下没有 .csv")

    frames = []
    for p in files:
        # Read everything as text; only "" and "NULL" count as missing.
        df = pd.read_csv(p, dtype=str, keep_default_na=False, na_values=["","NULL"])
        frames.append(pick_and_rename(df))
    combined = pd.concat(frames, ignore_index=True)

    # Write to a temporary name first and rename afterwards, so an interrupted
    # write cannot leave a truncated OUT_CSV that the exists() guard would then
    # treat as complete. The ".tmp" suffix keeps it out of the "*.csv" glob.
    tmp_csv = OUT_CSV.with_name(OUT_CSV.name + ".tmp")
    combined.to_csv(tmp_csv, index=False)
    tmp_csv.replace(OUT_CSV)
    print(f"已生成本地整合：{OUT_CSV}  行数={len(combined):,}")

# ---- 2) To Snowflake ----
# Credentials come from the environment. The password has no hard-coded
# fallback: if SNOW_PASSWORD is unset the user is prompted for it.
# NOTE(review): an earlier revision embedded a real password as the default
# value here; that credential should be considered leaked and be rotated.
snow_password = os.getenv("SNOW_PASSWORD") or getpass.getpass("Snowflake password: ")
conn = sf.connect(
    user=os.getenv("SNOW_USER", "ETHANAN2000"),
    password=snow_password,
    account=os.getenv("SNOW_ACCOUNT", "svogymj-bxb71103"),
)
try:
    cs = conn.cursor()
    try:
        ## change the line below to choose where the data is saved
        WH, DB, SC = "ETL_WH","ETL_DB","ETL_SCHEMA"
        cs.execute(f"CREATE WAREHOUSE IF NOT EXISTS {WH} WAREHOUSE_SIZE=SMALL AUTO_SUSPEND=60 AUTO_RESUME=TRUE")
        cs.execute(f"CREATE DATABASE  IF NOT EXISTS {DB}")
        cs.execute(f"CREATE SCHEMA    IF NOT EXISTS {DB}.{SC}")
        cs.execute(f"USE WAREHOUSE {WH}")
        cs.execute(f"USE DATABASE {DB}")
        cs.execute(f"USE SCHEMA {SC}")

        # Internal stage and CSV file format used by the COPY below.
        # PARSE_HEADER lets COPY match CSV headers to table columns by name.
        cs.execute("CREATE STAGE IF NOT EXISTS purchases_stage")
        cs.execute("""
        CREATE OR REPLACE FILE FORMAT csv_ff
          TYPE=CSV
          FIELD_OPTIONALLY_ENCLOSED_BY='"'
          PARSE_HEADER=TRUE
          NULL_IF=('','NULL')
          TRIM_SPACE=TRUE
        """)

        # Upload the combined file. The URI is "file://" + path, which gives
        # file:///home/... on POSIX and file://C:/... on Windows. It is quoted
        # because the path may contain spaces.
        put_uri = f"file://{OUT_CSV.as_posix()}"
        cs.execute(f"PUT '{put_uri}' @purchases_stage AUTO_COMPRESS=TRUE OVERWRITE=TRUE")

        # The target table is recreated on every run, so the load is a full refresh.
        cs.execute("""
        CREATE OR REPLACE TABLE purchases_detail (
          PurchaseOrderID           STRING,
          PurchaseOrderLineID       STRING,
          ReceivedOuters            NUMBER(18,4),
          ExpectedUnitPricePerOuter NUMBER(18,4),
          OrderDate                 DATE,
          SupplierID                STRING
        )
        """)

        # AUTO_COMPRESS=TRUE stores the staged file gzip-compressed under "<name>.gz".
        staged_name = OUT_CSV.name + ".gz"
        copy_sql = f"""
        COPY INTO purchases_detail
        FROM @purchases_stage/{staged_name}
        FILE_FORMAT=(FORMAT_NAME='csv_ff')
        MATCH_BY_COLUMN_NAME=CASE_INSENSITIVE
        ON_ERROR='ABORT_STATEMENT'
        """
        cs.execute("ALTER SESSION SET DATE_INPUT_FORMAT='AUTO'")
        cs.execute(copy_sql)

        # Sanity check: row count plus a small sample of the loaded data.
        cs.execute("SELECT COUNT(*) FROM purchases_detail")
        print("Snowflake line:", cs.fetchone()[0])
        cs.execute("SELECT * FROM purchases_detail ORDER BY OrderDate, PurchaseOrderID LIMIT 5")
        for r in cs.fetchall(): print(r)
    finally:
        # Always release the cursor, even if a statement above failed.
        cs.close()
finally:
    # Always close the session so a failed run does not leave it open.
    conn.close()

Snowflake 行数: 204575
('1', '1', Decimal('18.0000'), Decimal('5.5000'), datetime.date(2019, 1, 1), '2')
('1', '2', Decimal('21.0000'), Decimal('5.5000'), datetime.date(2019, 1, 1), '2')
('1', '3', Decimal('18.0000'), Decimal('5.5000'), datetime.date(2019, 1, 1), '2')
('1', '2', Decimal('21.0000'), Decimal('5.5000'), datetime.date(2019, 1, 1), '2')
('1', '1', Decimal('18.0000'), Decimal('5.5000'), datetime.date(2019, 1, 1), '2')
