필요 라이브러리 다운

In [1]:
pip install pandas numpy scikit-learn lightgbm joblib openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


데이터 기본 전처리

In [2]:
import pandas as pd
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# 파일 로딩
file_path = "C:\\Users\\정하민\\Downloads\\선용품 표준코드 목록 - ALL.xlsx"
df = pd.read_excel(file_path)

# 전처리 클래스 정의
class SimplePreprocessor:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.stop_words = set(["OF", "FOR", "WITH", "AND", "THE", "A", "AN", "TO", "IN", "ON", "BY"])
        self.unit_dict = self._learn_units()
        self.synonyms = self._learn_synonyms()
        self.manual_synonyms = {
    "SET": ["SET", "SETS", "KIT", "KITS", "KITAGAWA", "KITGRACO", "LIGHTINGKIT"],
    "BOX": ["BOX", "BOXES", "BOXING", "CASE", "CASES", "CARTON"],
    "BOTTLE": ["BTL", "BTLS", "BOTTLE", "BOTTLES", "FLASK"],
    "SPOON": ["SPOON", "SPOONS", "TEASPOON", "TABLESPOON", "LADLE"],
    "GLASS": ["GLASS", "GLASSES", "GLASSWARE", "FIBERGLASS", "FIBREGLASS", "MARINEGLASS"],
    "JAR": ["JAR", "JARS", "JARLSBERG", "CANISTER", "CONTAINER"],
    "MUG": ["MUG", "MUGS", "CUP", "CUPS", "TUMBLER", "STEIN"],
    "TRAY": ["TRAY", "TRAYS", "PLATE", "PLATES", "DISH", "DISHES"],
    "TAPE": ["TAPE", "TAPES", "BAND", "STRAP", "STRAPS"],
    "HANDLE": ["HANDLE", "KNOB", "GRIP", "KNOBSET", "LEVER"],
    "VALVE": ["VALVE", "VALVES", "COCK", "STOPPER", "TAP"],
    "PIPE": ["PIPE", "TUBE", "TUBES", "HOSE", "DUCT", "CYLINDER"],
    "BLADE": ["BLADE", "BLADES", "KNIFE", "KNIVES", "CUTTER", "CUTTERS"],
    "COVER": ["COVER", "COVERS", "LID", "CAP", "SLEEVE", "SHIELD"],
    "CLOTH": ["CLOTH", "CLOTHES", "FABRIC", "TEXTILE", "RAG", "RAGS"],
    "GLOVE": ["GLOVE", "GLOVES", "MITT", "MITTS", "HANDGUARD"],
    "WIRE": ["WIRE", "CABLE", "CORD", "ROPE", "LINE"],
    "HELMET": ["HELMET", "CAP", "HEADGEAR", "HAT", "HARDHAT"],
    "GOGGLES": ["GOGGLES", "GLASSES", "SHADES", "VISOR", "PROTECTIVE EYEWEAR"],
    "BATTERY": ["BATTERY", "BATTERIES", "CELL", "ACCUMULATOR"],
    "PAINT": ["PAINT", "COATING", "SPRAY", "VARNISH", "ENAMEL"],
    "BAG": ["BAG", "BAGS", "SACK", "SACKS", "POUCH", "PACK"],
    "LABEL": ["LABEL", "TAG", "STICKER", "DECAL"],
    "CLEANER": ["CLEANER", "DETERGENT", "SOAP", "SANITIZER", "DISINFECTANT"],
    "FAN": ["FAN", "FANS", "VENTILATOR", "BLOWER"],
    "LIGHT": ["LIGHT", "LAMP", "LED", "BULB", "FIXTURE"],
    "PUMP": ["PUMP", "PUMPS", "DISPENSER", "INJECTOR"],
        }
        self.tfidf = self._learn_tfidf()

    def _learn_units(self):
        units = defaultdict(int)
        specs = self.df['L5 NAME (SPEC)'].dropna().astype(str)
        for spec in specs:
            for match in re.findall(r'(\d+)\s*([A-Z]{1,5})', spec.upper()):
                units[match[1]] += 1
        return dict(units)

    def _learn_synonyms(self):
        synonyms = {}
        for col in ['L1 NAME', 'L2 NAME', 'L3 NAME', 'L4 NAME']:
            if col in self.df.columns:
                for val in self.df[col].dropna().unique():
                    if '/' in val:
                        terms = [t.strip().upper() for t in val.split('/')]
                        for t in terms:
                            synonyms[t] = terms
        return synonyms

    def _learn_tfidf(self):
        texts = self.df[['L1 NAME', 'L2 NAME', 'L3 NAME', 'L4 NAME']].fillna('').agg(' '.join, axis=1)
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(texts)
        return dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).A1))

    def normalize_query(self, query: str):
        query = query.upper()
        query = re.sub(r"[^A-Z0-9\s/]", " ", query)  # 특수문자 제거
        for unit in self.unit_dict:
            query = re.sub(rf"(\d+)\s*{unit}", rf"\1{unit}", query)
        return query.strip()

    def expand_query(self, query: str):
        words = set()
        for token in query.split():
            token = token.strip()
            if len(token) < 2 or token in self.stop_words:
                continue
            words.add(token)
            if token in self.synonyms:
                words.update(self.synonyms[token])
            if token in self.manual_synonyms:
                words.update(self.manual_synonyms[token])
            if '/' in token:
                parts = token.split('/')
                words.update(parts)
                words.add(' '.join(parts))
        return ' '.join(sorted(words))

    def score_query(self, query: str):
        words = query.split()
        return sum(self.tfidf.get(w.lower(), 0) for w in words)

    def preprocess(self, query: str):
        norm = self.normalize_query(query)
        expanded = self.expand_query(norm)
        score = self.score_query(expanded)
        return {
            'original': query,
            'normalized': norm,
            'expanded': expanded,
            'tfidf_score': round(score, 4)
        }

    # ✅ 새로운 함수 추가: normalized만 반환
    def preprocess_normalized(self, query: str):
        return self.normalize_query(query)

# 사용 예시
preprocessor_plus = SimplePreprocessor(df)
result = preprocessor_plus.preprocess("3 Boxes BTL Stand")
print(result)


{'original': '3 Boxes BTL Stand', 'normalized': '3BOXES BTL STAND', 'expanded': '3BOXES BTL STAND', 'tfidf_score': np.float64(21.2587)}


In [None]:
# 전처리기 학습 후 저장
#import pandas as pd

# 엑셀에서 전체 품명 코드 목록을 읽어옴
#df_model = pd.read_excel("C:\\Users\\정하민\\Downloads\\선용품 표준코드 목록 - ALL.xlsx")

# 이제 전처리기 학습 후 저장
#import joblib

#preprocessor = SimplePreprocessor(df_model)
#joblib.dump(preprocessor, "AI_Model_2/preprocessor.pkl")

모델 학습

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb
from sklearn.metrics import classification_report
import re
from collections import defaultdict

# 1. 데이터 불러오기
file_path = "C:\\Users\\정하민\\Downloads\\선용품 표준코드 목록 - ALL.xlsx"
df = pd.read_excel(file_path, sheet_name='ALL')

# 3. 전처리기 인스턴스 생성
preprocessor = SimplePreprocessor(df)

# 4. 필수 컬럼 제거
required_cols = ['L1 NAME', 'L2 NAME', 'L3 NAME', 'L4 NAME', 'L5 NAME (SPEC)']
df = df.dropna(subset=required_cols)

# 5. 입력 텍스트 생성 및 전처리 적용
df["ITEM_NAME"] = (
    df["L3 NAME"].astype(str) + " " +
    df["L4 NAME"].astype(str) + " " +
    df["L5 NAME (SPEC)"].astype(str)
).str.upper()


# normalized만 저장
df["ITEM_NAME"] = df["ITEM_NAME"].apply(preprocessor.preprocess_normalized)

# ✅ 5-2. 🔍 SEARCH_NAME 생성 및 전처리 (★ 이게 빠졌을 가능성 높음)
df["SEARCH_NAME"] = (
    df["L1 NAME"].astype(str) + " " +
    df["L2 NAME"].astype(str) + " " +
    df["L3 NAME"].astype(str) + " " +
    df["L4 NAME"].astype(str) + " " +
    df["L5 NAME (SPEC)"].astype(str)
).str.upper()

df["SEARCH_NAME"] = df["SEARCH_NAME"].apply(preprocessor.preprocess_normalized)

# 6. 중복 제거
df_model = df[['SEARCH_NAME','ITEM_NAME', 'P CODE'] + required_cols].drop_duplicates()

# 7. 출력값 라벨 인코딩
label_encoders = {}
y_encoded = {}
for col in required_cols:
    le = LabelEncoder()
    y_encoded[col] = le.fit_transform(df_model[col].astype(str))
    label_encoders[col] = le

y_df = pd.DataFrame(y_encoded)

# 8. 학습/검증 분할
X_train, X_test, y_train_df, y_test_df = train_test_split(
    df_model["ITEM_NAME"],
    y_df,
    test_size=0.2,
    random_state=42
)

X_train, X_test, y_train_df, y_test_df = train_test_split(
    df_model["SEARCH_NAME"],  # 변경된 부분
    y_df,
    test_size=0.2,
    random_state=42
)

# 9. TF-IDF 벡터화
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=10000,
    min_df=1,
    max_df=0.8,
    stop_words='english'
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

vectorizer_search = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=6000,
    min_df=2,
    max_df=0.85,
    stop_words='english'
)

vectorizer_search.fit(df_model["SEARCH_NAME"])

# 10. 계층적 모델 학습 및 평가

hierarchical_models = {}
hierarchical_label_encoders = {}

for idx, col in enumerate(required_cols):
    print(f"🔧 계층적 학습 중: {col}")

    # 🔁 입력 텍스트 구성
    if idx == 0:
        X_input = X_train
        X_test_input = X_test
    else:
        prev_cols = required_cols[:idx]
        X_input = X_train.copy()
        X_test_input = X_test.copy()

        for p_col in prev_cols:
            labels_train = label_encoders[p_col].inverse_transform(y_train_df[p_col])
            labels_test = label_encoders[p_col].inverse_transform(y_test_df[p_col])
            X_input += " " + pd.Series(labels_train, index=X_input.index)
            X_test_input += " " + pd.Series(labels_test, index=X_test_input.index)

    # ✅ L3 NAME만 클래스 필터링 적용
    if col == "L3 NAME":
        min_samples = 5
        vc = y_train_df[col].value_counts()
        valid_labels = vc[vc >= min_samples].index

        valid_idx = y_train_df[col].isin(valid_labels)
    
     # ✅ 인덱스 안 맞는 문제 해결!
        X_input = X_input[valid_idx.values]
        y_col_filtered = y_train_df[col][valid_idx.values]
    else:
        y_col_filtered = y_train_df[col]


    # 💡 벡터화
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=6000,
        min_df=2,
        max_df=0.85,
        stop_words='english'
    )
    X_train_vec = vectorizer.fit_transform(X_input).astype(np.float32)
    X_test_vec = vectorizer.transform(X_test_input).astype(np.float32)

    # ⚙️ 모델 선택
    if col == "L1 NAME":
        model = lgb.LGBMClassifier(class_weight="balanced", n_estimators=150)
    elif col == "L2 NAME":
        model = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    elif col == "L3 NAME":
        model = SGDClassifier(loss="log_loss", class_weight="balanced", max_iter=1000, tol=1e-3)
    elif col == "L4 NAME":
        model = LogisticRegression(solver="saga", max_iter=300, class_weight="balanced")
    elif col == "L5 NAME (SPEC)":
        continue  # L5는 추천 기반 처리
    else:
        raise ValueError(f"알 수 없는 단계: {col}")

    # 📊 학습
    model.fit(X_train_vec, y_col_filtered)
    y_pred = model.predict(X_test_vec)

    print(f"\n===== {col} 예측 성능 =====")
    print(classification_report(y_test_df[col], y_pred, zero_division=0))

    # 저장
    hierarchical_models[col] = (model, vectorizer)
    hierarchical_label_encoders[col] = label_encoders[col]

# TF-IDF 사전 학습 (SEARCH_NAME 기준)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=6000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_model["SEARCH_NAME"])

# 보정용 조합 key
df_model["COMBO_KEY"] = (
    df_model["L1 NAME"] + " | " +
    df_model["L2 NAME"] + " | " +
    df_model["L3 NAME"] + " | " +
    df_model["L4 NAME"]
)
valid_combinations = set(df_model["COMBO_KEY"].unique())

# 예측 조합 보정 함수
def correct_prediction(l1, l2, l3, l4, search_input):
    original_key = f"{l1} | {l2} | {l3} | {l4}"

    if original_key in valid_combinations:
        return {
            "L1 NAME": l1, "L2 NAME": l2, "L3 NAME": l3, "L4 NAME": l4,
            "CORRECTED": False, "MATCHED_KEY": original_key
        }

    input_vec = tfidf_vectorizer.transform([search_input.upper()])
    sims = cosine_similarity(input_vec, tfidf_matrix).flatten()
    best_idx = sims.argmax()
    row = df_model.iloc[best_idx]

    return {
        "L1 NAME": row["L1 NAME"],
        "L2 NAME": row["L2 NAME"],
        "L3 NAME": row["L3 NAME"],
        "L4 NAME": row["L4 NAME"],
        "CORRECTED": True,
        "MATCHED_KEY": row["COMBO_KEY"],
        "SIMILARITY": round(sims[best_idx], 3)
    }



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ITEM_NAME"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ITEM_NAME"] = df["ITEM_NAME"].apply(preprocessor.preprocess_normalized)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["SEARCH_NAME"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

🔧 계층적 학습 중: L1 NAME
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.148774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 145845
[LightGBM] [Info] Number of data points in the train set: 36296, number of used features: 5608
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from score -3.610918
[LightGBM] [Info] Start training from s




===== L1 NAME 예측 성능 =====
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        85
           2       1.00      1.00      1.00         7
           3       1.00      1.00      1.00       106
           4       1.00      1.00      1.00        38
           5       1.00      1.00      1.00       204
           6       1.00      1.00      1.00       441
           7       1.00      1.00      1.00       526
           8       1.00      1.00      1.00        96
           9       1.00      1.00      1.00       417
          10       1.00      1.00      1.00       896
          11       1.00      1.00      1.00        47
          12       1.00      1.00      1.00       229
          13       1.00      1.00      1.00        14
          14       1.00      1.00      1.00       203
          15       1.00      1.00      1.00        45
          16       1.00      1.00      1.00        33


  ys_types = set(type_of_target(x) for x in ys)



===== L3 NAME 예측 성능 =====
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           5       1.00      1.00      1.00         4
           6       0.00      0.00      0.00         1
           7       0.33      1.00      0.50         1
           9       0.08      1.00      0.15         1
          10       1.00      1.00      1.00         9
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       1.00      1.00      1.00         2
          17       0.00      0.00      0.00         0
          18       0.00      0.00      0.00         2
          21       0.92      1.00      0.96        12
          22       0.00      0.00      0.00         2
          23       0.00      0.00      0.00         2
          25       0.00      0.00      0.00         2


  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)


              precision    recall  f1-score   support

           8       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         0
          19       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         0
          26       0.00      0.00      0.00         1
          29       0.00      0.00      0.00         0
          36       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          45       0.00      0.00      0.00         0
          47       0.00      0.00      0.00         1
          51       0.00    

In [20]:
# 11. 예측 함수 정의 (전처리기 포함)
def predict_item_name(raw_name, verbose=True):
    raw_name = raw_name.upper()
    normalized = preprocessor.preprocess_normalized(raw_name)

    context = normalized
    prediction = {}

    for col in required_cols[:-1]:  # L1~L4만 분류기로 예측
        model, vec = hierarchical_models[col]
        input_vec = vec.transform([context])
        label_id = model.predict(input_vec)[0]
        label = hierarchical_label_encoders[col].inverse_transform([label_id])[0]
        prediction[col] = label
        context = f"{context} {label}"  # 누적

    # 🎯 L5는 cosine similarity 기반으로 예측
    input_vec = vectorizer.transform([normalized])
    candidate_vecs = vectorizer.transform(df_model["ITEM_NAME"])
    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()

    top_idx = similarities.argmax()
    predicted_l5 = df_model.iloc[top_idx]["L5 NAME (SPEC)"]
    prediction["L5 NAME (SPEC)"] = predicted_l5

    # 🎯 L1~L4 조합 유효성 보정
    correction = correct_prediction(
        prediction["L1 NAME"],
        prediction["L2 NAME"],
        prediction["L3 NAME"],
        prediction["L4 NAME"],
        search_input=raw_name  # 입력 텍스트
    )

    # 보정된 결과 반영
    if correction["CORRECTED"]:
        print(f"\n⚠️ 예측된 L1~L4 조합이 존재하지 않아 보정되었습니다 (유사도: {correction['SIMILARITY']})")
        for level in ["L1 NAME", "L2 NAME", "L3 NAME", "L4 NAME"]:
            prediction[level] = correction[level]

    print(f"🔍 L5는 분류 대신 유사도 기반 추천으로 예측되었습니다 (유사도: {round(similarities[top_idx], 3)})")

    return prediction

모델 저장

In [21]:
import os
import joblib

# 📁 새 폴더 생성
save_dir = "AI_Model_2"
os.makedirs(save_dir, exist_ok=True)

# ✅ 1. 계층별 모델 및 벡터라이저 저장
for level in hierarchical_models:
    model, vectorizer = hierarchical_models[level]
    joblib.dump(model, f"{save_dir}/{level}_model.pkl")
    joblib.dump(vectorizer, f"{save_dir}/{level}_vectorizer.pkl")

# ✅ 2. 라벨 인코더 저장
for level, encoder in hierarchical_label_encoders.items():
    joblib.dump(encoder, f"{save_dir}/{level}_label_encoder.pkl")

# ✅ 3. TF-IDF 벡터라이저 (SEARCH_NAME 기반 유사도 검색용)
joblib.dump(tfidf_vectorizer, f"{save_dir}/searchname_vectorizer.pkl")

# ✅ 4. 전체 품목 데이터 (보정 및 추천 기반)
df_model.to_pickle(f"{save_dir}/df_model.pkl")
joblib.dump(valid_combinations, f"{save_dir}/valid_combinations.pkl")

print(f"✅ 모든 모델과 데이터가 '{save_dir}/' 폴더에 저장되었습니다.")

✅ 모든 모델과 데이터가 'AI_Model_2/' 폴더에 저장되었습니다.


# 모델 불러오기 예시
import os
import joblib
import pandas as pd

# 저장된 경로
load_dir = "AI_Model_2"

# 🔁 불러올 계층 컬럼 정의 (학습 시 사용한 것과 동일해야 함)
required_cols = ['L1 NAME', 'L2 NAME', 'L3 NAME', 'L4 NAME']

# ✅ 1. 계층별 모델 및 벡터라이저 불러오기
hierarchical_models = {}
for col in required_cols:
    model = joblib.load(f"{load_dir}/{col}_model.pkl")
    vectorizer = joblib.load(f"{load_dir}/{col}_vectorizer.pkl")
    hierarchical_models[col] = (model, vectorizer)

# ✅ 2. 라벨 인코더 불러오기
hierarchical_label_encoders = {}
for col in required_cols:
    encoder = joblib.load(f"{load_dir}/{col}_label_encoder.pkl")
    hierarchical_label_encoders[col] = encoder

# ✅ 3. 유사도 기반 TF-IDF 벡터라이저
tfidf_vectorizer = joblib.load(f"{load_dir}/searchname_vectorizer.pkl")

# ✅ 4. 품목 전체 데이터 및 조합
df_model = pd.read_pickle(f"{load_dir}/df_model.pkl")
valid_combinations = joblib.load(f"{load_dir}/valid_combinations.pkl")

print("✅ 모든 모델, 벡터라이저, 라벨 인코더, 데이터 불러오기 완료!")



P CODE 및 유사 품목 추천

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches
import numpy as np
import pandas as pd

# 🔧 오타 보정 + 후보 선택 + 유사도 점수 표시
def autocorrect_input(user_input, top_n=3, cutoff=0.6):
    input_upper = user_input.upper()
    candidates = df_model["ITEM_NAME"].unique()

    # 1차 후보: 문자 기반 유사 품명
    close = get_close_matches(input_upper, candidates, n=top_n, cutoff=cutoff)
    if not close:
        return input_upper  # 후보 없음 → 원본 유지

    # TF-IDF 유사도 계산
    preprocessed_input = preprocessor.preprocess(input_upper)["normalized"]
    input_vec = vectorizer.transform([preprocessed_input])
    candidate_vecs = vectorizer.transform(close)
    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()

    # 후보 리스트 정리
    scored_candidates = list(zip(close, similarities))
    scored_candidates.sort(key=lambda x: x[1], reverse=True)

    print("\n🔍 입력하신 품명과 유사한 후보를 찾았어요:")
    for i, (text, score) in enumerate(scored_candidates, 1):
        print(f"  {i}. {text}  (유사도: {round(score, 3)})")

    # 사용자 선택
    try:
        selection = int(input("👉 어떤 후보가 맞나요? (번호 입력, 0 = 원래 입력 유지): "))
        if selection in range(1, len(scored_candidates) + 1):
            return scored_candidates[selection - 1][0]
        else:
            return input_upper
    except ValueError:
        return input_upper


# 🔍 유사 품목 추천
def recommend_similar_items(input_name, top_n=5):
    preprocessed = preprocessor.preprocess(input_name.upper())["normalized"]
    input_vec = vectorizer.transform([preprocessed])

    # L1~L4 구조가 동일한 것만 비교 대상으로 제한
    predicted = predict_item_name(input_name)
    l1 = predicted["L1 NAME"]
    l2 = predicted["L2 NAME"]
    l3 = predicted["L3 NAME"]
    l4 = predicted["L4 NAME"]

    # 후보 필터링
    filtered_df = df_model[
        (df_model["L1 NAME"] == l1) &
        (df_model["L2 NAME"] == l2) &
        (df_model["L3 NAME"] == l3) &
        (df_model["L4 NAME"] == l4)
    ]

    if filtered_df.empty:
        return []

    candidate_vecs = vectorizer.transform(filtered_df["SEARCH_NAME"])
    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()

    top_indices = similarities.argsort()[::-1]
    recommendations = []

    for idx in top_indices:
        if similarities[idx] <= 0:
            continue

        row = filtered_df.iloc[idx]
        rec = {
            "SIMILAR_ITEM_NAME": row["ITEM_NAME"],
            "SIMILARITY_SCORE": round(similarities[idx], 3),
            "L1 NAME": row["L1 NAME"],
            "L2 NAME": row["L2 NAME"],
            "L3 NAME": row["L3 NAME"],
            "L4 NAME": row["L4 NAME"],
            "L5 NAME (SPEC)": row["L5 NAME (SPEC)"],
            "P CODE": row["P CODE"]
        }
        recommendations.append(rec)
        if len(recommendations) >= top_n:
            break

    return recommendations


def generate_structured_pcode_based_on_similar(similar_items, fallback_predicted):
    if not similar_items:
        return generate_structured_pcode(fallback_predicted)
    
    # 가장 유사한 기존 품목의 P CODE 접두부 (앞 8자리)
    closest_code = similar_items[0]['P CODE']
    prefix = closest_code[:8]

    # 중복 방지
    existing_codes = df["P CODE"].dropna().astype(str)
    used_suffixes = [
        int(code[-3:]) for code in existing_codes
        if code.startswith(prefix) and code[-3:].isdigit()
    ]

    next_suffix = max(used_suffixes, default=0) + 1
    return f"{prefix}{str(next_suffix).zfill(3)}"

def generate_structured_pcode(predicted):
    # 정확한 키 이름으로 추출
    l1 = predicted.get("L1 NAME", "")[:2].upper().ljust(2, "X")
    l2 = predicted.get("L2 NAME", "")[:2].upper().ljust(2, "X")
    l3 = predicted.get("L3 NAME", "")[:2].upper().ljust(2, "X")
    l4 = predicted.get("L4 NAME", "")[:2].upper().ljust(2, "X")

    prefix = f"{l1}{l2}{l3}{l4}"

    existing_codes = df["P CODE"].dropna().astype(str)
    used_suffixes = [
        int(code[-3:]) for code in existing_codes
        if code.startswith(prefix) and code[-3:].isdigit()
    ]

    next_suffix = max(used_suffixes, default=0) + 1
    full_suffix = str(next_suffix).zfill(3)  # 항상 3자리 ("001" 등)

    trimmed_suffix = full_suffix[1:]

    new_pcode = f"{prefix}{trimmed_suffix}"  # 8 + 2 = 10자리

    if len(new_pcode) != 10:
        raise ValueError(f"❌ 생성된 P CODE 길이 오류: {new_pcode} (길이 {len(new_pcode)})")

    return new_pcode

def recommend_similar_pcodes_detailed(new_pcode, top_n=5):
    existing = df.dropna(subset=["P CODE"])
    existing_codes = existing["P CODE"].astype(str).unique()

    # 문자 유사도 기반 후보 추출
    similar_codes = get_close_matches(new_pcode, existing_codes, n=top_n * 5, cutoff=0.5)  # 더 많이 뽑고 필터링

    # 기준 분류 추출 (new_pcode 기준)
    base_row = {
        "L1": new_pcode[0:2],
        "L2": new_pcode[2:4],
        "L3": new_pcode[4:6],
        "L4": new_pcode[6:8],
    }

    # 상세 정보 구성 (분류 구조가 동일한 것만)
    results = []
    for code in similar_codes:
        row = existing[existing["P CODE"] == code].iloc[0]
        if (
            row.get("L1 NAME", "")[:2].upper() == base_row["L1"] and
            row.get("L2 NAME", "")[:2].upper() == base_row["L2"] and
            row.get("L3 NAME", "")[:2].upper() == base_row["L3"] and
            row.get("L4 NAME", "")[:2].upper() == base_row["L4"]
        ):
            results.append({
                "P CODE": code,
                "ITEM_NAME": row.get("ITEM_NAME") or row.get("L5 NAME (SPEC)", "N/A"),
                "L1": row.get("L1 NAME", ""),
                "L2": row.get("L2 NAME", ""),
                "L3": row.get("L3 NAME", ""),
                "L4": row.get("L4 NAME", ""),
                "L5": row.get("L5 NAME (SPEC)", "")
            })
        if len(results) >= top_n:
            break

    return pd.DataFrame(results)

# 🔹 여기가 함수 바깥입니다 (맨 왼쪽)

def is_valid_combination(predicted: dict, df_ref: pd.DataFrame) -> bool:
    l1, l2, l3, l4, l5 = (
        predicted["L1 NAME"],
        predicted["L2 NAME"],
        predicted["L3 NAME"],
        predicted["L4 NAME"],
        predicted["L5 NAME (SPEC)"],
    )
    return not df_ref[
        (df_ref["L1 NAME"] == l1)
        & (df_ref["L2 NAME"] == l2)
        & (df_ref["L3 NAME"] == l3)
        & (df_ref["L4 NAME"] == l4)
        & (df_ref["L5 NAME (SPEC)"] == l5)
    ].empty


def suggest_similar_combination(predicted: dict, df_ref: pd.DataFrame, top_n=5):
    query = " ".join([
        predicted.get("L1 NAME", ""),
        predicted.get("L2 NAME", ""),
        predicted.get("L3 NAME", ""),
        predicted.get("L4 NAME", ""),
        predicted.get("L5 NAME (SPEC)", "")
    ])
    input_vec = vectorizer.transform([preprocessor.preprocess_normalized(query)])
    candidate_vecs = vectorizer.transform(df_ref["SEARCH_NAME"])
    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()
    top_indices = similarities.argsort()[::-1]
    results = []
    for idx in top_indices:
        row = df_ref.iloc[idx]
        results.append({
            "P CODE": row["P CODE"],
            "SEARCH_NAME": row["SEARCH_NAME"],
            "SIMILARITY_SCORE": round(similarities[idx], 3),
        })
        if len(results) >= top_n:
            break
    return pd.DataFrame(results)

def fallback_by_l1_l3(predicted, input_name, top_n=5, similarity_threshold=0.3):
    l1 = predicted.get("L1 NAME", "")
    l2 = predicted.get("L2 NAME", "")
    l3 = predicted.get("L3 NAME", "")

    # 후보군 필터링
    candidates = df_model[
        (df_model["L1 NAME"] == l1) &
        (df_model["L2 NAME"] == l2) &
        (df_model["L3 NAME"] == l3)
    ]

    if candidates.empty:
        return []

    # 입력 벡터 (SEARCH_NAME 기준 전처리)
    input_query = preprocessor.preprocess_normalized(input_name)
    input_vec = vectorizer.transform([input_query])
    candidate_vecs = vectorizer.transform(candidates["SEARCH_NAME"])

    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()
    top_indices = similarities.argsort()[::-1]

    results = []
    for idx in top_indices:
        sim = similarities[idx]
        if sim < similarity_threshold:
            continue  # 의미 없는 추천은 제외

        row = candidates.iloc[idx]
        results.append({
            "SIMILAR_ITEM_NAME": row["SEARCH_NAME"],
            "SIMILARITY_SCORE": round(sim, 3),
            "P CODE": row["P CODE"],
            "L5 NAME (SPEC)": row["L5 NAME (SPEC)"]
        })

        if len(results) >= top_n:
            break

    return results

def recommend_by_similar_l1_l2_l3(input_name, predicted, top_n=5):
    l1, l2, l3 = predicted["L1 NAME"], predicted["L2 NAME"], predicted["L3 NAME"]

    # 분류 조건에 맞는 후보 추출
    candidates = df_model[
        (df_model["L1 NAME"] == l1) &
        (df_model["L2 NAME"] == l2) &
        (df_model["L3 NAME"] == l3)
    ]

    if candidates.empty:
        return []

    input_vec = vectorizer.transform([preprocessor.preprocess(input_name)["normalized"]])
    candidate_vecs = vectorizer.transform(candidates["SEARCH_NAME"])
    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()
    top_indices = similarities.argsort()[::-1]

    results = []
    for idx in top_indices:
        row = candidates.iloc[idx]
        results.append({
            "SIMILAR_ITEM_NAME": row["SEARCH_NAME"],
            "SIMILARITY_SCORE": round(similarities[idx], 3),
            "P CODE": row["P CODE"],
            "L5 NAME (SPEC)": row["L5 NAME (SPEC)"]
        })
        if len(results) >= top_n:
            break

    return results

def recommend_global_similar_items(input_name, top_n=5):
    query = preprocessor.preprocess(input_name)["normalized"]
    input_vec = vectorizer.transform([query])
    candidate_vecs = vectorizer.transform(df_model["SEARCH_NAME"])

    similarities = cosine_similarity(input_vec, candidate_vecs).flatten()
    top_indices = similarities.argsort()[::-1]

    results = []
    for idx in top_indices:
        row = df_model.iloc[idx]
        results.append({
            "SIMILAR_ITEM_NAME": row["SEARCH_NAME"],
            "SIMILARITY_SCORE": round(similarities[idx], 3),
            "P CODE": row["P CODE"],
            "L1 NAME": row["L1 NAME"],
            "L2 NAME": row["L2 NAME"],
            "L3 NAME": row["L3 NAME"],
            "L4 NAME": row["L4 NAME"],
            "L5 NAME (SPEC)": row["L5 NAME (SPEC)"],
        })
        if len(results) >= top_n:
            break

    return results

def keyword_fallback_items(input_text, top_n=5):
    words = input_text.upper().split()

    # Step 1: 모든 단어 포함 (AND)
    matches_all = df_model.copy()
    for word in words:
        matches_all = matches_all[matches_all["SEARCH_NAME"].str.contains(word, na=False)]

    if not matches_all.empty:
        return format_keyword_results(matches_all, top_n, mode="AND")

    # Step 2: 일부 단어 포함 (OR)
    pattern = "|".join(words)
    matches_any = df_model[df_model["SEARCH_NAME"].str.contains(pattern, na=False)]

    if not matches_any.empty:
        return format_keyword_results(matches_any, top_n, mode="OR")

    return []

def format_keyword_results(df_matches, top_n=5, mode=""):
    results = []
    for _, row in df_matches.head(top_n).iterrows():
        results.append({
            "P CODE": row["P CODE"],
            "SEARCH_NAME": row["SEARCH_NAME"],
            "L1 NAME": row["L1 NAME"],
            "L2 NAME": row["L2 NAME"],
            "L3 NAME": row["L3 NAME"],
            "L4 NAME": row["L4 NAME"],
            "L5 NAME (SPEC)": row["L5 NAME (SPEC)"],
            "MATCH_MODE": mode
        })
    return results

def extract_main_keyword(text, common_words=["ELECTRIC", "PRODUCT", "SET"]):
    words = text.upper().split()
    keywords = [w for w in words if w not in common_words]
    return keywords[0] if keywords else text


함수들 전체 실행

In [52]:
def interactive_product_recommendation():
    user_input = input("🔍 품명을 입력하세요: ").strip()
    if not user_input:
        print("❗ 입력이 비어 있습니다.")
        return

    # 1. 오타 보정
    corrected = autocorrect_input(user_input)
    if corrected != user_input.upper():
        print(f"🔧 오타 보정: '{user_input}' → '{corrected}'")
    else:
        print(f"✅ 입력 인식됨: '{corrected}'")

    # 2. 품명 예측
    predicted = predict_item_name(corrected)

    # 3. 예측 신뢰도 평가 함수
    def is_prediction_reasonable(predicted, user_input, threshold=0.3):
        pred_string = f"{predicted.get('L1 NAME', '')} {predicted.get('L2 NAME', '')} {predicted.get('L3 NAME', '')} {predicted.get('L4 NAME', '')} {predicted.get('L5 NAME (SPEC)', '')}"
        user_vec = vectorizer.transform([user_input.upper()])
        pred_vec = vectorizer.transform([pred_string.upper()])
        sim = cosine_similarity(user_vec, pred_vec)[0][0]
        if len(user_input.strip().split()) <= 1:
            print("⚠️ 입력이 너무 짧습니다. 신뢰도 판단 없이 fallback 진행될 수 있습니다.")
            return False
        print(f"📊 예측 신뢰도 (입력 vs 예측): {round(sim * 100, 1)}%")
        return sim >= threshold

    # 4. 전역 유사 품목 추천
    global_recommendations = recommend_global_similar_items(corrected)
    if global_recommendations and global_recommendations[0]["SIMILARITY_SCORE"] >= 0.4:
        print("\n🔎 전역 유사 품목 Top 5:")
        for rec in global_recommendations:
            print(f"[{rec['SIMILARITY_SCORE']}] {rec['SIMILAR_ITEM_NAME']} → {rec['P CODE']}")
    else:
        print("📭 전역 유사 품목 부족. fallback을 시도합니다.")

    # 5. 예측 신뢰도 낮으면 fallback
    if not is_prediction_reasonable(predicted, corrected):
        print("⚠️ 예측된 분류가 입력과 너무 다릅니다. fallback 중...")
        similar_items = recommend_similar_items(corrected)

        if similar_items:
            best_fallback = similar_items[0]["SIMILAR_ITEM_NAME"]
            predicted = predict_item_name(best_fallback, verbose=False)
            print(f"🔁 fallback 예측된 품명: '{best_fallback}' → 재예측됨")
        else:
            print("📭 유사한 품명이 없음. L1~L3 기반 fallback 시도 중...")
            l3_fallbacks = fallback_by_l1_l3(predicted, corrected)
            if l3_fallbacks:
                best_fallback = l3_fallbacks[0]["SIMILAR_ITEM_NAME"]
                predicted = predict_item_name(best_fallback, verbose=False)
                print(f"🔁 L1~L3 기반 fallback 예측된 품명: '{best_fallback}' → 재예측됨")
            else:
                print("❌ 모든 fallback 실패")

    # 6. 최종 예측 분류 출력
    print("\n🎯 예측된 분류:")
    for k, v in predicted.items():
        print(f"- {k}: {v}")

    # 7. 정합성 체크
    if not is_valid_combination(predicted, df_model):
        print("❌ 예측된 L1~L5 조합이 실제 존재하지 않습니다.")
        print("🔁 유사한 L1~L5 조합 추천:")
        display(suggest_similar_combination(predicted, df_model))

    # 8. 기존 P CODE 확인 또는 신규 생성
    matched = df[
        (df["L1 NAME"] == predicted["L1 NAME"]) &
        (df["L2 NAME"] == predicted["L2 NAME"]) &
        (df["L3 NAME"] == predicted["L3 NAME"]) &
        (df["L4 NAME"] == predicted["L4 NAME"]) &
        (df["L5 NAME (SPEC)"] == predicted["L5 NAME (SPEC)"])
    ]

    if not matched.empty:
        existing_pcode = matched["P CODE"].iloc[0]
        print(f"- 📦 추천 P CODE (기존): {existing_pcode}")
    else:
        # global_recommendations 우선 사용, 없으면 similar_items 대체
        base_items = global_recommendations or recommend_similar_items(corrected)
        new_pcode = generate_structured_pcode_based_on_similar(base_items, predicted)
        print(f"- 🆕 추천 P CODE (신규): {new_pcode}")
        similar_details_df = recommend_similar_pcodes_detailed(new_pcode)
        if not similar_details_df.empty:
            print("\n📋 유사한 기존 P CODE 및 품명:")
            display(similar_details_df)
        else:
            print("📭 유사한 기존 P CODE 없음")

    # 9. TF-IDF 유사 품목 추천
    recommendations = recommend_similar_items(corrected)
    if recommendations and any(rec['SIMILARITY_SCORE'] > 0 for rec in recommendations):
        print("\n🔎 유사한 품목 Top 5:")
        for rec in recommendations:
            print(f"[{rec['SIMILARITY_SCORE']}] {rec['SIMILAR_ITEM_NAME']} → {rec['P CODE']}")
    else:
        print("📭 TF-IDF 기반 추천 실패. 전역 검색 시도 중...")
        recommendations = global_recommendations or []
        if not recommendations:
            print("📭 전역 유사 품목도 없음. 키워드 포함 검색 시도 중...")
            keyword_matches = keyword_fallback_items(corrected)
            if keyword_matches:
                print("\n🔎 단어 포함 품목 추천:")
                for item in keyword_matches:
                    print(f"🔹 [{item.get('MATCH_MODE', '키워드')}] {item['SEARCH_NAME']} → {item['P CODE']}")
            else:
                print("❌ 단어 포함 결과도 없음.")


보기 편하게 수정하는 함수들

In [55]:
def print_predicted_hierarchy(predicted):
    print("\n🎯 예측된 분류:")
    for key, label in [
        ("L1 NAME", "📁 L1 (대분류)"),
        ("L2 NAME", "📂 L2 (중분류)"),
        ("L3 NAME", "📄 L3 (소분류)"),
        ("L4 NAME", "🔹 L4 (세분류)"),
        ("L5 NAME (SPEC)", "🧷 L5 (상세/스펙)")
    ]:
        print(f"{label}: {predicted.get(key, '❓')}")


def print_similar_items(items, title="🔎 유사한 품목"):
    if not items:
        print("📭 유사 품목 없음")
        return
    print(f"\n{title}:")
    for rec in items:
        print(f"\n▶ 품명: {rec['SIMILAR_ITEM_NAME']} (유사도: {round(rec['SIMILARITY_SCORE'] * 100, 1)}%)")
        print(f"   🔹 P CODE: {rec['P CODE']}")
        for key, label in [
            ("L1 NAME", "L1"), ("L2 NAME", "L2"), ("L3 NAME", "L3"),
            ("L4 NAME", "L4"), ("L5 NAME (SPEC)", "L5")
        ]:
            print(f"   - {label}: {rec.get(key, '')}")


def print_keyword_matches(matches):
    if not matches:
        print("📭 단어 포함 결과 없음")
        return
    print("\n🔎 단어 포함 품목 추천:")
    for item in matches:
        print(f"\n🔹 품명: {item['SEARCH_NAME']} (매치 방식: {item.get('MATCH_MODE', '단어 포함')})")
        print(f"   🔹 P CODE: {item['P CODE']}")
        for key, label in [
            ("L1 NAME", "L1"), ("L2 NAME", "L2"), ("L3 NAME", "L3"),
            ("L4 NAME", "L4"), ("L5 NAME (SPEC)", "L5")
        ]:
            print(f"   - {label}: {item.get(key, '')}")


def interactive_product_recommendation():
    user_input = input("🔍 품명을 입력하세요: ").strip()
    if not user_input:
        print("❗ 입력이 비어 있습니다.")
        return

    corrected = autocorrect_input(user_input)
    if corrected != user_input.upper():
        print(f"🔧 오타 보정: '{user_input}' → '{corrected}'")
    else:
        print(f"✅ 입력 인식됨: '{corrected}'")

    predicted = predict_item_name(corrected)

    def is_prediction_reasonable(predicted, user_input, threshold=0.3):
        pred_string = " ".join([
            predicted.get('L1 NAME', ''), predicted.get('L2 NAME', ''),
            predicted.get('L3 NAME', ''), predicted.get('L4 NAME', ''),
            predicted.get('L5 NAME (SPEC)', '')
        ])
        user_vec = vectorizer.transform([user_input.upper()])
        pred_vec = vectorizer.transform([pred_string.upper()])
        sim = cosine_similarity(user_vec, pred_vec)[0][0]
        if len(user_input.strip().split()) <= 1:
            print("⚠️ 입력이 너무 짧습니다. 신뢰도 판단 없이 fallback 진행될 수 있습니다.")
            return False
        print(f"📊 예측 신뢰도 (입력 vs 예측): {round(sim * 100, 1)}%")
        return sim >= threshold

    global_recommendations = recommend_global_similar_items(corrected)
    if global_recommendations and global_recommendations[0]["SIMILARITY_SCORE"] >= 0.4:
     print_similar_items(global_recommendations, "🔎 전역 유사 품목 Top 5")
    else:
        print("📭 전역 유사 품목 부족. fallback을 시도합니다.")

    if not is_prediction_reasonable(predicted, corrected):
        print("⚠️ 예측된 분류가 입력과 너무 다릅니다. fallback 중...")
        similar_items = recommend_similar_items(corrected)

        if similar_items:
            best_fallback = similar_items[0]["SIMILAR_ITEM_NAME"]
            predicted = predict_item_name(best_fallback, verbose=False)
            print(f"🔁 fallback 예측된 품명: '{best_fallback}' → 재예측됨")
        else:
            print("📭 유사한 품명이 없음. L1~L3 기반 fallback 시도 중...")
            l3_fallbacks = fallback_by_l1_l3(predicted, corrected)
            if l3_fallbacks:
                best_fallback = l3_fallbacks[0]["SIMILAR_ITEM_NAME"]
                predicted = predict_item_name(best_fallback, verbose=False)
                print(f"🔁 L1~L3 기반 fallback 예측된 품명: '{best_fallback}' → 재예측됨")
            else:
                print("❌ 모든 fallback 실패")

    # 분류 출력
    print_predicted_hierarchy(predicted)

    if not is_valid_combination(predicted, df_model):
        print("❌ 예측된 L1~L5 조합이 실제 존재하지 않습니다.")
        print("🔁 유사한 L1~L5 조합 추천:")
        display(suggest_similar_combination(predicted, df_model))

    matched = df[
        (df["L1 NAME"] == predicted["L1 NAME"]) &
        (df["L2 NAME"] == predicted["L2 NAME"]) &
        (df["L3 NAME"] == predicted["L3 NAME"]) &
        (df["L4 NAME"] == predicted["L4 NAME"]) &
        (df["L5 NAME (SPEC)"] == predicted["L5 NAME (SPEC)"])
    ]

    if not matched.empty:
        existing_pcode = matched["P CODE"].iloc[0]
        print(f"- 📦 추천 P CODE (기존): {existing_pcode}")
    else:
        base_items = global_recommendations or recommend_similar_items(corrected)
        new_pcode = generate_structured_pcode_based_on_similar(base_items, predicted)
        print(f"- 🆕 추천 P CODE (신규): {new_pcode}")
        similar_details_df = recommend_similar_pcodes_detailed(new_pcode)
        if not similar_details_df.empty:
            print("\n📋 유사한 기존 P CODE 및 품명:")
            display(similar_details_df)
        else:
            print("📭 유사한 기존 P CODE 없음")

    recommendations = recommend_similar_items(corrected)
    if recommendations and any(rec['SIMILARITY_SCORE'] > 0 for rec in recommendations):
        print_similar_items(recommendations, "🔎 유사한 품목 Top 5")
    else:
        print("📭 TF-IDF 기반 추천 실패. 전역 검색 시도 중...")
        recommendations = global_recommendations or []
        if not recommendations:
            print("📭 전역 유사 품목도 없음. 키워드 포함 검색 시도 중...")
            keyword_matches = keyword_fallback_items(corrected)
            print_keyword_matches(keyword_matches)


최종 프로그램 실행

In [58]:
interactive_product_recommendation()

✅ 입력 인식됨: 'HEAT INSULATION PIPE COVER'





⚠️ 예측된 L1~L4 조합이 존재하지 않아 보정되었습니다 (유사도: 0.771)
🔍 L5는 분류 대신 유사도 기반 추천으로 예측되었습니다 (유사도: 0.593)

🔎 전역 유사 품목 Top 5:

▶ 품명: PACKING / JOINTING HEAT INSULATION PIPE COVER PIPE COVERS HEAT INSULATION PIPE HEAT INSULATION ROCK WOOL THICK20MM 32AX1000MM (유사도: 77.1%)
   🔹 P CODE: SMNAP1PC04
   - L1: PACKING / JOINTING
   - L2: HEAT INSULATION PIPE COVER
   - L3: PIPE COVERS HEAT INSULATION
   - L4: PIPE HEAT INSULATION ROCK WOOL
   - L5: THICK20MM 32AX1000MM

▶ 품명: PACKING / JOINTING HEAT INSULATION PIPE COVER PIPE COVERS HEAT INSULATION PIPE HEAT INSULATION ROCK WOOL THICK50MM 400AX1000MM (유사도: 77.1%)
   🔹 P CODE: SMNAP1PC1E
   - L1: PACKING / JOINTING
   - L2: HEAT INSULATION PIPE COVER
   - L3: PIPE COVERS HEAT INSULATION
   - L4: PIPE HEAT INSULATION ROCK WOOL
   - L5: THICK50MM 400AX1000MM

▶ 품명: PACKING / JOINTING HEAT INSULATION PIPE COVER PIPE COVERS HEAT INSULATION PIPE HEAT INSULATION ROCK WOOL THICK40MM 350AX1000MM (유사도: 77.1%)
   🔹 P CODE: SMNAP1PC10
   - L1: PACKING / JOINTING
   - L




⚠️ 예측된 L1~L4 조합이 존재하지 않아 보정되었습니다 (유사도: 0.771)
🔍 L5는 분류 대신 유사도 기반 추천으로 예측되었습니다 (유사도: 0.593)

🔎 유사한 품목 Top 5:

▶ 품명: PIPE COVERS HEAT INSULATION PIPE HEAT INSULATION ROCK WOOL THICK50MM 400AX1000MM (유사도: 77.1%)
   🔹 P CODE: SMNAP1PC1E
   - L1: PACKING / JOINTING
   - L2: HEAT INSULATION PIPE COVER
   - L3: PIPE COVERS HEAT INSULATION
   - L4: PIPE HEAT INSULATION ROCK WOOL
   - L5: THICK50MM 400AX1000MM

▶ 품명: PIPE COVERS HEAT INSULATION PIPE HEAT INSULATION ROCK WOOL THICK50MM 350AX1000MM (유사도: 77.1%)
   🔹 P CODE: SMNAP1PC1D
   - L1: PACKING / JOINTING
   - L2: HEAT INSULATION PIPE COVER
   - L3: PIPE COVERS HEAT INSULATION
   - L4: PIPE HEAT INSULATION ROCK WOOL
   - L5: THICK50MM 350AX1000MM

▶ 품명: PIPE COVERS HEAT INSULATION PIPE HEAT INSULATION ROCK WOOL THICK50MM 300AX1000MM (유사도: 77.1%)
   🔹 P CODE: SMNAP1PC1C
   - L1: PACKING / JOINTING
   - L2: HEAT INSULATION PIPE COVER
   - L3: PIPE COVERS HEAT INSULATION
   - L4: PIPE HEAT INSULATION ROCK WOOL
   - L5: THICK50MM 300AX1000MM
