In [7]:
!pip install datasets



In [8]:
# Improving A_baseline → word + char n-grams, LinearSVC, parameter tuning

In [None]:
"""
Contents:
Load dataset (BANKING77)
Train/Test split
TF-IDF (word 1–2gram) → Logistic Regression
Evaluation (Accuracy = 0.844, Macro-F1 = 0.832)
"""

In [None]:
# Report which interpreter binary and Python release the kernel is running,
# so the results below can be tied to a concrete environment.
import platform
import sys

for tag, info in (("PY:", sys.executable), ("VER:", platform.python_version())):
    print(tag, info)

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
# Load the BANKING77 intent-classification dataset by its dataset id.
DATASET_ID = "PolyAI/banking77"
ds = load_dataset(DATASET_ID)

In [None]:
# Print the loaded dataset object for a quick look at what was returned.
print(ds)

In [None]:
# Pull the raw utterances (X) and their integer intent labels (y) out of the
# "train" split.
# They are materialised as plain Python lists on purpose: later cells index
# them positionally (`X_test[i]`, `y_test[i]`) and compare `y_test` against a
# NumPy prediction array. Newer `datasets` releases may return a lazy column
# object here instead of a list, so the explicit conversion keeps behaviour the
# same across versions.
train_split = ds["train"]
X, y = list(train_split["text"]), list(train_split["label"])

In [None]:
# Hold out 20% of the examples for evaluation. Stratifying on the labels keeps
# the class proportions aligned between the two parts, and the fixed seed makes
# the split repeatable.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [None]:
# Sanity check: (training size, hold-out size, number of distinct labels).
tuple(len(group) for group in (X_train, X_test, set(y)))

In [None]:
#TF-IDF + Logistic Regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Word-level TF-IDF over unigrams and bigrams. Terms seen in fewer than two
# documents are dropped and the vocabulary is capped at 100k entries.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=100_000,
)
# Learn the vocabulary/IDF weights on the training texts only...
Xtr = vec.fit_transform(X_train)
# ...and reuse them unchanged to encode the hold-out texts.
Xte = vec.transform(X_test)

In [None]:
# Baseline classifier: logistic regression on the word TF-IDF features.
# max_iter is raised to 2000 iterations; n_jobs=-1 requests all available cores.
lr_options = {"max_iter": 2000, "n_jobs": -1}
clf = LogisticRegression(**lr_options)
clf.fit(Xtr, y_train)

In [None]:
# Score the baseline on the hold-out part: overall accuracy, macro-averaged F1
# (every class weighted equally), then the per-class report.
pred = clf.predict(Xte)
baseline_acc = accuracy_score(y_test, pred)
baseline_macro_f1 = f1_score(y_test, pred, average="macro")
print("Accuracy:", baseline_acc)
print("Macro-F1:", baseline_macro_f1)
print(classification_report(y_test, pred, digits=4))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Two complementary TF-IDF views of the same text, sharing the same pruning
# settings (min_df=2, at most 150k features each):
#   * word_vec - word unigrams and bigrams
#   * char_vec - character 3- to 5-grams with the "char_wb" analyzer
shared_tfidf_options = dict(min_df=2, max_features=150_000)
word_vec = TfidfVectorizer(ngram_range=(1, 2), **shared_tfidf_options)
char_vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), **shared_tfidf_options)

In [None]:
# Fit each vectorizer on the training texts only, then reuse the fitted
# vocabulary/IDF weights to encode the hold-out texts. The tuple elements are
# evaluated left to right, so fit_transform always runs before transform.
Xtr_word, Xte_word = word_vec.fit_transform(X_train), word_vec.transform(X_test)
Xtr_char, Xte_char = char_vec.fit_transform(X_train), char_vec.transform(X_test)


In [None]:

# Concatenate the word and char feature blocks column-wise, giving one sparse
# matrix per data part.
# Note: this rebinds Xtr/Xte, which previously held the word-only baseline features.
Xtr, Xte = (
    hstack(blocks)
    for blocks in ([Xtr_word, Xtr_char], [Xte_word, Xte_char])
)

In [None]:
# Re-train logistic regression on the combined word+char features and report
# accuracy and macro-F1 on the hold-out part.
clf = LogisticRegression(max_iter=3000, n_jobs=-1).fit(Xtr, y_train)
pred = clf.predict(Xte)
for metric_label, metric_value in (
    ("Acc:", accuracy_score(y_test, pred)),
    ("Macro-F1:", f1_score(y_test, pred, average="macro")),
):
    print(metric_label, metric_value)

In [None]:
# Swap the classifier for LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the combined word+char TF-IDF features.
# random_state pins the solver's internal random number generator so repeated
# runs report the same scores; 42 matches the seed used for the train/test split.
svc = LinearSVC(C=1.0, random_state=42)
svc.fit(Xtr, y_train)

# Evaluate on the hold-out part.
pred = svc.predict(Xte)
print("Acc:", accuracy_score(y_test, pred))
print("Macro-F1:", f1_score(y_test, pred, average="macro"))


In [None]:
# (c) Quick hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Small grid over the SVM regularisation strength C, selected by 3-fold
# cross-validated macro-F1 on the training part.
param_grid = {"C": [0.5, 1.0, 2.0, 5.0]}

# random_state makes every candidate fit (and the final refit) reproducible;
# 42 matches the seed used for the train/test split.
# NOTE(review): Xtr comes from vectorizers that were fit on all of X_train, so
# each CV validation fold has already contributed to the vocabulary/IDF
# weights. The CV scores may therefore be slightly optimistic; putting the
# vectorizers and classifier in one Pipeline would avoid this.
grid = GridSearchCV(
    LinearSVC(random_state=42),
    param_grid,
    cv=3,
    n_jobs=-1,
    scoring="f1_macro",
    verbose=0,
)
grid.fit(Xtr, y_train)

# best_estimator_ is the model refit on the full training part with the winning C.
best_svc = grid.best_estimator_
pred = best_svc.predict(Xte)
print("BEST", grid.best_params_)
print("Acc:", accuracy_score(y_test, pred))
print("Macro-F1:", f1_score(y_test, pred, average="macro"))

In [None]:
# 2) Error analysis (immediately actionable cells)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict with the tuned classifier trained above.
pred = best_svc.predict(Xte)
cm = confusion_matrix(y_test, pred)

# Collect every off-diagonal cell with at least one error as
# (count, true_label, predicted_label).
n_true, n_pred = cm.shape
pairs = [
    (cm[row, col], row, col)
    for row in range(n_true)
    for col in range(n_pred)
    if row != col and cm[row, col] > 0
]

# Largest confusion counts first; keep the ten most frequent.
pairs.sort(reverse=True)
pairs = pairs[:10]
pairs  # top 10 as (count, true, pred)

In [None]:
# (b) Inspect the actual texts the model got confused on

In [None]:
import pandas as pd

# Positions of the first 30 hold-out examples whose prediction differs from
# the true label.
is_wrong = pred != y_test
wrong_idx = np.where(is_wrong)[0][:30]

# One row per misclassified example: the raw text, its true label, and the
# predicted label.
column_sources = {"text": X_test, "true": y_test, "pred": pred}
pd.DataFrame({
    column: [values[i] for i in wrong_idx]
    for column, values in column_sources.items()
})

In [None]:
# 3) Archive results (for uploading to GitHub)

In [None]:
import os

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Render the hold-out confusion matrix as an image and save it to the exports
# folder.
cm_path = "02_NLP_Concepts/Intent_Classification/exports/cm_tfidf_baseline.png"

# The exports folder is only created by a later cell, so create it here too;
# otherwise savefig fails with FileNotFoundError on a fresh top-to-bottom run.
os.makedirs(os.path.dirname(cm_path), exist_ok=True)

# NOTE(review): `pred` is whatever the most recently executed cell assigned
# (the tuned LinearSVC when the notebook is run top to bottom), although the
# title and file name say "baseline" - confirm which model this is meant to show.
cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6,6))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix (TF-IDF baseline)")
plt.xlabel("Predicted"); plt.ylabel("True")
plt.tight_layout()
plt.savefig(cm_path, dpi=150)


In [None]:
# (b) Save the model / vectorizers

In [None]:
# Persist the tuned classifier together with both fitted vectorizers. All three
# are needed to reproduce predictions: new text must go through word_vec and
# char_vec and be stacked in the same order before best_svc can score it.
import joblib, os
# Create the export folder if it does not exist yet (no error if it already does).
os.makedirs("02_NLP_Concepts/Intent_Classification/exports", exist_ok=True)
joblib.dump(best_svc, "02_NLP_Concepts/Intent_Classification/exports/tfidf_linearSVC.joblib")
joblib.dump(word_vec, "02_NLP_Concepts/Intent_Classification/exports/tfidf_word.joblib")
joblib.dump(char_vec, "02_NLP_Concepts/Intent_Classification/exports/tfidf_char.joblib")
