In [10]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

# 1. 데이터 로드
categories = ['rec.sport.hockey', 'sci.med', 'comp.graphics', 'talk.politics.mideast']  # 선택된 카테고리
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

# 2. Document-Term Matrix
def document_term_matrix(X_train, X_test):
    vectorizer = CountVectorizer(max_features=5000)  # 상위 5000개 단어 사용
    X_train_dtm = vectorizer.fit_transform(X_train).toarray()
    X_test_dtm = vectorizer.transform(X_test).toarray()
    return X_train_dtm, X_test_dtm

# 3. TF-IDF Matrix
def tfidf_matrix(X_train, X_test):
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = vectorizer.transform(X_test).toarray()
    return X_train_tfidf, X_test_tfidf

# 4. Word2Vec 임베딩
def word2vec_embedding_optimized(X_train, X_test):
    X_train_tokens = [text.split() for text in X_train]
    X_test_tokens = [text.split() for text in X_test]
    
    # Word2Vec 모델
    model = Word2Vec(
        sentences=X_train_tokens,
        vector_size=768,   # 벡터 크기 증가
        window=10,         # 넓은 컨텍스트
        min_count=2,       # 최소 단어 빈도 증가
        workers=10,
        sg=1,              # Skip-Gram 사용
        epochs=15          # 학습 반복 증가
    )
    
    # 평균 임베딩 계산
    def get_avg_embedding(tokens, model):
        embeddings = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)
    
    X_train_w2v = np.array([get_avg_embedding(tokens, model) for tokens in X_train_tokens])
    X_test_w2v = np.array([get_avg_embedding(tokens, model) for tokens in X_test_tokens])
    
    return X_train_w2v, X_test_w2v

# 5. Pre-Trained GloVe 임베딩
def glove_embedding(X_train, X_test, glove_path='glove.twitter.27B.25d.txt'):
    # GloVe 벡터 로드
    glove = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)
    # 토큰화
    X_train_tokens = [text.split() for text in X_train]
    X_test_tokens = [text.split() for text in X_test]
    # 평균 임베딩 계산
    def get_avg_embedding(tokens, glove):
        embeddings = [glove[word] for word in tokens if word in glove]
        return np.mean(embeddings, axis=0) if embeddings else np.zeros(glove.vector_size)
    X_train_glove = np.array([get_avg_embedding(tokens, glove) for tokens in X_train_tokens])
    X_test_glove = np.array([get_avg_embedding(tokens, glove) for tokens in X_test_tokens])
    return X_train_glove, X_test_glove

# 6. 모델 학습 및 성능 평가
def evaluate_model(X_train, X_test, y_train, y_test, method_name):
    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)  # 클래스별 확률 분포
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # multi_class='ovr'를 위해 다중 클래스의 클래스별 확률 제공
    try:
        auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    except ValueError:
        auc = float('nan')  # AUC 계산이 실패하면 NaN
    
    print(f"{method_name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}, ROC/AUC: {auc:.4f}")
    return acc, f1, auc


# 7. 각 방법별로 실행
print("Evaluating Document-Term Matrix...")
X_train_dtm, X_test_dtm = document_term_matrix(X_train, X_test)
evaluate_model(X_train_dtm, X_test_dtm, y_train, y_test, "DTM")

print("\nEvaluating TF-IDF Matrix...")
X_train_tfidf, X_test_tfidf = tfidf_matrix(X_train, X_test)
evaluate_model(X_train_tfidf, X_test_tfidf, y_train, y_test, "TF-IDF")

print("\nEvaluating Word2Vec Embedding...")
X_train_w2v, X_test_w2v = word2vec_embedding_optimized(X_train, X_test)
evaluate_model(X_train_w2v, X_test_w2v, y_train, y_test, "Word2Vec")

print("\nEvaluating Pre-Trained GloVe Embedding...")
glove_path = '/path/to/glove.twitter.27B.25d.txt'  # 실제 파일 경로
X_train_glove, X_test_glove = glove_embedding(X_train, X_test, glove_path)
evaluate_model(X_train_glove, X_test_glove, y_train, y_test, "GloVe")


Evaluating Document-Term Matrix...
DTM - Accuracy: 0.8540, F1 Score: 0.8550, ROC/AUC: 0.9772

Evaluating TF-IDF Matrix...
TF-IDF - Accuracy: 0.8886, F1 Score: 0.8887, ROC/AUC: 0.9854

Evaluating Word2Vec Embedding...
Word2Vec - Accuracy: 0.8809, F1 Score: 0.8817, ROC/AUC: 0.9768

Evaluating Pre-Trained GloVe Embedding...


FileNotFoundError: [Errno 2] No such file or directory: '/path/to/glove.twitter.27B.25d.txt'