# 신문 기사 분류 실험

In [None]:
import ujson
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import numpy as np


# Part-of-speech tags whose morphemes are kept as features; every other tag
# is dropped when documents are read.
# NOTE(review): "NC"/"NQ" are presumably noun tags of the tagger that
# produced the input file -- confirm against its tag set.
FEATURE_POSES = ["NC", "NQ"]
# JSON key holding the per-sentence morphological analysis of an article.
MA_KEY = "Contents_ma"
# JSON key holding the article's class label.
LABEL_KEY = "Category"
# Number of folds used for cross-validation in main().
NUM_FOLDS = 10


def read_documents_with_labels(input_file_name):
    """Load documents and labels from a JSON-lines morphological-analysis file.

    Every non-blank line is parsed as one JSON object. The value stored under
    LABEL_KEY becomes the label. The value stored under MA_KEY is iterated as
    a sequence of sentences, each a sequence of (lexeme, POS tag) pairs; the
    lexemes whose tag is listed in FEATURE_POSES are joined with single spaces
    to form the document text.

    Args:
        input_file_name: Path of the UTF-8 encoded input file.

    Returns:
        A ``(documents, labels)`` tuple of numpy arrays with one entry per
        parsed line, in file order.

    Raises:
        KeyError: If a parsed object lacks LABEL_KEY or MA_KEY.
    """
    documents = []
    labels = []

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # A blank line (typically a trailing newline at the end of the
            # file) is not a JSON document; skip it instead of letting the
            # parser abort the whole load.
            if not line.strip():
                continue

            json_obj = ujson.loads(line)
            labels.append(json_obj[LABEL_KEY])

            morphs = [
                morph_lex
                for sent_anal in json_obj[MA_KEY]
                for morph_lex, morph_cat in sent_anal
                if morph_cat in FEATURE_POSES
            ]
            documents.append(" ".join(morphs))

    # Convert the Python lists to numpy arrays so that callers can select
    # subsets with index arrays.
    return np.asarray(documents), np.asarray(labels)


def build_doc_term_mats(train_documents, test_documents):
    """Vectorize train and test documents with a shared TF-IDF vectorizer.

    The vectorizer tokenizes with ``str.split`` and is fitted on the training
    documents only; the test documents are transformed with that fitted
    vectorizer.

    Args:
        train_documents: Documents used to fit the vectorizer.
        test_documents: Documents transformed with the fitted vectorizer.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of document-term matrices.
    """
    tfidf = TfidfVectorizer(tokenizer=str.split)
    train_matrix = tfidf.fit_transform(train_documents)
    test_matrix = tfidf.transform(test_documents)
    return train_matrix, test_matrix


def build_classifier(train_doc_term_mat, train_labels):
    """Fit an SGDClassifier with default settings and return it.

    Args:
        train_doc_term_mat: Training document-term matrix.
        train_labels: Labels aligned with the rows of the training matrix.

    Returns:
        The fitted classifier.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return SGDClassifier().fit(train_doc_term_mat, train_labels)


def test_classifier(clf, test_doc_term_mat, test_labels):
    """Score a fitted classifier on a test set.

    Args:
        clf: Fitted classifier exposing ``predict``.
        test_doc_term_mat: Test document-term matrix.
        test_labels: True labels aligned with the rows of the test matrix.

    Returns:
        An ``(accuracy, precision, recall, f1)`` tuple; precision, recall and
        F1 are macro-averaged.
    """
    predicted = clf.predict(test_doc_term_mat)
    accuracy = accuracy_score(test_labels, predicted)

    # The three remaining metrics share the same call signature.
    precision, recall, f1 = (
        scorer(test_labels, predicted, average="macro")
        for scorer in (precision_score, recall_score, f1_score)
    )

    return accuracy, precision, recall, f1


def main():
    """Run NUM_FOLDS-fold cross-validation and print per-metric statistics.

    For each fold a TF-IDF vectorizer and an SGD classifier are fitted on the
    training part and scored on the held-out part. The mean and standard
    deviation across folds are printed for accuracy, precision, recall and F1.
    """
    input_file_name = "../data/user/news_data.ma.txt"
    documents, labels = read_documents_with_labels(input_file_name)
    # NOTE(review): this is the legacy KFold signature from
    # sklearn.cross_validation; newer scikit-learn releases moved KFold to
    # sklearn.model_selection with a different interface.
    cross_val_set = KFold(n=len(documents), n_folds=NUM_FOLDS, shuffle=True)
    accuracies = []
    precisions = []
    recalls = []
    f1s = []

    for train, test in cross_val_set:
        train_doc_term_mat, test_doc_term_mat = build_doc_term_mats(
            documents[train], documents[test])
        clf = build_classifier(train_doc_term_mat, labels[train])
        accuracy, precision, recall, f1 = test_classifier(
            clf, test_doc_term_mat, labels[test])
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # Report every metric from its per-fold list. The standard deviation must
    # be taken over the list, not over the last fold's scalar, which would
    # always yield 0.0.
    summaries = (
        ("Accuracy", accuracies),
        ("Precision", precisions),
        ("Recall", recalls),
        ("F1", f1s),
    )

    for metric_name, scores in summaries:
        print("Avg {}: {}, Std Dev: {}".format(metric_name, np.mean(scores),
                                               np.std(scores)))

    
# Run the cross-validation experiment.
main()

In [None]:
import ujson
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

# Part-of-speech tags whose morphemes are kept as features; every other tag
# is dropped when documents are read.
# NOTE(review): "NC"/"NQ" are presumably noun tags of the tagger that
# produced the input file -- confirm against its tag set.
FEATURE_POSES = ["NC", "NQ"]
# JSON key holding the per-sentence morphological analysis of an article.
MA_KEY = "Contents_ma"
# JSON key holding the article's class label.
LABEL_KEY = "Category"
# NOTE(review): not referenced by the code visible in this cell (the grid
# search below hard-codes cv=3) -- confirm whether it is still needed.
NUM_FOLDS = 10


def read_documents_with_labels(input_file_name):
    """Load documents and labels from a JSON-lines morphological-analysis file.

    Every non-blank line is parsed as one JSON object. The value stored under
    LABEL_KEY becomes the label. The value stored under MA_KEY is iterated as
    a sequence of sentences, each a sequence of (lexeme, POS tag) pairs; the
    lexemes whose tag is listed in FEATURE_POSES are joined with single spaces
    to form the document text.

    Args:
        input_file_name: Path of the UTF-8 encoded input file.

    Returns:
        A ``(documents, labels)`` tuple of numpy arrays with one entry per
        parsed line, in file order.

    Raises:
        KeyError: If a parsed object lacks LABEL_KEY or MA_KEY.
    """
    documents = []
    labels = []

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # A blank line (typically a trailing newline at the end of the
            # file) is not a JSON document; skip it instead of letting the
            # parser abort the whole load.
            if not line.strip():
                continue

            json_obj = ujson.loads(line)
            labels.append(json_obj[LABEL_KEY])

            morphs = [
                morph_lex
                for sent_anal in json_obj[MA_KEY]
                for morph_lex, morph_cat in sent_anal
                if morph_cat in FEATURE_POSES
            ]
            documents.append(" ".join(morphs))

    # Convert the Python lists to numpy arrays so that scikit-learn utilities
    # receive array inputs.
    return np.asarray(documents), np.asarray(labels)


def build_pipeline():
    """Create the TF-IDF + SGD classification pipeline.

    Returns:
        A Pipeline whose ``vect`` step is a TfidfVectorizer tokenizing with
        ``str.split`` and whose ``clf`` step is a default SGDClassifier.
    """
    steps = [
        ('vect', TfidfVectorizer(tokenizer=str.split)),
        ('clf', SGDClassifier()),
    ]
    return Pipeline(steps)


def main():
    """Grid-search the pipeline's hyper-parameters and report the results.

    The data is split into a training and a held-out part. A 3-fold
    cross-validated grid search scored by accuracy runs on the training part.
    The best cross-validation score, the selected value of every searched
    parameter, and the accuracy on the held-out part are then printed.
    """
    input_file_name = "../data/user/news_data.ma.txt"
    documents, labels = read_documents_with_labels(input_file_name)
    train_documents, test_documents, train_labels, test_labels = \
            train_test_split(documents, labels)
    pipeline = build_pipeline()

    # Keys follow the "<pipeline step>__<parameter>" convention so the grid
    # search can route each value to the right pipeline step.
    parameters = {
        "vect__max_features": (5000, None),
        "vect__ngram_range": ((1, 1), (1, 2)),
        "vect__use_idf": (True, False),
        "vect__smooth_idf": (True, False),
        "vect__sublinear_tf": (True, False),
        "vect__norm": ("l1", "l2", None),
        "clf__loss": ("hinge", "log"),
        "clf__penalty": ("l1", "l2", "none"),
        "clf__alpha": (0.0001, 0.05, 0.1),
    }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                               scoring="accuracy", cv=3)
    grid_search.fit(train_documents, train_labels)
    # best_params_ holds exactly the searched parameters, so there is no need
    # to dig them out of the refitted estimator's full parameter dictionary.
    best_parameters = grid_search.best_params_

    pred_labels = grid_search.predict(test_documents)

    print("Best score: {}".format(grid_search.best_score_))

    print("Best parameter set:")

    for param_name in parameters:
        print("\t{}: {}".format(param_name, best_parameters[param_name]))

    print("Accuracy: {}".format(accuracy_score(test_labels, pred_labels)))


# Run the grid-search experiment.
main()