# 분류 모델의 저장과 이용

In [1]:
# 학습한 분류 모델 저장

import ujson
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
import numpy as np

FEATURE_POSES = ["NC", "NQ"]
MA_KEY = "Contents_ma"
LABEL_KEY = "Category"


def read_documents_with_labels(input_file_name):
    """Read documents and their labels from a JSON-lines file.

    Each non-blank line of the file is parsed as one JSON object. The label
    is taken from the ``LABEL_KEY`` field. The document text is built from
    the ``MA_KEY`` field, which is iterated as a sequence of sentences, each
    a sequence of ``(lexical form, POS tag)`` pairs. Only morphemes whose tag
    is listed in ``FEATURE_POSES`` are kept, and they are joined with single
    spaces.

    Args:
        input_file_name: Path of the UTF-8 encoded input file.

    Returns:
        A ``(documents, labels)`` tuple of NumPy arrays with one entry per
        parsed line, in file order.
    """

    documents = []
    labels = []

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a JSON document; skip it instead of letting the parser
            # raise.
            if not line.strip():
                continue

            json_obj = ujson.loads(line)
            labels.append(json_obj[LABEL_KEY])

            # Keep only the morphemes whose POS tag is a selected feature.
            morphs = [
                morph_lex
                for sent_anal in json_obj[MA_KEY]
                for morph_lex, morph_cat in sent_anal
                if morph_cat in FEATURE_POSES
            ]
            documents.append(" ".join(morphs))

    return np.asarray(documents), np.asarray(labels)


def build_pipeline(train_documents, train_labels):
    """Build and fit a text-classification pipeline.

    The pipeline has two steps: a ``TfidfVectorizer`` named ``"vect"`` and an
    ``SGDClassifier`` named ``"clf"``.

    Args:
        train_documents: Training documents as whitespace-separated tokens.
        train_labels: Labels aligned with ``train_documents``.

    Returns:
        The fitted ``Pipeline``.
    """

    steps = [
        (
            'vect',
            TfidfVectorizer(
                # Documents are already tokenized; split on whitespace only.
                tokenizer=str.split,
                ngram_range=(1, 2),
                max_features=5000,
                sublinear_tf=True,
                norm="l2",
                # NOTE(review): smooth_idf only matters when use_idf=True;
                # confirm whether IDF weighting was meant to be disabled.
                use_idf=False,
                smooth_idf=True,
            ),
        ),
        (
            'clf',
            SGDClassifier(loss="log", penalty="l2", alpha=0.0001),
        ),
    ]

    model = Pipeline(steps)
    model.fit(train_documents, train_labels)

    return model


def save_pipeline(model_file_name, pipeline):
    """Write the given classification pipeline to a model file.

    Args:
        model_file_name: Path of the file to write.
        pipeline: The object to serialize with ``joblib.dump``.
    """

    joblib.dump(value=pipeline, filename=model_file_name)


def main():
    """Train the news-article classification model and save it to disk."""

    data_path = "../data/user/news_data.ma.txt"
    model_path = "../data/user/news_class.model"

    docs, targets = read_documents_with_labels(data_path)
    trained = build_pipeline(docs, targets)
    save_pipeline(model_path, trained)
    
    
# Run: train the model and write it to the model file.
main()

In [None]:
# 저장한 분류 모델 이용

import ujson
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
import numpy as np

FEATURE_POSES = ["NC", "NQ"]
MA_KEY = "Contents_ma"
LABEL_KEY = "Category"


def read_documents_with_labels(input_file_name):
    """Read documents and their labels from a JSON-lines file.

    Every non-blank line is parsed as a JSON object. Its ``LABEL_KEY`` value
    becomes the label. Its ``MA_KEY`` value is walked as sentences of
    ``(lexical form, POS tag)`` pairs; morphemes whose tag appears in
    ``FEATURE_POSES`` are joined with single spaces to form the document.

    Args:
        input_file_name: Path of the UTF-8 encoded input file.

    Returns:
        A ``(documents, labels)`` tuple of NumPy arrays with one entry per
        parsed line, in file order.
    """

    documents = []
    labels = []

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        for raw_line in input_file:
            # Blank lines (such as a trailing newline at end of file) carry
            # no JSON document and would make the parser raise; skip them.
            if raw_line.strip() == "":
                continue

            record = ujson.loads(raw_line)
            labels.append(record[LABEL_KEY])

            selected = []
            for sentence in record[MA_KEY]:
                # Keep the lexical form only when its POS tag is a feature.
                selected.extend(
                    lex for lex, pos in sentence if pos in FEATURE_POSES
                )

            documents.append(" ".join(selected))

    return np.asarray(documents), np.asarray(labels)


def load_pipeline(model_file_name):
    """Load a classification pipeline from the named model file.

    Args:
        model_file_name: Path of a file previously written with
            ``joblib.dump``.

    Returns:
        The object deserialized by ``joblib.load``.
    """

    # NOTE: joblib persistence is pickle-based, so only load model files
    # that come from a trusted source.
    return joblib.load(model_file_name)
    
    
def test_pipeline(pipeline, test_documents, test_labels):
    """Measure the pipeline's accuracy and print it.

    Args:
        pipeline: A fitted object exposing ``predict``.
        test_documents: Documents to classify.
        test_labels: Expected labels aligned with ``test_documents``.
    """

    score = accuracy_score(test_labels, pipeline.predict(test_documents))
    print("Accuracy: {}".format(score))


def main():
    """Load the saved classification model and run classification with it."""

    # NOTE(review): this is the same input path the training cell reads, so
    # the printed accuracy is measured on the training data.
    data_path = "../data/user/news_data.ma.txt"
    model_path = "../data/user/news_class.model"

    docs, targets = read_documents_with_labels(data_path)
    test_pipeline(load_pipeline(model_path), docs, targets)
    
    
# Run: load the saved model and print its accuracy.
main()