In [2]:
import ujson
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import numpy as np


FEATURE_POSES = ["NC", "NQ"]
MA_KEY = "Contents_ma"
LABEL_KEY = "Type"
EXCL_LABEL = "환경"
NUM_FOLDS = 10


def read_documents_with_labels(input_file_name):
    documents = []
    labels = []

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        for line in input_file:
            morphs = []
            json_obj = ujson.loads(line)
            label = json_obj[LABEL_KEY]
            
            if label == EXCL_LABEL:
                continue
                
            labels.append(label)

            for sent_anal in json_obj[MA_KEY]:
                for morph_lex, morph_cat in sent_anal:
                    if morph_cat not in FEATURE_POSES:
                        continue

                    morphs.append(morph_lex)

            document = " ".join(morphs)
            documents.append(document)

    # scikit의 다양한 기능을 이용하기 위해서
    # 파이썬 리스트를 numpy 모듈의 배열(array)로 바꾼다.
    documents = np.asarray(documents)
    labels = np.asarray(labels)
    
    return documents, labels


def build_doc_term_mats(train_documents, test_documents):
    vectorizer = TfidfVectorizer(tokenizer=str.split)
    train_doc_term_mat = vectorizer.fit_transform(train_documents)
    test_doc_term_mat = vectorizer.transform(test_documents)

    return train_doc_term_mat, test_doc_term_mat


def build_classifier(train_doc_term_mat, train_labels):
    clf = SGDClassifier()
    clf.fit(train_doc_term_mat, train_labels)

    return clf


def test_classifier(clf, test_doc_term_mat, test_labels):
    pred_labels = clf.predict(test_doc_term_mat)
    accuracy = accuracy_score(test_labels, pred_labels)
    precision = precision_score(test_labels, pred_labels, average="macro")
    recall = recall_score(test_labels, pred_labels, average="macro")
    f1 = f1_score(test_labels, pred_labels, average="macro")

    return accuracy, precision, recall, f1


def main():
    input_file_name = "../data/user/news_data.ma.txt"
    documents, labels = read_documents_with_labels(input_file_name)
    cross_val_set = KFold(n=len(documents), n_folds=NUM_FOLDS, shuffle=True)
    accuracies = []
    precisions = []
    recalls = []
    f1s = []

    for train, test in cross_val_set:
        train_documents = documents[train]
        train_labels = labels[train]
        test_documents = documents[test]
        test_labels = labels[test]

        train_doc_term_mat, test_doc_term_mat = \
                build_doc_term_mats(train_documents, test_documents)
        clf = build_classifier(train_doc_term_mat, train_labels)
        accuracy, precision, recall, f1 = \
                test_classifier(clf, test_doc_term_mat, test_labels)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    print("Avg Accuracy: {}, Std Dev: {}".format(np.mean(accuracies), 
                                                 np.std(accuracies)))
    print("Avg Preision: {}, Std Dev: {}".format(np.mean(precisions), 
                                                 np.std(precision)))
    print("Avg Recall: {}, Std Dev: {}".format(np.mean(recalls), 
                                               np.std(recalls)))
    print("Avg F1: {}, Std Dev: {}".format(np.mean(f1s), 
                                           np.std(f1s)))

    
# 실행
main()

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Avg Accuracy: 0.7147676767676766, Std Dev: 0.04074224111659713
Avg Preision: 0.449339292257113, Std Dev: 0.0
Avg Recall: 0.4322521853227904, Std Dev: 0.07198938157493094
Avg F1: 0.4260110943498101, Std Dev: 0.06976468352947812
