sklearn

In [1]:
# coding=utf8
# 调用sklearn实现逻辑回归对短文本（垃圾短信）分类
#  Copyright 木豆
#  2017-11-6

import os
import sys
import jieba
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

def create_vocab_dict(dataSet):
    """Count how often every term occurs across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable terms.

    Returns:
        dict mapping each term to its total number of occurrences.
    """
    term_counts = {}
    for doc in dataSet:
        for token in doc:
            # dict.get supplies 0 the first time a term is encountered.
            term_counts[token] = term_counts.get(token, 0) + 1
    return term_counts

def BOW_feature(vocabList, inputSet):
    """Build a bag-of-words count vector for one tokenized text.

    Args:
        vocabList: sequence of vocabulary words; entry i of the result
            counts occurrences of vocabList[i]. Words must be hashable.
        inputSet: iterable of tokens belonging to a single text.

    Returns:
        list of ints of length len(vocabList). Tokens that are not in the
        vocabulary are ignored.
    """
    # Build the word -> position lookup once so each token costs a single
    # hash lookup instead of two linear scans (`in` plus `.index`).
    # setdefault keeps the FIRST position of a duplicated word, which is
    # what list.index() returned.
    word_to_index = {}
    for position, word in enumerate(vocabList):
        word_to_index.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = word_to_index.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

def get_dataset(data_path):
    """Load tokenized texts and numeric labels from a tab-separated file.

    Trailing whitespace is stripped from every line before it is split on
    tabs. Lines with fewer than three fields are skipped. Field 0 is the
    label, field 2 is the text (split on whitespace into tokens); field 1
    is not used.

    Args:
        data_path: path of the file to read.

    Returns:
        (text_list, labels): text_list is a list of token lists and labels
        is a list of floats, aligned by position.

    Raises:
        ValueError: if a label field cannot be converted to float.
    """
    text_list = []
    labels = []
    # The context manager closes the file even if parsing raises; the
    # original bare open() call never closed the handle.
    with open(data_path, "r") as data_file:
        for line in data_file:
            arr = line.rstrip().split('\t')
            if len(arr) < 3:
                continue
            label = arr[0]
            text = arr[2]
            text_list.append(text.split())
            labels.append(float(label))
    return text_list, labels

if __name__ == "__main__":

    # Load the training and test sets.
    train_file_path = 'mudou_spam.train'
    test_file_path = 'mudou_spam.test'
    train_data, train_label = get_dataset(train_file_path)
    test_data, test_label = get_dataset(test_file_path)

    # Build the vocabulary from the training set only, most frequent first.
    # NOTE: the comparison below is strict, so a term is kept only when it
    # occurs MORE than min_freq times.
    min_freq = 5
    vocab_dict = create_vocab_dict(train_data)
    # items() replaces the Python-2-only iteritems() so the script runs
    # unchanged on both Python 2 and Python 3.
    sorted_vocab_list = sorted(vocab_dict.items(), key=lambda d: d[1], reverse=True)
    vocab_list = [v[0] for v in sorted_vocab_list if int(v[1]) > min_freq]

    # Turn every text into its bag-of-words (BOW) count vector.
    train_X = [BOW_feature(vocab_list, one_msg) for one_msg in train_data]
    test_X = [BOW_feature(vocab_list, one_msg) for one_msg in test_data]

    # Train the model.
    model = LogisticRegression()
    model.fit(train_X, train_label)
    pred = model.predict(test_X)

    # Evaluate the model. The '%s' formatting yields the same text as the
    # former `print 'label', value` statements and is valid on Python 2 and 3.
    accuracy_train = model.score(train_X, train_label)
    print('训练集accuracy: %s' % accuracy_train)
    accuracy_test = model.score(test_X, test_label)
    print('测试集accuracy:  %s' % accuracy_test)

    # AUC is computed from the predicted probability of the second class
    # (column 1 of predict_proba).
    predict_prob_y = model.predict_proba(test_X)[:, 1]
    test_auc = metrics.roc_auc_score(test_label, predict_prob_y)
    print('测试集AUC: %s' % test_auc)


训练集accuracy: 0.993464052288
测试集accuracy:  0.962454873646
测试集AUC: 0.991765957543


In [None]:
from sklearn.linear_model import LogisticRegression