#### SMS Spam Collection Dataset (垃圾邮件分类)
https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [None]:
'''
    Import training data
'''

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Directory that holds the input files.
data_dir = "../input/"

# Load the dataset, decoding the CSV as latin-1.
df = pd.read_csv(data_dir + '/spam.csv', encoding='latin-1')
print('Training data: ')
print(df.head())

# Hold out 20% of the rows for testing. Column v2 supplies the message text
# and column v1 the label; random_state is fixed so the split is repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(
    df['v2'], df['v1'], random_state=0, test_size=0.2)

# Preview the first ten training messages and their labels.
print('Show email contents:')
print(data_train[:10])
print('Labeled the email as Spam or Ham:')
print(labels_train[:10])


建立词汇表，并统计训练集中不重复单词的个数

In [None]:
'''
    Keep words in a dictionary with its unique ID
'''
def GetVocabulary(data):
    """Build a word -> integer id mapping from an iterable of documents.

    Each document is split on whitespace and every token is lower-cased.
    Ids are assigned in order of first appearance, starting at 0.

    Args:
        data: Iterable of document strings.

    Returns:
        dict mapping each distinct lower-cased word to its unique id.
    """
    vocab_dict = {}
    for document in data:
        for token in document.split():
            # setdefault only inserts unseen words, and the current size of
            # the dict is always the next free id.
            vocab_dict.setdefault(token.lower(), len(vocab_dict))
    return vocab_dict

# Build the vocabulary from the training split only.
vocab_dict = GetVocabulary(data_train)
print(f'Number of all the unique words : {len(vocab_dict)}')

In [None]:
'''
    Convert the Document -> Vector
'''
def Document2Vector(vocab_list, data):
    """Convert one document into a bag-of-words count vector.

    Args:
        vocab_list: Mapping from lower-cased word to its column index
            (a dict, despite the parameter name).
        data: Document text; it is split on whitespace.

    Returns:
        1-D float numpy array of length ``len(vocab_list)`` where entry
        ``vocab_list[w]`` holds the number of occurrences of word ``w``.
        Words missing from the vocabulary are ignored.
    """
    word_vector = np.zeros(len(vocab_list))
    for word in data.split():
        # The vocabulary keys are lower-cased when it is built, so the lookup
        # must lower-case too; otherwise every capitalised token is dropped.
        index = vocab_list.get(word.lower())
        if index is not None:
            word_vector[index] += 1
    return word_vector

# Peek at one training message (the trailing comma makes the key a 1-tuple
# wrapping the slice).
print(data_train[1:2,])

# Sanity-check the vectoriser on a tiny hand-written sentence.
ans = Document2Vector(vocab_dict, "we are good good")
print(ans)
# Counts stored at the columns of the three distinct words.
print(*(ans[vocab_dict[word]] for word in ('we', 'are', 'good')))

In [None]:
# One count vector per training document, in the same order as data_train.
train_matrix = [
    Document2Vector(vocab_dict, document) for document in data_train.values
]

print(len(train_matrix))

训练朴素贝叶斯（Naive Bayes）模型，得到训练集中每个词在各类别下的条件概率

In [None]:
'''
    Possibilities in Trainset：
        1. Word's possibility in one catagory e.g. P('email'|Spam)
        2. Spam or Ham's Possibilities e.g. P(Spam)
        
    计算实现巧妙利用了numpy的array结构：
        1. 在每个分类下创建一个与词汇量大小相等的vector(即 numpy array), 即spam_word_counter 和 ham_word_counter
        2. 在遍历每一个句子的时候，直接与句子对应的vector相加，累积每个单词出现的次数
        3. 在遍历完所有句子之后，再除以总词汇量，得到每个单词的概率
'''
def NaiveBayes_train(train_matrix, labels_train):
    """Estimate naive Bayes parameters from word-count vectors.

    For each class the per-word counts are accumulated over all documents of
    that class and turned into add-one (Laplace) smoothed log-probabilities:

        log P(word | class) = log((count(word, class) + 1)
                                  / (total_words(class) + vocabulary_size))

    Args:
        train_matrix: Sequence of equal-length numpy count vectors, one per
            training document. Must be non-empty.
        labels_train: Sequence indexable by position, aligned with
            ``train_matrix``. The value ``'spam'`` marks a spam document;
            any other value is treated as ham.

    Returns:
        Tuple ``(p_spam_vector, p_spam, p_ham_vector, p_ham)``:
        the per-word log conditional probabilities for spam, the log prior
        of spam, the per-word log conditional probabilities for ham, and the
        log prior of ham.
    """
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])

    # Counts start at 1: this is the "+1" in the smoothed numerator.
    spam_vector_count = np.ones(num_words)
    ham_vector_count = np.ones(num_words)

    # Total number of word occurrences seen in each class.
    spam_total_count = 0
    ham_total_count = 0

    # Number of documents in each class (used for the priors).
    spam_count = 0
    ham_count = 0
    for i in range(num_docs):
        if i % 500 == 0:
            print ('Train on the doc id:' + str(i))

        if labels_train[i] == 'spam':
            spam_vector_count += train_matrix[i]
            spam_total_count += np.sum(train_matrix[i])
            spam_count += 1
        else:
            ham_vector_count += train_matrix[i]
            ham_total_count += np.sum(train_matrix[i])
            ham_count += 1

    print (ham_count)
    print (spam_count)

    # The vocabulary size belongs in the denominator; the parentheses matter,
    # since "count / total + num_words" would add it after the division.
    p_spam_vector = np.log(spam_vector_count / (spam_total_count + num_words))
    p_ham_vector = np.log(ham_vector_count / (ham_total_count + num_words))

    return p_spam_vector, np.log(spam_count / num_docs), p_ham_vector, np.log(ham_count / num_docs)
    
# Fit the model on the vectorised training documents. The labels are passed
# as .values so that NaiveBayes_train can index them by position.
p_spam_vector, p_spam, p_ham_vector, p_ham = NaiveBayes_train(train_matrix, labels_train.values)

In [None]:
# Show the shape of the array of held-out test messages.
data_test.values.shape

In [None]:
'''
    Test words -> vectors，增加smoothing的部分
'''
def Test2Vector(vocab_dict, data):
    """Convert one test document into a bag-of-words count vector.

    Args:
        vocab_dict: Mapping from lower-cased word to its column index.
        data: Document text; it is split on whitespace and each token is
            lower-cased before the lookup.

    Returns:
        1-D float numpy array of length ``len(vocab_dict)`` holding the
        count of every in-vocabulary word. Out-of-vocabulary words do not
        appear in the vector.
    """
    word_vector = np.zeros(len(vocab_dict))
    # Number of tokens that are not in the vocabulary.
    # TODO: this tally is not returned or used for smoothing yet.
    out_of_voc = 0
    for word in data.split():
        word = word.lower()
        if word in vocab_dict:
            word_vector[vocab_dict[word]] += 1
        else:
            out_of_voc += 1
    return word_vector

'''
     Predict on testset and check which possibility is greater.
'''    
def Predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham):
    """Classify one document vector as 'spam' or 'ham'.

    Args:
        test_word_vector: numpy count vector of the document.
        p_spam_vector: per-word log-probabilities for the spam class.
        p_spam: log prior of the spam class.
        p_ham_vector: per-word log-probabilities for the ham class.
        p_ham: log prior of the ham class.

    Returns:
        'spam' when the spam score is strictly greater than the ham score,
        otherwise 'ham' (ties go to 'ham').
    """
    # Entries of test_word_vector that are zero contribute nothing, so each
    # product sums the log-probabilities of only the words that occur,
    # weighted by their counts.
    spam_score = p_spam + sum(test_word_vector * p_spam_vector)
    ham_score = p_ham + sum(test_word_vector * p_ham_vector)
    return 'spam' if spam_score > ham_score else 'ham'

# Classify every held-out document and collect the predicted labels in order.
predictions = []
for i, document in enumerate(data_test.values):
    if i % 200 == 0:
        print ('Test on the doc id:' + str(i))
    # Vectorise with the vocabulary built from the training set
    # (vocab_dict; the name vocab_list is not defined at this level).
    test_word_vector = Document2Vector(vocab_dict, document)
    ans = Predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham)
    predictions.append(ans)

print (len(predictions))

In [None]:
'''Accuracy, report in Testset '''

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score


# Compare the predicted labels with the true test labels.
accuracy = accuracy_score(labels_test, predictions)
report = classification_report(labels_test, predictions)
matrix = confusion_matrix(labels_test, predictions)

print(accuracy)
print(report)
print(matrix)
