In [41]:
import os
import jieba
import re
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
def load_from_txt(fdir, stop_words):
    """loading data from txt 

    @type fdir: str
    @type stop_words: set
    @rtype: data List
    @rtype: word_set Dict
    """
    
    data = []
    word_set = set()
    p1 = re.compile('[0-9a-zA-Z]')
    if os.path.exists(fdir) == False:
        print(fdir,'does not exist, try again!')
        return None
    for fname in os.listdir(fdir):
        content = []
        for line in open(fdir+'/'+fname):
            line = p1.subn('',line)[0]
            for word in jieba.cut(line.strip()):
                if len(word) > 1 \
                   and word not in stop_words:
                    content.append(word)
                    word_set.add(word)
        data.append(' '.join(content))
    return data, word_set

In [43]:

def write_dict(word_set, fname):
    """write the dict into a file

    @type word_set: word dictionary
    @type fname: dictionary filename
    """
    f = open(fname,'w')
    for word in word_set:
        #use the utf8 encode
        f.write(word.encode('utf8')+'\n')
    f.close()

In [44]:
def write_seg_result(data, fname):
    """write the seg result into a file

    @type data: seg data
    @type fname: seg filename
    """
    f = open(fname,'w')
    for line in data:
        #use the utf8 encode
        f.write(line.encode('utf8')+'\n')
    f.close()

In [45]:
def write_result(result_data, fname):
    """write the result into a file

    @type data: reuslt data
    @type fname: result filename
    """
    f = open(fname,'w')
    for d in result_data:
        s = ''
        for dd in d[:-1]:
            s += str(dd)+','    
        f.write(s+str(d[-1])+'\n')
    f.close()

In [46]:
def word_count(data, word_set):
    """count the data into the map
    @type data: seg data
    @type word_set: word dictionary
    """
    wd = dict(zip(word_set, range(len(word_set))))
    result_data = []
    for line in data:
        d = [0]*len(word_set)
        for word in line.split(' '):
            if len(word) > 0:
                d[wd[word]] += 1
        result_data.append(d)
    return result_data

In [47]:
def get_data_result(fdir):
    print ('load stop words')
    stop_words = set([i.strip().encode('utf8').decode('utf8') for i in open('stoplis.txt')])
    print ('load file and seg:')    
    data,word_set = load_from_txt(fdir, stop_words)
    print ('seg over')
    print ('write the dict')
    write_dict(word_set, 'my_dict_'+fdir+'.txt')
    print ('write the seg file')
    write_seg_result(data, 'seg_'+fdir+'.txt')
    print ('start to word count')    
    return data,word_set

In [48]:
def text_preocess():
    data = [i.strip().decode('utf8') for i in open('seg_pos.txt')]    
    label_pos = np.ones((3000,1),dtype='int')
    label_neg = np.zeros((3000,1),dtype='int')
    label = np.vstack((label_pos,label_neg))    
    data = data+[i.strip().decode('utf8') for i in open('seg_neg.txt')]
    c = CountVectorizer(min_df=3,ngram_range=(1,3))
    tfidf = TfidfTransformer()
    tfidf_all=tfidf.fit_transform(c.fit_transform(data))
    print(tfidf_all.shape)
    data = tfidf_all
    labels = label
    xtrain, xtest, ytrain, ytest = train_test_split(data.toarray(), labels, test_size=0.4, random_state=1)
    model = GaussianNB()
    model.fit(xtrain, ytrain)
    ypredict = model.predict(xtest)
    print ('accuracy is:',accuracy_score(ytest, ypredict))

In [49]:
def predict():
    pos_data = np.array([i.strip().split(',') for i in open('data_pos.txt')])
    m1 = np.array(pos_data,dtype='int')
    label = np.ones((3000,1),dtype='int')
    pd = np.hstack((m1,label))
    #print(pd.shape)
    neg_data = np.array([i.strip().split(',') for i in open('data_neg.txt')])
    m1 = np.array(neg_data,dtype='int')
    label = np.zeros((3000,1),dtype='int')
    nd = np.hstack((m1,label))
    print (pd.shape)
    print (nd.shape)
    ad = np.concatenate((pd,nd))
    data = np.array(ad[:, :-1])
    
    labels = np.array(ad[:,-1])
    xtrain, xtest, ytrain, ytest = train_test_split(data, labels, test_size=0.4, random_state=1)
    model = GaussianNB()
    model.fit(xtrain, ytrain)
    ypredict = model.predict(xtest)
    print ('accuracy is:',accuracy_score(ytest, ypredict))

In [50]:
if __name__ =='__main__':
    # main function    
    posdata,word_set_pos = get_data_result('pos')
    negdata,word_set_neg = get_data_result('neg')
    word_set = word_set_pos | word_set_neg
    result_data = word_count(posdata, word_set)
    write_result(result_data, 'data_pos.txt')
    result_data = word_count(negdata, word_set)
    write_result(result_data, 'data_neg.txt')
    text_preocess()
    #predict()

load stop words
load file and seg:


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbe in position 0: invalid start byte