In [4]:
import pandas as pd
import numpy as np

from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import gensim
from sklearn import metrics
from sklearn.svm import SVC
import re

# 保存训练好的模型
def storeModel(MyModel, fileName):
    fw = open(fileName, 'wb')
    pickle.dump(MyModel, fw)
    fw.close()

# 读取模型
def loadModel(fileName):
    fr = open(fileName, 'rb')
    MyModel = pickle.load(fr)
    fr.close()
    return MyModel

# 过滤emoji
def filterFace(destStr):
    try:
        p = re.compile(u'([[\U00010000-\U0010ffff])')
    except re.error:
        p = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])')

    t = p.findall(destStr)
    return list(set(t))


# 过滤类似 :)  :(的表情
def filterFace1(destStr):
    p = re.compile(":\)")
    q = re.compile(":\(")
    t1 = list(p.findall(destStr))
    t2 = list(q.findall(destStr))

    if len(t1) < 1:
        t = t2
    else:
        t = t1
    return t


# 预处理文本信息，数据清洗
def proReview(dataSet):
    # 缺失项统计
    # print("null num :", trainData.isnull().sum())
    # 表情处理
    emjoy = []
    for words in dataSet['review']:
        emjoy.append(filterFace(words))

    emjoy1 = []
    for words in dataSet['review']:
        emjoy1.append(filterFace1(words))

    for i in range(len(emjoy)):
        emjoy[i].extend(tok for tok in emjoy1[i])

    # 去除字符串中的非字母 数字 下划线 空格
    dataSet['review'] = dataSet['review'].str.replace('[^\w\s]', ' ')
    dataSet['review'] = dataSet['review'].str.replace('[0-9]', '')
    # 将所有的大写转化为小写
    dataSet['review'] = dataSet['review'].apply(lambda sen: " ".join(x.lower() for x in sen.split() if len(x) > 0))
    dat = []
    for i in range(len(dataSet['review'])):
        strTemp = dataSet['review'][i] + ' '.join(emjoy[i])
        dat.append(strTemp)
    '''
    # 常见词去除
    freq = pd.Series(' '.join(dataSet['review']).split()).value_counts()[:5]
    delFreq = list(freq.index)
    dataSet['review'] = dataSet['review'].apply(lambda x: ' '.join(word for word in x.split() if word not in delFreq))


    # 稀缺词去除
    freq = pd.Series(' '.join(trainData['review']).split()).value_counts()[-10:]
    delFreq = list(freq.index)
    trainData['review'] = trainData['review'].apply(lambda x: ' '.join(word for word in x.split() if word not in delFreq))
    '''
    # 将表情添加进去
    
    return pd.Series(dat)

# 预处理数据
def loadData():
    trainData = pd.read_csv('train.csv', lineterminator='\n')
    testData = pd.read_csv('20190506_test.csv', lineterminator='\n')

    trainDataMat = proReview(trainData)
    testDataMat = proReview(testData)

    m = trainDataMat.shape[0]
    # get babels
    labelsText = trainData['label']
    labels = np.ones((m, 1))
    for i in range(m):
        if labelsText[i] == 'Negative':
            labels[i] = 0

    return trainDataMat, testDataMat, labels

# 加载数据
trainDataSet, testDataSet, labels = loadData()
trainLen = len(trainDataSet)

# TFI-TF 关键词提取以及分词
tfidf_vec = TfidfVectorizer(max_df=0.7, min_df=2, ngram_range=(1,3))
tfidf_matrix = tfidf_vec.fit_transform(trainDataSet.astype('U').tolist() + testDataSet.astype('U').tolist())

trainMat = tfidf_matrix[:trainLen]
testMat = tfidf_matrix[trainLen:]
trainLabels = labels

# K fold
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

MNB = MultinomialNB(alpha=0.7)
sfd = StratifiedKFold(n_splits=10) # 10折验证
sfd.get_n_splits(trainMat, trainLabels)
sumAuc = 0.0
for train_index, test_index in sfd.split(trainMat, labels):
    X_train = trainMat[train_index]
    y_train = trainLabels[train_index]
    
    X_test = trainMat[test_index]
    y_test = trainLabels[test_index]
    
    MNB.fit(X_train, y_train)
    auc = roc_auc_score(y_test, MNB.predict_proba(X_test)[:, 1])
    sumAuc += auc
    print(auc)
print("ave Auc :", sumAuc / 10.0)
MNB.fit(trainMat, trainLabels)
# 保存模型
storeModel(MNB, 'MNB.txt')
# 读取模型
MNB = loadModel('MNB.txt')
predictions = MNB.predict_proba(testMat)
result = {'ID': range(1, len(predictions) + 1), "Pred": predictions[:, 1]}
result = pd.DataFrame(result)

result.to_csv('submit.csv', index=False)


0.8830041263275684
0.8497474747474748
0.8579044412377747
0.8721941638608306
0.8703803912137246
0.8435445727112393
0.8814534231200897
0.8441320785070785
0.8560066763191765
0.8804898648648649
ave Auc : 0.8638857212909821


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
