In [21]:
# Step1。加载数据
import os 
import pickle 
import pandas as pd
import numpy as np
# Load the raw news table. data.pkl is a pickle cache of the parsed CSV so the
# (slow) CSV parse only happens the first time the notebook is run.
if os.path.isfile('data.pkl'):
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load a cache file that this notebook wrote itself.
    # `with` guarantees the handle is closed even if unpickling raises.
    with open('data.pkl', 'rb') as f:
        data = pickle.load(f)
else:
    data = pd.read_csv('sqlResult.csv', encoding='gb18030')
    # Persist the parsed frame for the next run.
    with open('data.pkl', 'wb') as f:
        pickle.dump(data, f)
# Stop-word list: one entry per line of chinese_stopwords.txt, with the
# trailing newline removed (other surrounding whitespace is kept as-is).
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches the encoding of the stop-word file.
stopWords = []
with open('chinese_stopwords.txt') as stopWordsFile:
    stopWords = list(map(lambda line: line.strip('\n'), stopWordsFile))
#print(data.info)
# Step 2. Data cleaning
# 1) Drop every row whose `content` field is missing.
data = data.dropna(subset=['content'])
# 2) Tokenise each article with split_text (jieba segmentation + stop-word
#    removal).
# 3) Cache the tokenised corpus in coups.pkl so later runs can skip the
#    segmentation step.
# NOTE(review): split_text is defined in a later cell of this notebook; on a
# fresh kernel that cell must be run before this one when no cache exists.
if os.path.isfile('coups.pkl'):
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load a cache file that this notebook wrote itself.
    with open('coups.pkl', 'rb') as f:
        coups = pickle.load(f)
else:
    # str() guards against non-string cells in the content column.
    coups = [split_text(str(text)) for text in data.content]
    # `with` closes the cache file even if pickling fails part-way.
    with open('coups.pkl', 'wb') as f:
        pickle.dump(coups, f)
# Step 3. TF-IDF features for the whole tokenised corpus
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# Term counts first (min_df = 0.015 is passed straight to CountVectorizer),
# then the TF-IDF re-weighting of those counts.
count_vectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
tfidf_transformer = TfidfTransformer()
count_vector = count_vectorizer.fit_transform(coups)
tfidf = tfidf_transformer.fit_transform(count_vector)

# 4) Train/test split
# Label is 1 when the source field mentions '新华社' (Xinhua News Agency), else 0.
lable = [1 if '新华社' in str(source) else 0 for source in data.source]
# NOTE(review): toarray() materialises the full dense matrix -- confirm the
# memory cost is acceptable for the size of this corpus.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf.toarray(),
    lable,
    test_size=0.25,
    random_state=33,
)

# Step 4. Predict whether an article's style matches Xinhua's
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(x_train, y_train)

# Predict the held-out test split and report accuracy, precision and recall.
y_predict = model.predict(x_test)
print('准确率：', accuracy_score(y_true=y_test, y_pred=y_predict))
print('精确率：', precision_score(y_true=y_test, y_pred=y_predict))
print('召回率：', recall_score(y_true=y_test, y_pred=y_predict))

# Run the trained classifier over the full data set (train and test rows alike).
prediction = model.predict(tfidf.toarray())
lables = np.array(lable)

# Step 5. Find likely copies: predicted label 1 but actual label 0.
# Put prediction and ground truth side by side; the frame's default integer
# index is the row position in `tfidf`.
compare_news_index = pd.DataFrame({'perdiction': prediction, 'lables': lables})

# Rows the model predicts as Xinhua although their source is not Xinhua.
copy_news_index = compare_news_index.loc[
    compare_news_index['perdiction'].eq(1) & compare_news_index['lables'].eq(0)
].index

# Rows whose source really is Xinhua.
xinhuashe_news_index = compare_news_index.loc[
    compare_news_index['lables'].eq(1)
].index

from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

# Rescale every document vector with Normalizer before clustering.
normalizer = Normalizer()
scaled_array = normalizer.fit_transform(tfidf.toarray())

# Cluster all documents into 25 groups with KMeans.
# random_state is fixed so the cluster assignment -- and therefore the
# similar-article lookup built on top of it -- is reproducible across runs
# (same seed value as the train/test split).
kmeans = KMeans(n_clusters=25, random_state=33)
k_labels = kmeans.fit_predict(scaled_array)

# After clustering, record which Xinhua articles fall into each cluster.
from collections import defaultdict

# Document row index -> cluster label.
id_class = dict(enumerate(k_labels))

# Build the membership set once: testing against
# xinhuashe_news_index.tolist() inside the loop would rebuild and linearly
# scan the list on every iteration.
xinhuashe_ids = set(xinhuashe_news_index.tolist())

# Cluster label -> set of Xinhua document indices in that cluster.
class_id = defaultdict(set)
for index, class_ in id_class.items():
    if index in xinhuashe_ids:
        class_id[class_].add(index)

# Example query for a single document.
# NOTE(review): find_similar_text is defined in a later cell of this notebook;
# on a fresh kernel that cell must be run before this one.
cpindex = 3352
print('是否在新华社',cpindex in xinhuashe_news_index)
print('是否在copy_news', cpindex in copy_news_index)
similar_list = find_similar_text(cpindex)
print(similar_list)

准确率： 0.8869693071126631
精确率： 0.9618423174074273
召回率： 0.9109278770295719
是否在新华社 False
是否在copy_news True
[(3134, array([[0.96849134]])), (63511, array([[0.94643198]])), (29441, array([[0.94283416]])), (3218, array([[0.87621892]])), (29615, array([[0.86936328]])), (29888, array([[0.86215862]])), (64046, array([[0.85278235]])), (29777, array([[0.84875422]])), (63974, array([[0.73415212]])), (63975, array([[0.73415212]]))]


In [37]:
import jieba
def split_text(text):
    """Segment ``text`` with jieba and drop stop words.

    Spaces and newline characters are removed from ``text`` first. The tokens
    produced by ``jieba.cut`` that are not in the module-level ``stopWords``
    collection are then joined with single spaces.

    Args:
        text: The string to segment.

    Returns:
        A string of the remaining tokens separated by single spaces.
    """
    text = text.replace(' ', '').replace('\n', '')
    # Build a set once per call so each token check is a hash lookup instead
    # of a linear scan over the stop-word list.
    stop_words = set(stopWords)
    return ' '.join(w for w in jieba.cut(text) if w not in stop_words)

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
# 查找相似的文章
def find_similar_text(cpindex, top=10):
    """Return the Xinhua articles most similar to document ``cpindex``.

    Candidates are the indices stored in ``class_id`` for the cluster that
    ``id_class`` assigns to ``cpindex``, i.e. only Xinhua articles in the same
    cluster. Similarity is the cosine similarity between rows of ``tfidf``.

    Args:
        cpindex: Row index of the query document in ``tfidf``.
        top: Maximum number of results to return.

    Returns:
        A list of ``(index, similarity)`` tuples sorted by similarity in
        descending order, where ``similarity`` is a 1x1 array. The list is
        empty when the cluster holds no Xinhua article.
    """
    candidates = list(class_id[id_class[cpindex]])
    if not candidates:
        # cosine_similarity rejects an empty candidate matrix.
        return []
    # One batched call instead of one cosine_similarity call per candidate.
    sims = cosine_similarity(tfidf[cpindex], tfidf[candidates])
    # Keep each score as a 1x1 array so the return shape matches what callers
    # already receive.
    scored = [(idx, sims[:, col:col + 1]) for col, idx in enumerate(candidates)]
    # Sort from most to least similar.
    return sorted(scored, key=lambda item: item[1][0, 0], reverse=True)[:top]

In [5]:
def editdistance(str1, str2):

SyntaxError: invalid syntax (<ipython-input-5-0a8cf7b99133>, line 1)