In [1]:
import pymongo

In [2]:
# MongoDB数据库配置
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['elastic']
collection = db['juejin']

In [3]:
# 分析文章的长度分布
all_items = collection.find({})
all_content_length = [len(item['content']) for item in all_items]
print(all_items.count())
print(all_content_length[:50])

2996
[3122, 1804, 1961, 2857, 2485, 0, 220, 5235, 3927, 6569, 2746, 6083, 4884, 2834, 1603, 550, 2927, 6132, 2614, 0, 0, 3273, 13271, 3928, 3893, 9318, 1096, 13271, 13271, 2425, 1530, 668, 1945, 1802, 6733, 1695, 5357, 0, 68444, 11914, 19613, 5662, 6574, 7921, 2942, 7443, 3475, 9702, 365, 4719]


In [4]:
content_length_distribution = {
    'large': 0,
    'middle': 0,
    'small': 0,
    'x-small': 0,
}
for item_content_length in all_content_length:
    if item_content_length >= 10000:
        content_length_distribution['large'] += 1 
    elif item_content_length >= 5000 and item_content_length < 10000:
        content_length_distribution['middle'] += 1 
    elif item_content_length >= 1000 and item_content_length < 5000:
        content_length_distribution['small'] += 1 
    else:
        content_length_distribution['x-small'] += 1
print(content_length_distribution)

{'small': 1707, 'middle': 440, 'large': 175, 'x-small': 674}


In [5]:
from pyecharts import Pie
pie = Pie("掘金文章长度分布图", "单位为：（字）")
pie.add("字数", ["x-small", "large", "middle", "small"], [674,175,440,1707],
        is_more_utils=True)
pie

In [6]:
from pyecharts import Bar
# 字数在1000 - 5000 的文章字数分布
small_content_length_distribution = {
    '1000-2000': 0 ,
    '2000-3000': 0 ,
    '3000-4000': 0 ,
    '4000-5000': 0 
}
for item_content_length in all_content_length:
    if item_content_length >= 1000 and item_content_length < 2000:
        small_content_length_distribution['1000-2000'] += 1 
    elif item_content_length >= 2000 and item_content_length < 3000:
        small_content_length_distribution['2000-3000'] += 1 
    elif item_content_length >= 3000 and item_content_length < 4000:
        small_content_length_distribution['3000-4000'] += 1 
    elif item_content_length >= 4000 and item_content_length < 5000:
        small_content_length_distribution['4000-5000'] += 1
print(small_content_length_distribution)

{'3000-4000': 347, '4000-5000': 238, '2000-3000': 495, '1000-2000': 627}


In [7]:
bar = Bar('字数在1000 - 5000 的文章字数分布',"单位为：（字）")
labels = [label for label in small_content_length_distribution.keys()]
values = [value for value in small_content_length_distribution.values()]
# print(labels)
bar.add('字数',labels,values,is_more_utils=True)
bar

In [4]:
# 分析所有文章的tags
all_items = collection.find({})
# 获得所有的tags
all_tags = []
for item in all_items:
    all_tags += item['tags']
print(all_tags[:20])

['JavaScript', '前端', '微信小程序', 'RxJS', '微信', 'JavaScript', '设计', '微信小程序', '前端', '微信', '微信小程序', '微信小程序', '前端', '微信', '微信小程序', '前端', '微信', '微信小程序', 'JavaScript', '前端']


In [5]:
all_tags_set = set(all_tags)
print(len(all_tags_set))
all_tags_distribution = {}
for set_item in all_tags_set:
    all_tags_distribution[set_item] = 0
# print(all_tags_distribution)
for tag_item in all_tags:
    all_tags_distribution[tag_item] += 1
# print(all_tags_distribution)

302


In [8]:
from pyecharts import WordCloud

word_cloud = WordCloud('掘金文章的标签的分布','')

labels = [label for label in all_tags_distribution.keys()]
values = [value for value in all_tags_distribution.values()]

word_cloud.add('次数',labels,values)
# word_cloud.options
word_cloud

In [9]:
# 分析文章的时间分布
all_items = collection.find({})
all_created_date = [item['created_date'].split('T')[0] for item in all_items]
print(all_created_date[:5])

['2016-12-30', '2016-09-23', '2017-12-27', '2016-11-16', '2016-09-26']


In [10]:
year_list = []
for item_create_date in all_created_date:
    year_list.append(item_create_date.split('-')[0])
year_list = sorted(year_list,key=lambda x:int(x[3]))
# print(year_list)
year_set = set(year_list)
print(year_set)
all_year_distribution = {}
for set_item in year_set:
    all_year_distribution[set_item] = 0

for list_item in year_list:
    all_year_distribution[list_item] += 1
print(all_year_distribution)

{'2017', '2018', '2016', '2015'}
{'2017': 1162, '2016': 327, '2015': 4, '2018': 1503}


In [11]:
from pyecharts import Line

line = Line('掘金文章年份分布图','')
labels = [label for label in all_year_distribution.keys()]
values = [value for value in all_year_distribution.values()]

line.add('文章数',['2015', '2016', '2017', '2018'],[ 4, 327, 1162, 1503],is_more_utils=True)
line

In [27]:
# 2017年每个月份的文章分布图
month_list = []
# all_created_date[:5]
for item_created_date in all_created_date:
    if item_created_date.split('-')[0] == '2017':
        month_list.append(int(item_created_date.split('-')[1]))
        
month_list.sort()
month_set = set(month_list)
# print(month_set)

month_distribution = {}
for set_item in month_set:
    month_distribution[set_item] = 0

for list_item in month_list:
    month_distribution[list_item] += 1
# print(month_distribution)

line = Line('掘金文章2017年每月分布图','')
labels = [label for label in month_distribution.keys()]
values = [value for value in month_distribution.values()]

line.add('文章数',labels,values,is_more_utils=True)
# line.options
line

In [88]:
# 分析点赞数，浏览量，收藏量
sort_by_views_count = collection.find().sort('views_count',pymongo.DESCENDING)

# 浏览量前五十
views_count_distribution = {}
for item in sort_by_views_count[:50]:
#     print(item['title'],item['views_count'])
    views_count_distribution[item['title']] = item['views_count']
print(views_count_distribution)

{'编写自己的代码库（javascript常用实例的实现与封装）': 16254, '未来的前端工程师': 11002, '你敢在post和get上刁难我，就别怪我装逼了': 14472, '送给前端开发者的一份新年礼物': 12775, 'B站的前端之路': 17492, '2018前端值得关注的技术': 23680, '微信小游戏跳一跳外挂辅助程序': 12887, '如何优雅地使用 Git': 11595, 'AI 系统首次实现真正自主编程，完爆初级程序员': 28363, '首个微信小程序开发教程！': 125928, '面试过阿里等互联网大公司，我知道了这些套路 | 掘金技术征文': 18862, '鹿晗关晓彤公开恋情，是如何把微博服务器搞炸的？': 21196, '某小公司RESTful、共用接口、前后端分离、接口约定的实践': 11735, '这一次，彻底弄懂 JavaScript 执行机制': 24905, '打造自己的JavaScript武器库': 12927, '100+ 超全的 web 开发工具和资源': 10838, 'JavaScript专题系列20篇正式完结！': 20346, 'iView 发布后台管理系统 iview-admin，没错，它就是你想要的': 14191, '2018 我所了解的 Vue 知识大全（一）': 11840, '[译] React、Jest、Flow 和 Immutable.js 将使用 MIT 许可证': 27578, '个人总结（css3新特性）': 12321, '教你用Python来玩微信跳一跳': 22652, '2017下半年掘金日报优质文章合集：前端篇': 16820, '[译] 2017 年比较 Angular、React、Vue 三剑客 ': 15398, '个人分享--web前端学习资源分享': 19548, '如何无痛降低 if else 面条代码复杂度': 19974, '技术胖155集前端视频教程-全部免费观看': 19691, '关于IT培训机构的个人看法': 18355, 'JS维护nginx反向代理，妈妈再也不用担心我跨域了！': 10739, '能让你开发效率翻倍的 VSCode 插件配置（上）': 11849, '手摸手，带你优雅的使用 icon': 11728, 'Vue 

In [89]:
bar = Bar('掘金文章浏览量前50分布图','')
labels = [label for label in views_count_distribution.keys()]
values = [value for value in views_count_distribution.values()]

bar.add('浏览数',labels,values,is_more_utils=True)
bar

In [91]:
# 收藏量前五十
sort_by_collection_count = collection.find().sort('collection_count',pymongo.DESCENDING)

collection_count_distribution = {}
for item in sort_by_collection_count[:50]:
#     print(item['title'],item['views_count'])
    collection_count_distribution[item['title']] = item['collection_count']
# print(collection_count_distribution)

bar = Bar('掘金文章收藏量前50分布图','')
labels = [label for label in collection_count_distribution.keys()]
values = [value for value in collection_count_distribution.values()]

bar.add('收藏数',labels,values,is_more_utils=True)
bar

In [25]:
# 评论量前五十
sort_by_comments_count = collection.find().sort('comments_count',pymongo.DESCENDING)

comments_count_distribution = {}
for item in sort_by_comments_count[:50]:
#     print(item['title'],item['views_count'])
    comments_count_distribution[item['title']] = item['comments_count']
# print(collection_count_distribution)

from pyecharts import Bar
bar = Bar('掘金文章评论量前50分布图','')
labels = [label for label in comments_count_distribution.keys()]
values = [value for value in comments_count_distribution.values()]

bar.add('评论数',labels,values,is_more_utils=True)
bar

In [16]:
import jieba

# 用结巴分词分析所有title
all_titles = [item['title'] for item in collection.find({})]
# print(all_titles[:5])

In [17]:
import re
from string import punctuation

add_punc='0123456789-/］…」__［｜＂～「『』②·☞，。、【】“”：；（）《》‘’{}？！⑦()、%^>℃：.”“^-_[]-——=擅长于的&#@￥' + punctuation

all_tokens_list = []

for line in all_titles:
    cuts = jieba.cut(line,cut_all=False)
    for cut in cuts:
        if cut not in add_punc:
            all_tokens_list.append(cut)
        
print(len(all_tokens_list))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.045 seconds.
Prefix dict has been built succesfully.


27092


In [18]:
all_tokens_set = set(all_tokens_list)
# print(all_tokens_set)
all_tokens_distribution = {}
for set_item in all_tokens_set:
    all_tokens_distribution[set_item] = 0
# print(all_tags_distribution)
for token_item in all_tokens_list:
    all_tokens_distribution[token_item] += 1
# print(all_tokens_distribution)

In [21]:
from pyecharts import WordCloud
word_cloud = WordCloud('掘金文章标题分布','')

labels = [label for label in all_tokens_distribution.keys()]
values = [value for value in all_tokens_distribution.values()]

word_cloud.add('次数',labels,values)
word_cloud

In [27]:
# 分析文章内容
all_content = [item['content'] for item in collection.find({})]

content_tokens_list = []

for line in all_content:
    cuts = jieba.cut(line,cut_all=False)
    for cut in cuts:
        if cut not in add_punc:
            content_tokens_list.append(cut)
        
print(len(content_tokens_list))

4806605


In [28]:
content_tokens_set = set(content_tokens_list)
print(len(content_tokens_set))

80081


In [29]:
content_tokens_distribution = {}
for set_item in content_tokens_set:
    content_tokens_distribution[set_item] = 0
# print(all_tags_distribution)
for token_item in content_tokens_list:
    content_tokens_distribution[token_item] += 1
# print(content_tokens_distribution)

In [46]:
new_content_tokens_distribution = sorted(content_tokens_distribution.items(),key=lambda x:x[1])
# print(content_tokens_distribution.items())
word_cloud_data = new_content_tokens_distribution[-800:-150]

In [47]:
word_cloud = WordCloud('掘金文章内容分布','')

labels = [label[0] for label in word_cloud_data]
values = [value[1] for value in word_cloud_data]

word_cloud.add('次数',labels,values)
word_cloud

In [50]:
# 将掘金的数据从mongodb中清洗后转入elasticsearch中
# 定义elasticsearch DocType
from elasticsearch_dsl import DocType,Text,Keyword,Integer,Completion,Date
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

connections.create_connection(hosts=['localhost'])

class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}

ik_analyzer = CustomAnalyzer('ik_max_word',filter=['lowercase'])

In [68]:
class JuejinType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word')
    tags = Text(analyzer='ik_max_word')
    content = Text(analyzer='ik_max_word')
    
    created_date = Date()
    url = Keyword()
    
    comments_cound = Integer()
    collection_count = Integer()
    views_count = Integer()
    
    class Meta:
        index = 'juejin'
        doc_type = 'article'
    
JuejinType.init()

In [64]:
import datetime

def date_convert(value):
    value = value.split('T')[0]
    try:
        created_date = datetime.datetime.strptime(value,'%Y/%m/%d').date()
    except Exception as e:
        created_date = datetime.datetime.now().date()
    return created_date

def tags_convert(tags):
    return ",".join(tags)

es = connections.create_connection(JuejinType._doc_type.using)


def gen_suggest(index,info_tuple):
    used_words = set()
    suggest = []
    for text,weight in info_tuple:
        if text:
            words = es.indices.analyze(index=index,body={
                'filter':['lowercase'],
                'analyzer':'ik_max_word',
                'text':text
            })
            analyzed_words = set(r["token"] for r in words["tokens"] if len(r["token"]) > 1)
            new_words = analyzed_words - used_words
        else:
            new_words = set()
        if new_words:
            suggest.append({
                'input':list(new_words),
                'weight':weight
            })
    return suggest

In [69]:
all_items = collection.find({})

for item_test in all_items:
    article = JuejinType()
    article.title = item_test['title']
    article.tags = tags_convert(item_test['tags'])
    article.content = item_test['content']
    article.created_date = date_convert(item_test['created_date'])
    article.url = item_test['url']
    article.comments_count = item_test['comments_count']
    article.collection_count = item_test['collection_count']
    article.views_count = item_test['views_count']
    article.meta.id = item_test['object_id']
    article.suggest = gen_suggest(JuejinType._doc_type.index,((article.title, 7), (article.tags, 10)))
    article.save()
    
print('done it!')

done it!
