In [1]:
import numpy as np
import pandas as pd
import math
import pyecharts
import re
import nltk
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Pie
from pyecharts.charts import WordCloud
from pyecharts.charts import Graph
from pyecharts.charts import ThemeRiver
from pyecharts.charts import HeatMap
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType
from pyecharts.globals import RenderType
from pyecharts.globals import CurrentConfig, OnlineHostType
# OnlineHostType.NOTEBOOK_HOST 默认值为 http://localhost:8888/nbextensions/assets/
CurrentConfig.ONLINE_HOST = OnlineHostType.NOTEBOOK_HOST


## 读取数据

In [115]:
# df中存放dataframe格式的文件
df = pd.read_csv('total.csv')
df.drop_duplicates(subset=['paperid'],keep='first',inplace=True)

In [5]:
df

Unnamed: 0,tag,paperid,title,author,publishdate,pagecount,abstract,keyword,sourceagency,subject,documenttype,issuenumber,supplementalnotes,contractnumber,coauthor
0,1.0,AAECE260,Behaviour of High Purity Semiconductor Surface...,Australian Atomic Energy Commission Research E...,1973.0,17.0,"For abstract, see NSA 28 11, number 27462.",Surface barrier detectors\n ...,Atomic Energy Commission,77E - Nuclear Instrumentation,Technical Report,197404.0,,,Australian Atomic Energy Commission Research E...
1,1.0,AAECE297,Semiconductor X-Ray Spectrometer System Type 454.,Australian Atomic Energy Commission Research E...,1973.0,35.0,"For abstract, see NSA 29 08, number 18470.",Si semiconductor detectors\n ...,Atomic Energy Commission,77E - Nuclear Instrumentation,Technical Report,197412.0,,,Australian Atomic Energy Commission Research E...
2,1.0,ACRH1000314,Aspects of Imaging and Counting in Nuclear Med...,"Chicago Univ., Ill. Dept. Of Radiology. Argonn...",1972.0,24.0,"For abstract, see NSA 26 13, number 31048.",Gamma cameras\n ...,,77E - Nuclear Instrumentation,Conference Proceedings,197215.0,From Scintillation And Semiconductor Counter S...,AT(11-1)-69,"Chicago Univ., Ill. Dept. Of Radiology. Argonn..."
3,1.0,AD1003457,Ion Trap in a Semiconductor Chip.,University of Michigan Ann Arbor United States,2006.0,4.0,The electromagnetic manipulation of isolated a...,Laser cooling\n ...,Non Paid ADAS,46 - Physics,Technical Report,201709.0,"Nature Physics , 2, 01 Jan 0001, 01 Jan 0001,",,University of Michigan Ann Arbor United States
4,1.0,AD1005447,Electron Transport and Dephasing in Semiconduc...,"Stanford Univ., CA.",1999.0,140.0,"At low temperatures, electrons in semiconducto...",Electron transport\n ...,Non Paid ADAS,97I - Electric Power Production,Technical Report,201715.0,,,"Stanford Univ., CA."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8223,,AD1122786,Charge Transfer States at Donor-Acceptor Heter...,The Regents of the University of Michigan Ann ...,2020.0,21.0,The generation of photocurrent at organic dono...,Quantum efficiency\n ...,Non Paid ADAS,46 - Physics\n ...,,202126.0,,,The Regents of the University of Michigan Ann ...
8225,,N205005959,Integrating Graphene into Polymer Derived Cera...,"Kennedy Space Center Merritt Island, Florida, ...",2020.0,18.0,No abstract available.,Nstrf\n \n ...,National Aeronautics and Space Administration,71 - Materials Sciences,,202117.0,Text in English. Presented at AMPB NIFS Virtua...,,"Kennedy Space Center Merritt Island, Florida, ..."
8226,,N205007433,"Urban Air Mobility Noise: Current Practice, Ga...","Langley Research Center Hampton, Virginia, Uni...",2020.0,59.0,"In 2018, NASA formed an Urban Air Mobility Noi...",Advanced air mobility\n ...,National Aeronautics and Space Administration,46A - Acoustics,,202116.0,Text in English.,,
8227,,N205010373,Material and Electrical Characterization of Oh...,"HX5, LLC; Glenn Research Center Cleveland, Ohi...",2020.0,10.0,No abstract available.,High temperature\n ...,National Aeronautics and Space Administration,49 - Electrotechnology\n ...,,202116.0,Text in English. Presented at 2020 Virtual MRS...,,"HX5, LLC; Glenn Research Center Cleveland, Ohi..."


## 报告来源方统计(source agency)

In [6]:
source_agency = df['sourceagency'].value_counts()
# 取排名前10的数据
source_agency = source_agency.head(15)
source_agency = pd.DataFrame({'name':source_agency.index,'count':source_agency.values})
source_agency

Unnamed: 0,name,count
0,Non Paid ADAS,1638
1,Technical Information Center Oak Ridge Tennessee,977
2,Army,567
3,Invalid Source Agency Code,499
4,Air Force,476
5,National Aeronautics and Space Administration,403
6,International Nuclear Information System,299
7,National Institute of Standards and Technology,226
8,NASA Foreign Exchange Program,211
9,Non Paid Reprints,187


In [14]:
# 定义将dataframe转换成list函数
def df2list(df):
    columns = df.columns
    res = []
    for c in columns:
        d = df[c].values.tolist()
        res.append(d)
    return res

In [5]:
# 将属性转换成list方便作图
sa_list = df2list(source_agency)
sa_name = sa_list[0]
sa_count = sa_list[1]
# 绘制直方图
bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT,width='2100px'))
    .add_xaxis(sa_name)
    .add_yaxis("机构",sa_count,category_gap="50%")
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=8)),
                     title_opts=opts.TitleOpts(title="科技报告中前10项目发布者数量分布"),
                     legend_opts=opts.LegendOpts(is_show=False),)

    
)
bar.render_notebook()

## 报告合作方统计(Corporate Authors)

In [7]:
co_author = []
wordlist = [str(w) for w in df['coauthor']]
for i in wordlist:
    for word in i.split(';'):
        co_author.append(word)
     
fdist = nltk.FreqDist(co_author)
fdist
ca_name = []
ca_count = []
for (a,b) in fdist.most_common(15):
    ca_name.append(a)
    ca_count.append(b)

In [8]:
ca_name

[' Department of Energy, Washington, DC.',
 ' National Aeronautics and Space Administration, Washington, DC.',
 ' Army Research Office, Research Triangle Park, NC.',
 'Sandia National Labs., Albuquerque, NM.',
 ' Air Force Office of Scientific Research, Bolling AFB, DC.',
 'International Centre for Theoretical Physics, Trieste (Italy).',
 ' Defense Advanced Research Projects Agency, Arlington, VA.',
 'Massachusetts Inst. of Tech., Lexington. Lincoln Lab.',
 'Department of the Navy, Washington, DC.',
 ' Electronic Systems Div., Hanscom AFB, MA.',
 'Oak Ridge National Lab., TN.',
 'IBM Thomas J. Watson Research Center, Yorktown Heights, NY.',
 'nan',
 'Lawrence Berkeley Lab., CA.',
 ' Department of Energy.']

In [10]:
ca_count

[893, 359, 340, 237, 213, 104, 78, 77, 77, 62, 61, 54, 53, 53, 48]

In [9]:
# 绘制直方图
bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(ca_name)
    .add_yaxis("机构",ca_count,category_gap="50%")
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=6)),
                     title_opts=opts.TitleOpts(title="科技报告中前20项目承担者数量分布"))

    
)
bar.render_notebook()

## 报告合同号统计(Contract Number)

In [15]:
contract_number = df['contractnumber'].value_counts()
# 取排名前20的数据
contract_number = contract_number.head(10)
contract_number = pd.DataFrame({'name':contract_number.index,'count':contract_number.values})
contract_number

Unnamed: 0,name,count
0,AC04-76DP00789,146
1,AC04-94AL85000,126
2,AC03-76SF00098,88
3,W-7405-ENG-48,70
4,AC02-77CH00178,34
5,AC05-84OR21400,34
6,F19628-80-C-0002,31
7,AC02-83CH10093,29
8,W-7405-ENG-36,26
9,DAAL03-86-K-0173,20


In [16]:
cn_list = df2list(contract_number)
cn_pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT,height='800px',width='950px'))
    .add(series_name="合同号",
         data_pair=[list(z) for z in zip(cn_list[0], cn_list[1])])
    .set_global_opts(title_opts=opts.TitleOpts(title="合同号统计",pos_top=40))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
cn_pie.render_notebook()

In [17]:
cn_pie.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 年份分析

In [18]:
# 将dataframe中的年份数据转换成整型
publish_date = df['publishdate']
year_dist = {}
year_list=[]
for i in publish_date:
    if i == i:
        year_list.append(int(i))
    else:
        year_list.append(0)
year_list
year_dist = nltk.FreqDist(year_list)
del year_dist[0]

In [19]:
# 按年份排序
lsKV=[(k,v) for k,v in year_dist.items()]
lsKV.sort()
year_name = []
year_count = []
year_dist = dict(lsKV)
for (a,b) in year_dist.items():
    year_name.append(str(a))
    year_count.append(b)

In [20]:
year_line = (
    Line()
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=True),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    .add_xaxis(xaxis_data=year_name)
    .add_yaxis(
        series_name="",
        y_axis=year_count,
        symbol="emptyCircle",
        is_symbol_show=True,
        is_smooth=True,
        label_opts=opts.LabelOpts(is_show=False),
    )
)
year_line.render_notebook()

## 报告学科类别统计(NTIS Subject Category)

In [35]:
# 定义抽取领域字段并进行频次统计的函数
def field_extraction(df,field,flag=1):
    subject = []
    wordlist = [str(w) for w in df[field]]
    # 使用正则表达式提取相关词
    if flag == 1: 
        for i in wordlist:
            for word in re.findall(r'[A-z &,]+',i):
                word = word.replace('nan','')
                word = word.replace('                                        ','')
                subject.append(word)
    elif flag ==0:
        for i in wordlist:
            for word in re.findall(r'[\w &--()/]+',i):
                word = word.replace('                                        ','')
                word = word.replace('nan','')
                subject.append(word)        
    # 调用nltk词频统计函数
    fdist = nltk.FreqDist(subject)
    # 清洗数据
    for i in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
        del fdist[i+'-']
        del fdist[i+' ']
        del fdist[' '+i]
        del fdist[i]
        del fdist[i.lower()]
    del fdist['                                        ']
    del fdist['                                    ']
    del fdist[' ']
    del fdist['']
    del fdist['of']
    # 返回频率字典
    return fdist


In [22]:
subject = field_extraction(df,'subject')
# 取排名前15的数据
subject_name = []
subject_count = []
for (a,b) in subject.most_common(15):
    subject_name.append(a)
    subject_count.append(b)

In [28]:
subject_name

[' Solid State Physics',
 ' Semiconductor Devices',
 ' Optics & Lasers',
 ' Physical & Theoretical Chemistry',
 ' Optoelectronic Devices & Systems',
 ' Electrotechnology',
 ' Nuclear Instrumentation',
 ' Resistive, Capacitive, & Inductive Components',
 ' Solar Energy',
 ' Physics',
 ' Manufacturing Processes & Materials Handling',
 ' Circuits',
 ' Basic & Synthetic Chemistry',
 ' Computer Hardware',
 ' Laboratory & Test Facility Design & Operation']

In [29]:
subject_count

[2544, 1523, 679, 666, 500, 386, 270, 254, 209, 198, 154, 137, 128, 89, 83]

In [26]:
# worfcloud
subject_cloud = (
    WordCloud(init_opts=opts.InitOpts(theme=ThemeType.ESSOS,height='900px'))
    .add(series_name="学科分析", data_pair=subject.items(), word_size_range=[20,50])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="学科分析", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
)
subject_cloud.render_notebook()

In [27]:
subject_cloud.render()

'D:\\黄哲远\\juypter notebook\\render.html'

In [22]:
# line chart
background_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#224CB5'}, {offset: 1, color: '#72B9C7'}], false)"
)
area_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#B059C7'}, {offset: 1, color: '#3fbbff0d'}], false)"
)
c = (
    Line(init_opts=opts.InitOpts(bg_color=JsCode(background_color_js)))
    .add_xaxis(xaxis_data=['固体物理学','半导体器件','光学与激光','物理与理论化学','光电子器件和系统','电气技术','电阻性、电容性和感应性元件','核仪器','太阳能','物理学'])
    .add_yaxis(
        series_name="学科",
        y_axis=[3254,1884,831,823,601,475,297,288,263,222],
        is_smooth=True,
        is_symbol_show=True,
        symbol="circle",
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        label_opts=opts.LabelOpts(is_show=True, position="top", color="white"),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
        areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=1),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            pos_bottom="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=12),
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            boundary_gap=False,
            axislabel_opts=opts.LabelOpts(margin=15, color="#DDEBEC",font_size=10,rotate=-15),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(
                is_show=True,
                length=25,
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            position="left",
            axislabel_opts=opts.LabelOpts(margin=20, color="#ffffff63"),
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(width=2, color="#ffffff63")
            ),
            axistick_opts=opts.AxisTickOpts(
                is_show=True,
                length=15,
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        legend_opts=opts.LegendOpts(is_show=False),
    )
)
c.render_notebook()

In [23]:
c.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 报告关键词统计分析(Keywords)

In [36]:
keywords = field_extraction(df,'keyword',0)
# 取排名前15的数据
keywords_name = []
keywords_count = []
for (a,b) in keywords.most_common(15):
    keywords_name.append(a)
    keywords_count.append(b)

In [37]:
keywords_name

['Semiconductors',
 'Semiconductor devices',
 'Silicon',
 'Gallium arsenides',
 'Reprints',
 'Foreign technology',
 'Semiconductor lasers',
 'Semiconductor Materials',
 'Epitaxial growth',
 'Substrates',
 'Fabrication',
 'Thin films',
 'Electrical properties',
 'Optical properties',
 'Integrated circuits']

In [38]:
keywords_count

[2110, 1136, 990, 945, 892, 806, 436, 397, 388, 388, 387, 373, 368, 366, 317]

In [39]:
# worfcloud
keyword_cloud = (
    WordCloud(init_opts=opts.InitOpts(theme=ThemeType.ESSOS,height='900px'))
    .add(series_name="关键词分析", data_pair=keywords.items(), word_size_range=[10, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="关键词分析", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
)
keyword_cloud.render_notebook()

In [40]:
keyword_cloud.render()

'D:\\黄哲远\\juypter notebook\\render.html'

In [29]:
# line chart
background_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#c86589'}, {offset: 1, color: '#06a7ff'}], false)"
)
area_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#eb64fb'}, {offset: 1, color: '#3fbbff0d'}], false)"
)
c = (
    Line(init_opts=opts.InitOpts(bg_color=JsCode(background_color_js)))
    .add_xaxis(xaxis_data=["半导体",'半导体器件','硅','砷化镓','外国技术','半导体激光器','基板','磊晶成长','薄膜','半导体材料'])
    .add_yaxis(
        series_name="关键词",
        y_axis=[2663,1339,1206,1181,966,529,503,502,501,493],
        is_smooth=True,
        is_symbol_show=True,
        symbol="circle",
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        label_opts=opts.LabelOpts(is_show=True, position="top", color="white"),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
        areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=1),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            pos_bottom="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=12),
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            boundary_gap=False,
            axislabel_opts=opts.LabelOpts(margin=15, color="#DDEBEC",font_size=12,rotate=-15),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(
                is_show=True,
                length=25,
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            position="left",
            axislabel_opts=opts.LabelOpts(margin=20, color="#ffffff63"),
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(width=2, color="#ffffff63")
            ),
            axistick_opts=opts.AxisTickOpts(
                is_show=True,
                length=15,
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        legend_opts=opts.LegendOpts(is_show=False),
    )
)
c.render_notebook()

In [30]:
c.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 新兴关键词

In [41]:
keyword_2010 = {}
keyword_1970 = {}
keyword_data = df[['keyword','publishdate']]
for line in keyword_data.iterrows():
    # 使用正则表达式抽取关键词
    for word in re.findall(r'[\w &--()/]+',str(line[1]['keyword'])):
        word = word.replace('                                        ','')
        word = word.replace('                                    ','')
    
        if line[1]['publishdate'] > 2010 and word != '':
            keyword_2010[word] = keyword_2010.get(word,0) + 1
        elif line[1]['publishdate'] < 2010 and word != '':
            keyword_1970[word] = keyword_1970.get(word,0) + 1
            

for key in keyword_2010.keys():
    keyword_2010[key] -= keyword_1970.get(key,0) 
          
dic1SortList = sorted( keyword_2010.items(),key = lambda x:x[1],reverse = True)


In [42]:
dic1SortList

[('Graphene', 12),
 ('Simulations', 6),
 ('Power electronics', 6),
 ('SEMICONDUCTORS', 6),
 ('Fermi levels', 5),
 ('Band structures', 4),
 ('Metamaterials', 4),
 ('Nanomaterials', 4),
 ('Materials laboratories', 4),
 ('Materials processing', 4),
 ('Electronics laboratories', 4),
 ('ELECTROOPTICS', 4),
 ('Metal-semiconductor junctions', 3),
 ('Carbon nanotubes', 3),
 ('Crystal lattice vibrations', 3),
 ('Silicon carbide', 3),
 ('Probability distributions', 3),
 ('Degenerate semiconductors', 3),
 ('Ionized impurity scattering', 3),
 ('Non-parabolic conduction bands', 3),
 ('Two-dimensional materials', 3),
 ('Quantum information science', 3),
 ('Energy harvesting', 3),
 ('Modules (electronics)', 3),
 ('Eoard(European office of aerospace research and development)', 3),
 ('OPTOELECTRONICS', 3),
 ('PE61102F', 3),
 ('GALLIUM ARSENIDES', 3),
 ('Infringement', 3),
 ('Nanocomposites', 2),
 ('Optical cooling', 2),
 ('Ultrawide-Bandgap Semiconductors', 2),
 ('Advanced materials', 2),
 ('Hydrogen b

In [43]:
y_data = [ "Graphene", "Nanostructures", "Quantum dots", "Nanocrystals","Nanotechnology","Nanoparticles"]
keyword_year = {}
keyword_data = df[['keyword','publishdate']]
for line in keyword_data.iterrows():
    # 使用正则表达式抽取关键词
    for word in re.findall(r'[\w &--()/]+',str(line[1]['keyword'])):
        word = word.replace('                                        ','')
        word = word.replace('                                    ','')
    
        for i in y_data:
            if word == i:
                # 构建关键词和年份的关联字典
                word = word + ',' + str(line[1]['publishdate'])[:4]
                keyword_year[word] = keyword_year.get(word,0) + 1
ky_list = []
for key, value in keyword_year.items():
    A = key.split(',')[1]
    B = key.split(',')[0]
    ky_list.append([A,value,B])

In [34]:
y_data

['Graphene',
 'Nanostructures',
 'Quantum dots',
 'Nanocrystals',
 'Nanotechnology',
 'Nanoparticles']

In [44]:
# 主题河流图
river = (
    ThemeRiver(init_opts=opts.InitOpts(width="1600px",theme=ThemeType.LIGHT))
    .add(
        series_name=y_data,
        data=ky_list,
        singleaxis_opts=opts.SingleAxisOpts(
            pos_top="50", pos_bottom="50", type_="time"
        ),
    )
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),


    )
)
river.render_notebook()

In [45]:
river.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 关键词-时间

In [46]:
y_data = ["Semiconductors", "Semiconductor devices", "Silicon", "Gallium arsenides", "Foreign technology","Semiconductor lasers"]
keyword_year = {}
keyword_data = df[['keyword','publishdate']]
for line in keyword_data.iterrows():
    # 使用正则表达式抽取关键词
    for word in re.findall(r'[\w &---()/]+',str(line[1]['keyword'])):
        word = word.replace('                                        ','')
        word = word.replace('                                    ','')
    
        for i in y_data:
            if word == i:
                # 构建关键词和年份的关联字典
                word = word + ',' + str(line[1]['publishdate'])[:4]
                keyword_year[word] = keyword_year.get(word,0) + 1
ky_list = []
for key, value in keyword_year.items():
    A = key.split(',')[1]
    B = key.split(',')[0]
    ky_list.append([A,value,B])

In [38]:
ky_list

[['1973', 48, 'Silicon'],
 ['2006', 35, 'Semiconductors'],
 ['1999', 49, 'Semiconductors'],
 ['2016', 12, 'Semiconductors'],
 ['2016', 2, 'Silicon'],
 ['2013', 2, 'Silicon'],
 ['2016', 3, 'Semiconductor devices'],
 ['2015', 2, 'Semiconductors'],
 ['2017', 1, 'Semiconductor devices'],
 ['2018', 2, 'Semiconductors'],
 ['2018', 1, 'Semiconductor lasers'],
 ['2017', 3, 'Semiconductors'],
 ['2018', 3, 'Semiconductor devices'],
 ['2019', 23, 'Semiconductors'],
 ['2019', 7, 'Semiconductor devices'],
 ['2019', 2, 'Silicon'],
 ['2019', 1, 'Semiconductor lasers'],
 ['2013', 12, 'Semiconductors'],
 ['2020', 7, 'Semiconductors'],
 ['2020', 3, 'Semiconductor devices'],
 ['2021', 3, 'Semiconductors'],
 ['2021', 3, 'Semiconductor devices'],
 ['1999', 21, 'Semiconductor devices'],
 ['1972', 123, 'Semiconductors'],
 ['1972', 57, 'Semiconductor devices'],
 ['1972', 74, 'Silicon'],
 ['1970', 65, 'Semiconductor devices'],
 ['1971', 64, 'Semiconductor devices'],
 ['1972', 30, 'Gallium arsenides'],
 ['1973'

In [47]:
# 主题河流图
river = (
    ThemeRiver(init_opts=opts.InitOpts(width="1600px",theme=ThemeType.LIGHT))
    .add(
        series_name=y_data,
        data=ky_list,
        singleaxis_opts=opts.SingleAxisOpts(
            pos_top="50", pos_bottom="50", type_="time"
        ),
    )
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),


    )
)
river.render_notebook()

In [48]:
river.render()

'D:\\黄哲远\\juypter notebook\\render.html'

In [49]:
y_data = ["Semiconductors", "Semiconductor devices",  "Silicon", "Gallium arsenides", "Foreign technology","Semiconductor lasers","Substrates","Epitaxial growth","Thin films"]
# 构造热力图数据
keyword_year = {}
keyword_data = df[['keyword','publishdate']]
for line in keyword_data.iterrows():
    # 使用正则表达式抽取关键词
    for word in re.findall(r'[\w &---()/]+',str(line[1]['keyword'])):
        word = word.replace('                                        ','')
        word = word.replace('                                    ','')
    
        for i in y_data:
            if word == i:
                # 构建关键词和年份的关联字典
                word = word + ',' + str(line[1]['publishdate'])[:4]
                keyword_year[word] = keyword_year.get(word,0) + 1
year_list = []
for i in range(1970,2022):
    year_list.append(i)
y_dict = {"Semiconductors":0,"Semiconductor devices":1,"Silicon":2,"Gallium arsenides":3,"Foreign technology":4,"Semiconductor lasers":5,"Substrates":6,"Epitaxial growth":7,"Thin films":8}
data = []
for key, value in keyword_year.items():
    A = key.split(',')[1]
    B = key.split(',')[0]
    data.append([int(A)-1970,y_dict[B],value])

In [50]:
# 热力图
heatmap = (
    HeatMap(init_opts=opts.InitOpts( height="720px"))
    .add_xaxis(xaxis_data=year_list)
    .add_yaxis(
        series_name="Punch Card",
        yaxis_data=["Semiconductors", "Semiconductor devices",  "Silicon", "Gallium arsenides", "Foreign technology","Semiconductor lasers","Substrates","Epitaxial growth","Thin films"],
        value=data,
        label_opts=opts.LabelOpts(
            is_show=False, color="#fff", position="bottom", horizontal_align="50%"
        ),
    )
    .set_series_opts()
    .set_global_opts(
        legend_opts=opts.LegendOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="category",
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            ),
        ),
        visualmap_opts=opts.VisualMapOpts(
            min_=0, max_=130, is_calculable=True, orient="horizontal", pos_left="center"
        ),
    )
)
heatmap.render_notebook()

In [51]:
heatmap.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 关系图

In [116]:
# 构建关系网络中的节点和链接数据
keyword_dict ={}
keyword_group ={}
keyword_data = df['keyword']

# 第一层循环 以一行为单位
for line in keyword_data:
    # 定义每行的词列表
    keyword_co1 = []
    # 第二层循环 以每个词为单位
    for word in re.findall(r'[\w &--()/]+',str(line)):
        word = word.replace('                                        ','')
        word = word.replace('                                    ','')
        if word != '':
            # 构建单个词的频率字典
            keyword_dict[word] = keyword_dict.get(word,0) + 1
            keyword_co1.append(word)
    # 统计共现
    keyword_co2 = keyword_co1
    for word in keyword_co1:
        for k in keyword_co2:
            A, B = word, k
            if A > B:
                A, B = B, A  # 保持关键词顺序一致
            if A == B:
                continue
            word_group = A + ',' + B  # 将两个关键词合并起来，以逗号隔开
            if word_group not in keyword_group:
                keyword_group[word_group] = 1
            else:
                keyword_group[word_group] += 1

  for word in re.findall(r'[\w &--()/]+',str(line)):


In [153]:
# 构建画图所需的node link category
node_data = []
link_data = []
for key, value in keyword_dict.items():
    if value > 1500:
        node_data.append(opts.GraphNode(name=key, symbol_size=value/20, category=0, value=value))
    if value > 750 and value <= 1500:
        node_data.append(opts.GraphNode(name=key, symbol_size=value/20, category=1, value=value))
    if value > 350 and value <= 750:
        node_data.append(opts.GraphNode(name=key, symbol_size=value/20, category=2, value=value))
    if value > 250 and value <= 350:
        node_data.append(opts.GraphNode(name=key, symbol_size=value/20, category=3, value=value))
    if value > 200 and value <= 250:
        node_data.append(opts.GraphNode(name=key, symbol_size=value/20, category=4, value=value))
        
for key, value in keyword_group.items():
    A = key.split(',')[0]
    B = key.split(',')[1]
    if value > 150:
        link_data.append(opts.GraphLink(source=A, target=B, value=value))   
categories=[
    {"name":"1500+"},
    {"name":"750~1500"},
    {"name":"350~750"},
    {"name":"250~350"},
    {"name":"200~250"},
]

In [118]:
# 作关系网(环形)
network1 = (
    Graph(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add("", node_data, link_data, categories, repulsion=8000,
        layout="circular",
        is_rotate_label=True,
        linestyle_opts=opts.LineStyleOpts(color="source", curve=0.3),
        label_opts=opts.LabelOpts(position="right"),)
    .set_global_opts(title_opts=opts.TitleOpts(title="Keyword Network"))
)
network1.render_notebook()

In [64]:
network1.render()

'D:\\黄哲远\\juypter notebook\\render.html'

In [154]:
# 作关系网
network2 = (
    Graph(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add("", node_data, link_data, categories, repulsion=8000,
        is_rotate_label=True,
        linestyle_opts=opts.LineStyleOpts(color="source", curve=0.3),
        label_opts=opts.LabelOpts(position="right"),)
    .set_global_opts(title_opts=opts.TitleOpts(title="Keyword Network"))
)
network2.render_notebook()

In [49]:
network2.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 实体识别——热力图

In [2]:
# data中存放dataframe格式的文件
df_ner = pd.read_csv('name_entity.csv')

In [27]:
# 定义热力图数据构造函数
def heatmap_data(df,ner_type):
    # 统计高频词
    type_dict = {}
    for line in df[['name_entity','type','publishdate']].iterrows():
        if line[1]['type'] == ner_type:
            type_dict[line[1]['name_entity']] = type_dict.get(line[1]['name_entity'],0) + 1
    
    y_data = []
    # 对统计结果进行频次排序        
    type_rank = sorted( type_dict.items(),key = lambda x:x[1],reverse = True)
    for i in type_rank[:6]:
        y_data.append(i[0])
    
    print(y_data)
    # 定义实体和时间关联的字典
    ner_year = {}
    # 构造实体与时间关联
    for line in df[['name_entity','type','publishdate']].iterrows():
        if line[1]['type'] == ner_type:
            # 匹配高频词表
            for i in y_data:
                if i == line[1]['name_entity']:
                    word = line[1]['name_entity'] + ','+ str(line[1]['publishdate'])[:4]
                    ner_year[word] = ner_year.get(word,0) + 1
    
    # 构造热力图笛卡尔坐标map
    y_dict = {}
    count = 0
    for word in y_data:
        y_dict[word] = count
        count += 1
        
    data = []
    for key, value in ner_year.items():
        A = key.split(',')[1]
        B = key.split(',')[0]
        data.append([int(A)-1970,y_dict[B],value])
        
    return data, y_data                                       

In [28]:
data_heat,y_data = heatmap_data(df_ner,'technology')

['MBE', 'molecular beam epitaxy', 'Atomindex', 'MOCVD', 'STM', 'TEM']


In [18]:
year_list = []
for i in range(1970,2022):
    year_list.append(i)

In [19]:
# 热力图
heatmap = (
    HeatMap(init_opts=opts.InitOpts( height="720px"))
    .add_xaxis(xaxis_data=year_list)
    .add_yaxis(
        series_name="Punch Card",
        yaxis_data=y_data,
        value=data_heat,
        label_opts=opts.LabelOpts(
            is_show=False, color="#fff", position="bottom", horizontal_align="50%"
        ),
    )
    .set_series_opts()
    .set_global_opts(
        legend_opts=opts.LegendOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="category",
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            ),
        ),
        visualmap_opts=opts.VisualMapOpts(
            min_=0, max_=10, is_calculable=True, orient="horizontal", pos_left="center"
        ),
    )
)
heatmap.render_notebook()

In [15]:
heatmap.render()

'D:\\黄哲远\\juypter notebook\\render.html'

## 实体识别——关系网络

In [11]:
entity_dict = {}
entity_group ={}
entity_co = []
id_tag0 = ''
id_tag1 = ''
for line in df_ner.iterrows():
    # 统计单个词出现的概率
    if line[1]['type'] != 'experiment' and line[1]['type'] != 'organization' and line[1]['type'] != 'structure':
        entity_dict[line[1]['name_entity']] = {'freq':entity_dict.get(line[1]['name_entity'],{'freq':0,'type':line[1]['type']}).get('freq') + 1,'type':line[1]['type']}
    id_tag0 = id_tag1
    id_tag1 = line[1]['artical_id']    
    # 统计同一篇报告的实体共现
    if id_tag0 != id_tag1 and id_tag0 != '':
        co_replica = entity_co
        for word in entity_co:
            for k in co_replica:
                A, B = word, k
                if A > B:
                    A, B = B, A  # 保持实体顺序一致
                if A == B:
                    continue
                word_group = A + ',' + B  # 将两个实体合并起来，以逗号隔开
                if word_group not in entity_group:
                    entity_group[word_group] = 1
                else:
                    entity_group[word_group] += 1
        entity_co = []
    entity_co.append(str(line[1]['name_entity']))                

In [12]:
# 构建画图所需的node link category
category_index = {'application': 0,'chemical':1,'physical':2,'component':3, 'material':4, 'technology':5}
node_data = []
link_data = []
for key, value in entity_dict.items():
    if value.get('freq') > 65:
        node_data.append(opts.GraphNode(name=key, symbol_size=value.get('freq')/10, category=category_index[value.get('type')], value=value.get('freq')))
        
for key, value in entity_group.items():
    A = key.split(',')[0]
    B = key.split(',')[1]
    if value > 20:
        link_data.append(opts.GraphLink(source=A, target=B, value=value))   
categories=[
    {"name":"application"},
    {"name":"chemical"},
    {"name":"physical"},
    {"name":"component"},
    {"name":"material"},
    {"name":"technology"}
]

In [13]:
# 作关系网(环形)
network3 = (
    Graph(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add("", node_data, link_data, categories, repulsion=18000,
        layout='circular',
        is_rotate_label=True,
        linestyle_opts=opts.LineStyleOpts(color="source", curve=0.3),
        label_opts=opts.LabelOpts(position="right"),)
    .set_global_opts(title_opts=opts.TitleOpts(title=""))
)
network3.render_notebook()