/
xml_to_excel.py
64 lines (58 loc) · 2.01 KB
/
xml_to_excel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Copyright 2019, University of Illinois at Chicago
# This file is part of the main_finding_recognition project.
# See the ReadMe.txt for licensing information.
import os
from bs4 import BeautifulSoup
import xlrd
import xlwt
from xlwt import Workbook
import nltk
import sys
# print str(sys.argv[1])
inputxml=sys.argv[1]
# create new xlsx
Excel = Workbook(encoding='utf-8')
# sheet1 title and abstract
sheet1 = Excel.add_sheet('title_abstract')
sheet1.write(0,0,"PMID")
sheet1.write(0,1,"title")
sheet1.write(0,2,"abstract")
# sheet2 all sentences
sheet2=Excel.add_sheet('all_sentences')
sheet2.write(0,0,"PMID")
sheet2.write(0,1,"index")
sheet2.write(0,2,"ID")
sheet2.write(0,3,"sentence")
# parse .xml (first input file:pubmed_result.xml)
xml_file = inputxml
content = open(xml_file).read()
bs = BeautifulSoup(content, features="xml")
articles=bs.findAll('PubmedArticle')
# parse title and abstract, insert into xlsx
title_abstract_num=1
all_sentences_num=1
for article in articles:
PMID=article.find("PMID").text
sheet1.write(title_abstract_num,0,PMID)
title=article.find("ArticleTitle").text
sheet1.write(title_abstract_num,1,title)
abstract=''
abstracttexts=article.findAll('AbstractText')
for abstracttext in abstracttexts:
abstract=abstract+abstracttext.text+" "
sheet1.write(title_abstract_num,2,abstract)
title_abstract_num=title_abstract_num+1
print ("parsing article:"+str(PMID))
# parse senencese in abstract, insert into xlsx
sentences=nltk.sent_tokenize(abstract)
sentence_index=1
for sentence in sentences:
sheet2.write(all_sentences_num,0,PMID)
sheet2.write(all_sentences_num,1,sentence_index)
sheet2.write(all_sentences_num,2,str(PMID)+";"+str(sentence_index))
sheet2.write(all_sentences_num,3,sentence)
all_sentences_num=all_sentences_num+1
sentence_index=sentence_index+1
source_text=inputxml.replace(".xml","")+'_source_text.xls'
Excel.save(source_text)
os.system("python sentences_scoring_raw.py %s %s" % (source_text,inputxml))