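# NOTE: the following lines are GitHub web-page chrome that was captured when
# this file was scraped. They are not part of the module and are kept only as
# comments so that the file remains valid Python.
#   Skip to content
#   HTTPS clone URL
#   Subversion checkout URL
#   You can clone with or .
#   Download ZIP
#   Branch: master
#   Fetching contributors...
#   Cannot retrieve contributors at this time
#   216 lines (196 sloc) 8.565 kB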
'''
This python file will get the various fields from
a text file of a wiki page that has been produced
by html2text.py
'''
import re
class get_wiki_data:
    '''
    Extract the English-language fields (etymology, pronunciation, parts
    of speech, ...) from the text of a wiki page that has been produced
    by html2text.py.

    Typical use: construct with the page text, then call get_contents(),
    get_eng_fields() and get_field_details(), and finally save_str() to
    obtain the formatted result.
    '''

    # One bracketed label on a single line, e.g. "[term]" or "[12]".
    _LINK_RE = re.compile(r'\[.*?\]')

    def __init__(self, filename):
        '''
        filename -- despite its name this is the page *text*, not a path
                    (the file-reading code was disabled by the author);
                    the parameter name is kept for existing callers.
        '''
        # Everything from " [1]:" onwards is cut off before parsing.
        self.s = filename.split(' [1]:')[0]
        # Chunks between level-2 ("## ") headings; chunk 0 is whatever
        # precedes the first such heading.
        self.sections = self.s.split('\n## ')
        # Heading titles that get_field_details() collects by exact name.
        self.pos = ['Noun', 'Verb', 'Pronoun', 'Adjective', 'Adverb',
                    'Preposition', 'Conjunction', 'Interjection',
                    'Antonyms', 'Synonyms', 'Derived terms', 'Etymology',
                    'Pronunciation']
        # Defaults so every method is safe to call in any order.
        self.contents = None
        self.fields = []
        self.subfields = []
        self.dict = {}
        self.eng_sec = ''
        self.eng_secs = []

    def get_contents(self):
        '''
        Store in self.contents the section whose heading line ends with
        "Contents" (the last one wins if there are several).  When no
        such section exists self.contents is left untouched.
        '''
        for section in self.sections:
            if section.split('\n')[0].endswith('Contents'):
                self.contents = section

    def get_eng_fields(self):
        '''
        Fill self.fields with the entries belonging to the English part
        of the page.

        With a table of contents, each entry is a [number, title] list
        such as ['1.2', 'Noun']; only entries numbered "1" or "1.x..."
        are kept.  Without one, the level-2/level-3 headings of the page
        are used instead and each entry is a plain title string.
        '''
        self.fields = []
        self.subfields = []
        contents = getattr(self, 'contents', None)
        if contents is None:
            # No table of contents was found: fall back to the headings.
            self._fields_from_headings()
            return
        # Each top-level bullet starts a chunk; nested bullets stay inside
        # it because they are indented by more than one space.
        for item in contents.split('\n *'):
            lines = item.split('\n')
            if lines[0].find('English') <= 0:
                continue
            for line in lines:
                # "* [1.2 Noun][7]" -> "1.2 Noun" -> ['1.2', 'Noun']
                label = line.strip().lstrip('* [').split('][')[0]
                entry = label.split(' ', 1)
                # Also filters out e.g. "2 Old English", whose first line
                # contains the word "English" too.
                if entry[0].split('.')[0] == '1':
                    self.fields.append(entry)

    def _fields_from_headings(self):
        '''Append the title of every "## " / "### " heading to self.fields.'''
        for line in self.s.split('\n'):
            if not line.startswith(('### ', '## ')):
                continue
            if ']] ' in line:
                # "## [[edit][4]] English" -> "English"
                title = line.split(']] ')[1]
            else:
                # Heading without an edit link: use its plain text rather
                # than failing with an IndexError.
                title = line.lstrip('#').strip()
            self.fields.append(title)

    def cleanup(self):
        '''
        Normalise every text stored in self.dict in place: numeric
        bracketed labels are removed, other bracketed labels lose their
        brackets, surrounding blank space is stripped and lines starting
        with "!" are dropped.
        '''
        for entries in self.dict.values():
            entries[:] = [self._clean_entry(text) for text in entries]

    def _clean_entry(self, text):
        '''Return one field body with link markup and "!" lines removed.'''
        for link in self._LINK_RE.findall(text):
            inner = link.strip('[').strip(']')
            # "[12]" disappears, "[term]" becomes "term".
            text = text.replace(link, '' if inner.isdigit() else inner)
        text = text.strip('\n ')
        # Build a new list instead of removing while iterating, which
        # would skip the line following each removed one.
        return '\n'.join(
            line for line in text.split('\n') if not line.startswith('!'))

    def show(self, key):
        '''
        Return a numbered listing of the texts stored under key, or None
        when key is unknown.  Nothing is printed (the prints were
        disabled by the author); the listing uses the format those
        prints had: "key:" followed by tab-indented "N. text" lines.
        '''
        if key not in self.dict:
            return None
        listing = [key + ':']
        for count, text in enumerate(self.dict[key], 1):
            listing.append('\t' + str(count) + '. ' + text)
        return '\n'.join(listing)

    def get_field_details(self):
        '''
        Build self.dict, mapping a field title ('Etymology', 'Noun', ...)
        to the list of body texts found under headings of that title in
        the English section, in page order, then clean them up.

        When the page has no English section self.dict stays empty.
        '''
        self.dict = {}
        self.eng_sec = self._find_english_section()
        # Splitting on "\n###" also splits on deeper ("####") headings.
        self.eng_secs = self.eng_sec.split('\n###')
        for chunk in self.eng_secs:
            lines = chunk.split('\n')
            title = self._field_title(lines[0])
            if title is not None:
                # lines[0] is the heading; the last line of a chunk is
                # dropped, as the original code did.
                body = '\n'.join(lines[1:-1])
                self.dict.setdefault(title, []).append(body)
        self.cleanup()

    def _find_english_section(self):
        '''
        Return the section titled exactly "English".  If none is titled
        exactly so, return the last section whose heading merely ends
        with "English", or '' when there is no candidate at all.
        '''
        fallback = ''
        for section in self.sections:
            heading = section.split('\n')[0]
            if not heading.endswith('English'):
                continue
            # "[[edit][3]] English" -> "English", while "Old English"
            # stays "Old English" and is only a fallback.
            if heading.split('] ')[-1].strip() == 'English':
                return section
            fallback = section
        return fallback

    def _field_title(self, heading):
        '''Return the self.dict key for a heading line, or None to skip it.'''
        if heading.find('Etymology') >= 0:
            return 'Etymology'
        if heading.find('Pronunciation') >= 0:
            return 'Pronunciation'
        name = heading.split('] ')[-1]
        if name in self.pos:
            return name
        return None

    def save_str(self):
        '''
        Return the collected fields as one string.  Every field becomes
        "#Title", a newline, its body lines, and a final newline.  Fields
        nested below the first level are indented with one tab per extra
        level, on the title line and on every body line.  "*" and "_"
        are removed from body lines and each line is stripped.

        The first entry of self.fields (the language itself) is skipped.
        The n-th field with a given title receives the n-th text stored
        under that title.  self.dict and self.fields are not modified,
        so repeated calls return the same string.
        '''
        # Per-call queues: consuming from copies keeps self.dict intact.
        pending = dict(
            (title, list(texts)) for title, texts in self.dict.items())
        out = []
        for entry in self.fields[1:]:
            if isinstance(entry, (list, tuple)):
                if len(entry) < 2:
                    continue  # a number without a title
                # Depth comes from the number of dots, so "1.10" is a
                # first-level field just like "1.1".
                depth = entry[0].count('.')
                key, heading = self._match_field(entry[1], depth, pending)
            else:
                # Plain title produced by the heading fallback.
                depth = 1
                heading = entry
                key = entry if entry in pending else None
            if key is None or not pending[key]:
                continue
            indent = '\t' * (depth - 1)
            body = pending[key].pop(0)
            lines = [indent + self._strip_markup(line)
                     for line in body.split('\n')]
            out.append(indent + '#' + heading + '\n'
                       + '\n'.join(lines) + '\n')
        return ''.join(out)

    def _match_field(self, title, depth, pending):
        '''
        Return (key, heading) for a table-of-contents title, or
        (None, None) when nothing is stored for it.  First-level titles
        may also match a key they merely contain ("Etymology 1").
        '''
        if depth < 1:
            return None, None
        if title in pending:
            return title, title
        if depth == 1:
            for key in pending:
                if title.find(key) >= 0:
                    # Testing for the exact separator avoids failing on
                    # titles that only contain "edit" (e.g. "credit").
                    if 'edit] ' in title:
                        title = title.split('edit] ')[1]
                    return key, title
        return None, None

    def _strip_markup(self, line):
        '''Remove "*" and "_" from a line and strip surrounding space.'''
        return line.replace('*', '').replace('_', '').strip()
def main(html):
    '''
    Run the whole extraction on the text of one wiki page (as produced
    by html2text.py) and return the string built by save_str().

    html -- the page text, passed straight to get_wiki_data.
    '''
    g = get_wiki_data(html)
    g.get_contents()
    g.get_eng_fields()
    g.get_field_details()
    return g.save_str()
if __name__ == '__main__':
    # Script mode: read the page dump from 'tmp.bak' and print the result.
    # The `with` block closes the file even if reading fails, and the
    # parenthesised print works on both Python 2 and Python 3.
    with open('tmp.bak', 'r') as source:
        html = source.read()
    print(main(html))
# NOTE: GitHub web-page chrome captured by the scrape, not part of the module:
#   Jump to Line
#   Something went wrong with that request. Please try again.