-
Notifications
You must be signed in to change notification settings - Fork 0
/
title.py
179 lines (161 loc) · 9.58 KB
/
title.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
import logging
from boilerpipe.extract import Extractor
from BeautifulSoup import BeautifulSoup
from logger import logger
import unicodedata as ud
def _charsEquivalent(a, b):
    """Return True when characters `a` and `b` count as equal for matching.

    Characters are equivalent when they are identical, or when the Unicode
    general category of both starts with "P" (punctuation), so that e.g. a
    typographic apostrophe matches an ASCII one.
    """
    if a == b:
        return True
    return ud.category(a)[0] == "P" and ud.category(b)[0] == "P"


def isIn(substring, text):
    """Tell whether `substring` occurs in `text`, treating punctuation loosely.

    Every window of `text` with the length of `substring` is compared
    character by character using `_charsEquivalent`. Trying each start
    position separately avoids the missed matches of a single forward scan
    that resets to zero on a mismatch (for example "ab" inside "aab").

    Parameters:
        substring: the unicode text to look for.
        text: the unicode text to search in.

    Returns:
        True if some window of `text` matches `substring`, False otherwise.
        An empty `substring` always matches.
    """
    needleLen = len(substring)
    if needleLen == 0:
        return True
    # range() is empty when text is shorter than substring, giving False.
    for start in range(len(text) - needleLen + 1):
        for offset in range(needleLen):
            if not _charsEquivalent(text[start + offset], substring[offset]):
                break
        else:
            # The inner loop finished without a mismatch: full match found.
            return True
    return False
def isSubstr(find, data):
    """Report whether `find` occurs, according to isIn, in every item of `data`.

    Returns False when both `data` and `find` are empty. An empty `data`
    combined with a non-empty `find` yields True, because no item
    contradicts the match.
    """
    if len(data) == 0 and len(find) == 0:
        return False
    return all(isIn(find, text) for text in data)
def longSubstr(data):
    """Return the longest slice of data[0] that isSubstr accepts for all of `data`.

    Every slice of the first string is tried in order of start position and
    then length; a slice replaces the current best only when it is strictly
    longer, so the earliest longest slice wins. An empty string is returned
    when `data` has fewer than two items or its first item is empty.
    """
    best = ''
    if len(data) <= 1 or len(data[0]) == 0:
        return best
    reference = data[0]
    size = len(reference)
    for start in range(size):
        for length in range(size - start + 1):
            # Only slices longer than the current best are worth checking.
            if length <= len(best):
                continue
            candidate = reference[start:start + length]
            if isSubstr(candidate, data):
                best = candidate
    return best
def longSubstrPair(data):
    """Return the longest common substring shared by two differently-tagged entries.

    `data` is a sequence of (text, kind) pairs. Each ordered pair of entries
    whose kinds differ is passed to longSubstr, and results longer than ten
    characters are collected. The longest collected result is returned, or
    an empty string when nothing qualifies.
    """
    logger.debug(u"Data: " + u'\n-'.join([text + "//" + kind for text, kind in data]))
    candidates = []
    for first in data:
        for second in data:
            if first is second or first[1] == second[1]:
                continue
            common = longSubstr([first[0], second[0]])
            if len(common) > 10:
                candidates.append(common)
    # Stable sort: among equally long results the first one found stays first.
    candidates.sort(key=len, reverse=True)
    logger.debug(u"Results: " + u'\n-'.join(candidates))
    if candidates:
        return candidates[0]
    return ""
def __fixChars(text):
    """Normalise whitespace in a piece of extracted text.

    Non-breaking spaces (U+00A0) become ordinary spaces, leading and trailing
    whitespace is stripped, and any remaining newlines and tabs are turned
    into spaces.

    The non-breaking space is written as a literal rather than built with
    `unichr`, which exists only on Python 2; the resulting character is the
    same.
    """
    text = text.replace(u"\xa0", " ").strip()
    return text.replace("\n", " ").replace("\t", " ")
def fetchTitle(html, titles = None):
    """Work out the most plausible title of an HTML document.

    Steps:
      1. If an ``og:title`` meta tag has non-empty content that differs from
         the ``<title>`` text, that content is returned immediately.
      2. Otherwise the ``<title>`` text (if any) becomes the default title
         and is added to `titles` tagged "meta".
      3. The text of every h1-h5 heading shorter than 250 characters is added
         to `titles` tagged "h".
      4. The longest text shared between differently tagged candidates
         (longSubstrPair) is taken, falling back to the default title, and
         __findHeader may swap it for a matching heading.
      5. Runs of spaces in the result are collapsed to a single space.

    Parameters:
        html: the HTML markup to analyse.
        titles: optional list of (text, kind) candidates; it is extended in
            place. A new list is used when omitted.

    Returns:
        The chosen title, or the default title when the choice is empty.
    """
    if titles is None:
        titles = []
    bs = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    metaTitle = bs.find("title")
    ogMetaTitle = bs.find("meta", {"property": "og:title"})
    defaultTitle = ""

    metaTitleText = __fixChars(u''.join(metaTitle.findAll(text=True))) if metaTitle else None
    # .get() instead of ["content"]: an og:title tag without a content
    # attribute is treated as absent rather than raising KeyError.
    ogContent = ogMetaTitle.get("content") if ogMetaTitle else None
    ogMetaTitleText = __fixChars(ogContent) if ogContent else None

    if ogMetaTitleText and ogMetaTitleText != metaTitleText:
        return ogMetaTitleText
    if metaTitleText:
        defaultTitle = metaTitleText
        titles.append((defaultTitle, "meta"))
    logger.debug(u"Default title: " + unicode(defaultTitle))

    heads = []
    for level in [1, 2, 3, 4, 5]:
        for tagH in bs.findAll("h" + str(level)):
            tagContent = __fixChars(u''.join(tagH.findAll(text=True)))
            if len(tagContent) < 250:
                titles.append((tagContent, "h"))
                heads.append(tagContent)
    logger.debug(u"Titles: " + unicode(titles))

    longestSubstring = longSubstrPair(titles)
    logger.debug(u"Longest substring: " + unicode(longestSubstring))
    title = __findHeader(heads, longestSubstring or defaultTitle)

    # Trim redundant spaces: collapse every run of spaces to a single one.
    # The pair is spelled " " * 2 so it cannot be mistaken for one space.
    doubleSpace = " " * 2
    while doubleSpace in title:
        title = title.replace(doubleSpace, " ")
    return title or defaultTitle
def __findHeader(heads, title):
    """Pick the heading that best matches `title`, or return `title` unchanged.

    A heading qualifies when isIn(heading, title) holds and its longest
    common substring with `title` is no longer than `title`. Among the
    qualifying headings the one with the longest common substring is
    returned; with no qualifying heading `title` itself is returned.
    """
    # Original note (translated): we look for a heading that contains the title.
    # NOTE(review): isIn(head, title) tests that the heading occurs inside the
    # title, i.e. the opposite direction - confirm which one is intended.
    titleLen = len(title)
    scores = {}
    for head in heads:
        overlap = len(longSubstr([title, head]))
        if isIn(head, title) and overlap <= titleLen:
            scores[head] = overlap
    if not scores:
        return title
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0]
def fetchTitleByUrl(url, titles=None):
    """Run fetchTitle on the data boilerpipe's ArticleExtractor yields for `url`.

    `titles` is forwarded unchanged to fetchTitle.
    """
    # presumably `.data` holds the HTML downloaded for the URL - TODO confirm
    html = Extractor(extractor='ArticleExtractor', url=url).data
    return fetchTitle(html, titles)
def __test(correctTitle, url):
    """Check that the title extracted for `url` equals `correctTitle`.

    The comparison ignores case and the outcome is logged either way.

    Raises:
        AssertionError: when the extracted title differs. The message names
            the expected title, the extracted title and the URL. (A bare
            BaseException was raised before; AssertionError is still a
            BaseException but carries a message and is caught by ordinary
            ``except Exception`` handlers.)
    """
    title = fetchTitleByUrl(url)
    if title.lower() == correctTitle.lower():
        logger.info(u"Correct: \"" + title + u"\"\n\n")
        return
    logger.info(u"Wrong \"" + title + u"\" (\"" + correctTitle + u"\")")
    # %r keeps the message ASCII-safe even for non-ASCII titles.
    raise AssertionError("expected title %r but extracted %r for %r" % (correctTitle, title, url))
if __name__ == "__main__":
    # Smoke checks for longSubstr: the results are only logged, not asserted.
    labelledPairs = [
        [u'Today\u2019s Christian Art \u2013 Some Useful Insights Revealed', u"Today's Christian Art - Some Useful Insights Revealed | Jesus Prince Of Peace by Akiane Kramarik"],
        [u'Adsense vs Amazon Associates vs iHerb vs Tradedoubler (review)', u"Adsense vs Amazon vs iHerb vs Tradedoubler (review) | Sidekick Blog"],
    ]
    for pair in labelledPairs:
        logger.info(u"Test: " + longSubstr(pair))
    for needle in [u"tomek", u"tom", u"ek", u"e", u"test"]:
        logger.info(longSubstr([needle, u"tomek"]))

    # Disabled checks, kept for reference.
    #data = [u"'Striking Mom' Jessica Stilwell: blog extracts - Telegraph", u"'Striking Mom' Jessica Stilwell: blog extracts", u"Good on her! 'Striking Mom' Jessica Stilwell: blog extracts via @Telegraph http://t.co/I90LwcR6"]
    #print longSubstrPair(data)
    #data = [u"Public Speaking: Already a Decent Speaker? Here Are 5 Expert Tips | Inc.com", u"Already a Decent Speaker? Here Are 5 Expert Tips", u"RT @Inc: MT @avemii: a few #speakingintech tips from @Inc: http://t.co/yKH29bHq I think the last one—practicing a lot—may be the best tip"]
    #print longSubstrPair(data)
    #data = [u"BBC News - Abdominal aortic aneurysm screening rollout in Wales", u"Abdominal aortic aneurysm screening rollout in Wales", u"RT @bbchealth: Aneurysm scans rollout in Wales http://t.co/B0mjQwA6"]
    #print longSubstrPair(data)
    #data = [u"Threats and silence: the intimidation by Rangers fans | Alex Thomson's View", u"Threats and silence: the intimidation by Rangers fans", u"Alex Thomson's View", u"There are 151 comments on this post", u"Have your say", u"TOMOBLOG RANGERS INTIMIDATION"]
    #print longSubstrPair(data)
    #__test(u"We might be in the Scottish third division... but we're aiming for European super league, says Rangers chief",
    # "http://www.dailymail.co.uk/sport/football/article-2216624/Rangers-join-European-super-league-says-Charles-Green.html?ITO=socialnet-twitter-mailfootball")
    #__test(u"Republic of Ireland 1-6 Germany", "http://www.bbc.co.uk/sport/0/football/19922447?utm_source=twitterfeed&utm_medium=twitter")
    #__test(u"Barnet 1-4 Plymouth", "http://www.bbc.co.uk/sport/0/football/19854907")
    #__test(u"Today's Christian Art - Some Useful Insights Revealed",
    # "http://www.jesusprinceofpeace.com/blog/christian-art/todays-christian-art-insights-revealed")
    #__test(u"The Good, the Bad, and the Guilty: Anticipating Feelings of Guilt Predicts Ethical Behavior",
    # "http://www.sciencedaily.com/releases/2012/10/121010141452.htm?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+sciencedaily+%28ScienceDaily%3A+Latest+Science+News%29")
    #__test(u"Un-Bee-lievable! Dutch legend Edgar Davids is joining Barnet",
    # "http://www.mirror.co.uk/sport/football/transfer-news/barnet-are-signing-edgar-davids-the-former-1374019")
    #__test(u"The Market Is Showing Signs Of Topping Out, And The Bad News Has Not Been Discounted",
    # u"http://www.businessinsider.com/the-bad-news-has-not-been-discounted-2012-10?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+businessinsider+%28Business+Insider%29")
    #__test(u"Mitt Romney, American Parasite",
    # u"http://www.villagevoice.com/2012-04-18/news/Mitt-Romney-american-parasite/")
    #__test(u"CSR Racing", "https://itunes.apple.com/app/id469369175?mt=8")
    #__test(u"Romney's Convention Speech Destroyed: How Low Will He Go?", "http://explorer9360.xanga.com/767664210/romneys-convention-speech-destroyed-how-low-will-he-go/")
    #__test(u"", u"http://www.uber-facts.com/2013/01/brown-eyes-seem-more-trustworthy/")

    # Active checks as (expected title, URL) pairs, run in this order.
    # __test raises on the first mismatch, which stops the run.
    liveCases = [
        (u"Toyota No. 1 again with nearly 9.75M sales in 2012", u"http://bigstory.ap.org/article/toyota-sold-nearly-975-million-vehicles-last-year#overlay-context=article/white-wins-6th-straight-winter-x-superpipe-title"),
        (u"Justin Bieber Previews Believe Acoustic: Watch Now!", u"http://www.mtv.com/news/articles/1700813/justin-bieber-believe-acoustic-preview.jhtml"),
        (u"Funny as Hell – Kodi Me’chele Interview", u"http://www.playasonly.com/funny-as-hell-kodi-me%E2%80%99chele-interview/"),
        (u"Apple Has Quietly Started Tracking iPhone Users Again, And It's Tricky To Opt Out", u"http://www.businessinsider.com/ifa-apples-iphone-tracking-in-ios-6-2012-10?op=1"),
        (u"U-KISS’ Dongho to show his comedic side on ‘SNL Korea’", "http://www.allkpop.com/2012/10/u-kiss-dongho-to-show-his-comedic-side-on-snl-korea"),
        (u"", u""),
        (u"Adsense vs Amazon Associates vs iHerb vs Tradedoubler (review)", u"http://pusabase.com/blog/2012/03/05/adsense-vs-amazon-associates-vs-iherb-vs-tradedoubler-review/"),
        (u"", "http://thestar.blogs.com/thespin/2012/10/not-deja-vu-all-over-again.html"),
        (u"", "http://www.france24.com/en/20121012-mars-rover-makes-surprising-rock-find?utm_source=dlvr.it&utm_medium=twitter"),
        (u"", "http://globalgrind.com/news/russell-simmons-womens-rights-romney-obama-vote"),
        (u"", "http://www.lancashiretelegraph.co.uk/sport/football/blackburn_rovers/news/9981760.Ewood_fears_as_manager_search_goes_on/?ref=twt"),
        (u"", "http://buildmemuscle.wordpress.com/2012/10/11/great-leg-workout/"),
    ]
    for expected, address in liveCases:
        __test(expected, address)