Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 222 lines (193 sloc) 8.438 kB
db4776f @samuelclay Adding URL normalization on new feeds. Also adding in collocation sea…
authored
1 """
2 URI Normalization function:
3 * Always provide the URI scheme in lowercase characters.
4 * Always provide the host, if any, in lowercase characters.
5 * Only perform percent-encoding where it is essential.
6 * Always use uppercase A-through-F characters when percent-encoding.
7 * Prevent dot-segments appearing in non-relative URI paths.
8 * For schemes that define a default authority, use an empty authority if the
9 default is desired.
10 * For schemes that define an empty path to be equivalent to a path of "/",
11 use "/".
12 * For schemes that define a port, use an empty port if the default is desired
13 * All portions of the URI must be utf-8 encoded NFC from Unicode strings
14
15 implements:
16 http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
17 http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
18
19 inspired by:
20 Tony J. Ibbs, http://starship.python.net/crew/tibs/python/tji_url.py
21 Mark Nottingham, http://www.mnot.net/python/urlnorm.py
22 """
23
24 __license__ = "Python"
25
26 import re, unicodedata, urlparse
27 from urllib import quote, unquote
28
# Default port for each URI scheme, keyed by lowercase scheme name and listed
# in ascending port order.  normalize() drops an explicit port from the
# authority when it equals the entry for the URL's scheme.
default_port = {
    'ftp': 21,
    'telnet': 23,
    'gopher': 70,
    'http': 80,
    'news': 119,
    'nntp': 119,
    'prospero': 191,
    'https': 443,
    'snews': 563,
    'snntp': 563,
}
41
# Splits an authority into optional "userinfo@", host and port.  The
# bracketed alternative keeps an IPv6 literal such as "[::1]" in one piece
# instead of cutting it at its first colon.
_AUTHORITY_RE = re.compile(r'([^@]*@)?(\[[^\]]*\]|[^:]*):?(.*)')


def normalize(url):
    """Return the canonical form of *url*.

    The URL is stripped of surrounding whitespace, split into its
    components, and rebuilt with:

    * scheme and host lowercased, and one trailing dot removed from the host;
    * path, query and fragment percent-decoded, NFC-normalized as UTF-8 where
      the bytes are valid UTF-8, and re-encoded with uppercase hex digits only
      where a character is not in the component's safe set;
    * "." and ".." segments and empty segments removed from the path for the
      "", http, https, ftp and file schemes;
    * empty userinfo ("@" or ":@") removed;
    * an empty path replaced by "/" for http, https, ftp and file;
    * a numeric port stripped of leading zeros, and dropped when it equals
      the scheme's entry in ``default_port``.

    After reassembly, "http://" is prepended when the result contains no
    "://", and a leading "feed://" is rewritten to "http://".

    Values that are not strings (``basestring``) are returned unchanged.
    """
    if not isinstance(url, basestring):
        return url

    # Parse and inspect the same stripped text so the trailing-"#" check
    # below agrees with what urlsplit saw.
    stripped = url.strip()
    scheme, auth, path, query, fragment = urlparse.urlsplit(stripped)
    userinfo, host, port = _AUTHORITY_RE.search(auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters.
    host = host.lower()
    if host and host[-1] == '.':
        host = host[:-1]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    def clean(string):
        """Percent-decode *string* and return NFC-normalized UTF-8 bytes."""
        unquoted = unquote(string)
        if not isinstance(unquoted, unicode):
            try:
                unquoted = unquoted.decode('utf-8')
            except UnicodeDecodeError:
                # Not valid UTF-8: return the decoded bytes untouched so the
                # caller's quote() re-encodes them once.  Returning the still
                # percent-encoded input would get its "%" signs encoded again.
                return unquoted
        return unicodedata.normalize('NFC', unquoted).encode('utf-8')

    path = quote(clean(path), "~:/?#[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # note care must be taken to only encode & and = characters as values
    query = "&".join(
        "=".join(quote(clean(part), "~:/?#[]@!$'()*+,;=")
                 for part in pair.split("=", 1))
        for pair in query.split("&"))

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ("", "http", "https", "ftp", "file"):
        segments = path.split('/')
        kept = []
        for segment in segments:
            if segment == "":
                # Keep only a leading empty segment (the path's initial "/").
                if not kept:
                    kept.append(segment)
            elif segment == ".":
                pass
            elif segment == "..":
                # Never pop the first entry, so ".." cannot climb past the
                # start of the path.
                if len(kept) > 1:
                    kept.pop()
            else:
                kept.append(segment)
        # A path ending in "/", "." or ".." keeps its trailing slash.
        if segments[-1] in ("", ".", ".."):
            kept.append("")
        path = '/'.join(kept)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ("@", ":@"):
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ("http", "https", "ftp", "file"):
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired
    if port and scheme in default_port and port.isdigit():
        port = str(int(port))  # drops leading zeros, e.g. "081" -> "81"
        if int(port) == default_port[scheme]:
            port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # urlsplit drops an empty fragment; restore a bare trailing "#".
    if stripped.endswith("#") and query == "" and fragment == "":
        path += "#"
    url = urlparse.urlunsplit((scheme, auth, path, query, fragment))

    if '://' not in url:
        url = 'http://' + url
    if url.startswith('feed://'):
        # Rewrite the leading scheme only; a "feed://" embedded later in the
        # URL (e.g. in the query) is left alone.
        url = 'http://' + url[len('feed://'):]

    return url
117
if __name__ == "__main__":
    # Self-test: run this module directly to execute both tables below
    # against normalize() with the text test runner.
    #
    # NOTE(review): entries without "://" (e.g. "mailto:...", "/foo/bar/.",
    # "-") look like they predate the "http://" prefixing at the end of
    # normalize() -- confirm the expected values before relying on this suite.
    import unittest

    suite = unittest.TestSuite()

    # from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    # Each entry is (already_canonical, url): True means normalize(url) must
    # return url unchanged, False means it must return something different.
    tests = [
        (False, "http://:@example.com/"),
        (False, "http://@example.com/"),
        (False, "http://example.com"),
        (False, "HTTP://example.com/"),
        (False, "http://EXAMPLE.COM/"),
        (False, "http://example.com/%7Ejane"),
        (False, "http://example.com/?q=%C7"),
        (False, "http://example.com/?q=%5c"),
        (False, "http://example.com/?q=C%CC%A7"),
        (False, "http://example.com/a/../a/b"),
        (False, "http://example.com/a/./b"),
        (False, "http://example.com:80/"),
        (True, "http://example.com/"),
        (True, "http://example.com/?q=%C3%87"),
        (True, "http://example.com/?q=%E2%85%A0"),
        (True, "http://example.com/?q=%5C"),
        (True, "http://example.com/~jane"),
        (True, "http://example.com/a/b"),
        (True, "http://example.com:8080/"),
        (True, "http://user:password@example.com/"),

        # from rfc2396bis
        (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
        (True, "http://www.ietf.org/rfc/rfc2396.txt"),
        (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
        (True, "mailto:John.Doe@example.com"),
        (True, "news:comp.infosystems.www.servers.unix"),
        (True, "tel:+1-816-555-1212"),
        (True, "telnet://192.0.2.16:80/"),
        (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),

        # other
        (True, "http://127.0.0.1/"),
        (False, "http://127.0.0.1:80/"),
        (True, "http://www.w3.org/2000/01/rdf-schema#"),
        (False, "http://example.com:081/"),
    ]

    def testcase(expected, value):
        # Build one TestCase instance bound to this (expected, value) pair.
        class test(unittest.TestCase):
            def runTest(self):
                result = normalize(value)
                assert (result == value) == expected, \
                    (expected, value, result)
        return test()

    suite.addTests(testcase(expected, value) for (expected, value) in tests)

    # mnot test suite; three tests updated for rfc2396bis.
    # Maps each input URL to the exact string normalize() must return.
    tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/baz',  # was: '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/foo',  # was: '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/foo',  # was: '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }

    def testcase(original, normalized):
        # Build one TestCase instance bound to this (input, expected) pair.
        class test(unittest.TestCase):
            def runTest(self):
                result = normalize(original)
                assert result == normalized, \
                    (original, normalized, result)
        return test()

    suite.addTests(
        testcase(original, normalized)
        for (original, normalized) in tests.items())

    # execute tests
    unittest.TextTestRunner().run(suite)
Something went wrong with that request. Please try again.