Skip to content

Commit

Permalink
Fixed issue in jsoup.connect when extracting character set from content-type header; now supports quoted charset declaration.
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Jul 2, 2011
1 parent ef57a00 commit cacbbfb
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 20 deletions.
3 changes: 3 additions & 0 deletions CHANGES
@@ -1,6 +1,9 @@
jsoup changelog

*** Release 1.3.4 [2010-Sep-19]
* Fixed issue in jsoup.connect when extracting character set from content-type header; now supports quoted
charset declaration.

* Fixed support for jsoup.connect to follow redirects between http & https URLs.
<http://github.com/jhy/jsoup/issues/issue/37>

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/helper/DataUtil.java
Expand Up @@ -15,7 +15,7 @@
*
*/
public class DataUtil {
private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=([^\\s;]*)");
private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
private static final int bufferSize = 0x20000; // ~130K.

Expand Down
20 changes: 1 addition & 19 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Expand Up @@ -21,8 +21,6 @@
* @see org.jsoup.Jsoup#connect(String)
*/
public class HttpConnection implements Connection {
private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=([^\\s;]*)");

public static Connection connect(String url) {
Connection con = new HttpConnection();
con.url(url);
Expand Down Expand Up @@ -341,7 +339,7 @@ static Response execute(Connection.Request req) throws IOException {
new BufferedInputStream(new GZIPInputStream(conn.getInputStream())) :
new BufferedInputStream(conn.getInputStream());
res.byteData = DataUtil.readToByteBuffer(inStream);
res.charset = getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
} finally {
if (inStream != null) inStream.close();
}
Expand Down Expand Up @@ -542,20 +540,4 @@ public String toString() {
return key + "=" + value;
}
}

/**
 * Parse out a charset from a content type header.
 * <p>
 * The value is uppercased with a fixed (English) locale so the result does not depend on the JVM's default
 * locale. With a locale-sensitive {@code toUpperCase()}, a Turkish default locale maps {@code i} to the dotted
 * capital {@code İ}, which would turn {@code iso-8859-1} into an unusable charset name.
 *
 * @param contentType e.g. "text/html; charset=EUC-JP"; may be null
 * @return "EUC-JP", or null if contentType is null or no charset= is found. Charset is trimmed and uppercased.
 */
private static String getCharsetFromContentType(String contentType) {
    if (contentType == null) return null;

    Matcher m = charsetPattern.matcher(contentType);
    if (m.find()) {
        // Locale is fully qualified so this block needs no additional import.
        return m.group(1).trim().toUpperCase(java.util.Locale.ENGLISH);
    }
    return null;
}
}
6 changes: 6 additions & 0 deletions src/test/java/org/jsoup/helper/DataUtilTest.java
Expand Up @@ -12,4 +12,10 @@ public void testCharset() {
assertEquals(null, DataUtil.getCharsetFromContentType("text/html"));
assertEquals(null, DataUtil.getCharsetFromContentType(null));
}

/** A double-quoted charset value in a content-type header is expected back without the quotes, uppercased. */
@Test public void testQuotedCharset() {
    // Space between the semicolon and the charset parameter.
    String withSpace = DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\"");
    assertEquals("UTF-8", withSpace);

    // No space between the semicolon and the charset parameter.
    String withoutSpace = DataUtil.getCharsetFromContentType("text/html;charset=\"utf-8\"");
    assertEquals("UTF-8", withoutSpace);

    // A quoted value that is already uppercase.
    String latin1 = DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\"");
    assertEquals("ISO-8859-1", latin1);
}
}

0 comments on commit cacbbfb

Please sign in to comment.