Skip to content
This repository has been archived by the owner on Feb 12, 2023. It is now read-only.

Commit

Permalink
fix(chatform): Broaden URL matching to include unicode
Browse files Browse the repository at this point in the history
Fix #4853
Fix #4295

*Instead of searching only for strictly valid URIs, allow any characters following the scheme. This allows for UTF-8 characters used in other languages, as well as parentheses and other ASCII characters. This will over-match some invalid URLs.
*Ignore surrounding characters of URIs and ending punctuation
*Fix www-only links by adding http scheme to href
  • Loading branch information
anthonybilinski committed Feb 25, 2018
1 parent d3d81bb commit e564b85
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/chatlog/chatmessage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ ChatMessage::Ptr ChatMessage::createChatMessage(const QString& sender, const QSt

// quotes (green text)
text = detectQuotes(text, type);
text = highlightURL(text);
text = highlightURI(text);

// text styling
Settings::StyleType styleType = Settings::getInstance().getStylePreference();
Expand Down
140 changes: 116 additions & 24 deletions src/chatlog/textformatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,40 +20,42 @@
#include "textformatter.h"

#include <QRegularExpression>
#include <QVector>

// clang-format off

static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
// Note: escaping of '\' is only needed because QStringLiteral is broken by linebreak
static const QString SINGLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)"
"[%1]"
"(?!\\s)"
"([^%1\\n]+?)"
"(?<!\\s)"
"[%1]"
"(?=$|[\\s\\n])");
"(?=$|\\s)");

static const QString SINGLE_SLASH_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
static const QString SINGLE_SLASH_PATTERN = QStringLiteral("(?<=^|\\s)"
"/"
"(?!\\s)"
"([^/\\n]+?)"
"(?<!\\s)"
"/"
"(?=$|[\\s\\n])");
"(?=$|\\s)");

static const QString DOUBLE_SIGN_PATTERN = QStringLiteral("(?<=^|[\\s\\n])"
static const QString DOUBLE_SIGN_PATTERN = QStringLiteral("(?<=^|\\s)"
"[%1]{2}"
"(?!\\s)"
"([^\\n]+?)"
"(?<!\\s)"
"[%1]{2}"
"(?=$|[\\s\\n])");
"(?=$|\\s)");

static const QString MULTILINE_CODE = QStringLiteral("(?<=^|[\\s\\n])"
static const QString MULTILINE_CODE = QStringLiteral("(?<=^|\\s)"
"```"
"(?!`)"
"((.|\\n)+?)"
"(?<!`)"
"```"
"(?=$|[\\s\\n])");
"(?=$|\\s)");

#define REGEXP_WRAPPER_PAIR(pattern, wrapper)\
{QRegularExpression(pattern,QRegularExpression::UseUnicodePropertiesOption),QStringLiteral(wrapper)}
Expand All @@ -74,44 +76,134 @@ static const QPair<QRegularExpression, QString> REGEX_TO_WRAPPER[] {
#undef REGEXP_WRAPPER_PAIR

static const QString HREF_WRAPPER = QStringLiteral(R"(<a href="%1">%1</a>)");
static const QString WWW_WRAPPER = QStringLiteral(R"(<a href="http://%1">%1</a>)");

// based in this: https://tools.ietf.org/html/rfc3986#section-2
static const QString URL_PATH_PATTERN = QStringLiteral("[\\w:/?#\\[\\]@!$&'{}*+,;.~%=-]+");
// Pattern for links written without a scheme. A match starts at the beginning of the text
// or right after whitespace; the leading \S* absorbs any characters that precede "www"
// inside the same word, and capture group 1 is "www" plus at least one further
// non-whitespace character.
// NOTE(review): the pattern does not require a '.' after "www", so words such as
// "wwwhatever" also match - confirm this is intended.
static const QVector<QRegularExpression> WWW_WORD_PATTERN = {
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((www)\S+))"))
};

static const QRegularExpression URL_PATTERNS[] = {
QRegularExpression(QStringLiteral(R"(\b(www\.|((http[s]?)|ftp)://)%1)").arg(URL_PATH_PATTERN)),
QRegularExpression(QStringLiteral(R"(\b(file|smb)://([\S| ]*))")),
QRegularExpression(QStringLiteral(R"(\btox:[a-zA-Z\\d]{76})")),
QRegularExpression(QStringLiteral(R"(\bmailto:\S+@\S+\.\S+)")),
QRegularExpression(QStringLiteral(R"(\btox:\S+@\S+)")),
// Patterns for URIs that carry an explicit scheme. Every pattern starts at the beginning of
// the text or right after whitespace; the leading \S* absorbs characters that precede the
// scheme inside the same word, and capture group 1 is the URI itself.
static const QVector<QRegularExpression> URI_WORD_PATTERNS = {
// Note: This does not match only strictly valid URLs, but we broaden search to any string following scheme to
// allow UTF-8 "IRI"s instead of ASCII-only URLs
// http://, https:// or ftp:// followed by any run of non-whitespace characters
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((((http[s]?)|ftp)://)\S+))")),
// file:// or smb://; the remainder may contain spaces, '|' and non-whitespace characters
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*((file|smb)://([\S| ]*)))")),
// tox: followed by 76 characters from [a-zA-Z\d]
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:[a-zA-Z\d]{76}))")),
// mailto: followed by <non-space>@<non-space>.<non-space>
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(mailto:\S+@\S+\.\S+))")),
// tox: followed by <non-space>@<non-space>
QRegularExpression(QStringLiteral(R"((?<=^|\s)\S*(tox:\S+@\S+))")),
};


// clang-format on

// Outcome of isolating a bare URI inside a matched word.
struct MatchingUri {
    // true when a bare URI was found; false by default
    bool valid = false;
    // number of characters in the bare URI; stays 0 unless a URI was found
    int length = 0;
};

// pairs of characters that are ignored when surrounding a URI
// Each entry is (opening sequence, closing sequence). Entries may be longer than one
// character: "&quot;" is the HTML entity for a double quote.
static const QPair<QString, QString> URI_WRAPPING_CHARS[] = {
    QPair<QString, QString>(QString("("), QString(")")),
    QPair<QString, QString>(QString("["), QString("]")),
    QPair<QString, QString>(QString("&quot;"), QString("&quot;")),
    QPair<QString, QString>(QString("'"), QString("'"))
};

// characters which are ignored from the end of URI
// Single characters compared against the last character of a candidate URI by
// stripSurroundingChars(); a match removes that one character from the end.
static const QChar URI_ENDING_CHARS[] = {
QChar::fromLatin1('?'),
QChar::fromLatin1('.'),
QChar::fromLatin1('!'),
QChar::fromLatin1(':'),
QChar::fromLatin1(',')
};

/**
* @brief Highlights URLs within passed message string
* @param message Where search for URLs
* @brief Strips wrapping characters and ending punctuation from URI
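* @param wrappedUri Matched word containing a URI, possibly with surrounding characters
* @param startOfBareUri Offset within wrappedUri at which the bare URI begins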
* @return MatchingUri containing info on the stripped URI
*/
MatchingUri stripSurroundingChars(const QStringRef wrappedUri, const int startOfBareUri)
{
    // Shrinks the window [windowStart, windowEnd) of wrappedUri by repeatedly removing a
    // matching pair from URI_WRAPPING_CHARS around it and/or one trailing character from
    // URI_ENDING_CHARS, until a full pass removes nothing.
    //
    // wrappedUri      the matched word, possibly with wrapping characters or trailing
    //                 punctuation around the URI
    // startOfBareUri  offset inside wrappedUri at which the bare URI is expected to begin
    //
    // Returns valid == true, with length set to the bare URI's length, only when the
    // stripped window begins exactly at startOfBareUri and is not empty. In every other
    // case the result is valid == false with length 0.
    int windowStart = 0;
    int windowEnd = wrappedUri.length();
    bool strippedSomething;
    do {
        strippedSomething = false;
        for (auto const& surroundChars : URI_WRAPPING_CHARS) {
            const int openingCharLength = surroundChars.first.length();
            const int closingCharLength = surroundChars.second.length();
            // Both sequences must fit into the window without overlapping. Without this
            // check a lone quote would count as opener and closer at once and the window
            // would end up with its end in front of its start.
            if (windowEnd - windowStart < openingCharLength + closingCharLength) {
                continue;
            }
            if (surroundChars.first == wrappedUri.mid(windowStart, openingCharLength)
                && surroundChars.second
                       == wrappedUri.mid(windowEnd - closingCharLength, closingCharLength)) {
                windowStart += openingCharLength;
                windowEnd -= closingCharLength;
                strippedSomething = true;
                break;
            }
        }
        // Only inspect the last character while the window is non-empty, so at() is never
        // called with an index outside of wrappedUri.
        if (windowEnd > windowStart) {
            const QChar lastChar = wrappedUri.at(windowEnd - 1);
            for (QChar const endChar : URI_ENDING_CHARS) {
                if (endChar == lastChar) {
                    windowEnd -= 1;
                    strippedSomething = true;
                    break;
                }
            }
        }
    } while (strippedSomething);

    MatchingUri strippedMatch;
    const int bareLength = windowEnd - startOfBareUri;
    // Anything left in front of the URI that is not a wrapping character means the word is
    // not a wrapped URI; an empty remainder is rejected as well so callers never receive a
    // zero or negative length flagged as valid.
    if (startOfBareUri == windowStart && bareLength > 0) {
        strippedMatch.valid = true;
        strippedMatch.length = bareLength;
    }
    return strippedMatch;
}

/**
* @brief Wrap substrings matching "patterns" with "wrapper" in "message"
* @param message Where search for patterns
* @param patterns Array of regex patterns to find strings to wrap
* @param wrapper Surrounds the matched strings
* @note done separately from URI since the link must have a scheme added to be valid
* @return Copy of message with highlighted URLs
*/
// NOTE(review): this span shows two signatures, two loop headers and two wrappedURL
// declarations back to back, as in a rendered diff. The code is left exactly as found;
// the comments below only describe the individual statements.
QString highlightURL(const QString& message)
QString highlight(const QString& message, const QVector<QRegularExpression>& patterns, const QString& wrapper)
{
QString result = message;
for (const QRegularExpression& exp : URL_PATTERNS) {
for (const QRegularExpression& exp : patterns) {
// length of result before any replacement is made for this pattern
const int startLength = result.length();
int offset = 0;
// matches are computed on result as it is right now; positions they report are
// shifted by offset when result is edited below
QRegularExpressionMatchIterator iter = exp.globalMatch(result);
while (iter.hasNext()) {
const QRegularExpressionMatch match = iter.next();
const int startPos = match.capturedStart() + offset;
const int length = match.capturedLength();
const QString wrappedURL = HREF_WRAPPER.arg(match.captured());
result.replace(startPos, length, wrappedURL);
// capture 0 is the whole match, capture 1 is the first capture group of the pattern
const int uriWithWrapMatch{0};
const int uriWithoutWrapMatch{1};
// second argument: where capture 1 starts relative to the start of capture 0
MatchingUri matchUri = stripSurroundingChars(match.capturedRef(uriWithWrapMatch),
match.capturedStart(uriWithoutWrapMatch) - match.capturedStart(uriWithWrapMatch));
if (!matchUri.valid) {
continue;
}
// only the first matchUri.length characters of capture 1 are wrapped and replaced
const QString wrappedURL = wrapper.arg(match.captured(uriWithoutWrapMatch).left(matchUri.length));
result.replace(match.capturedStart(uriWithoutWrapMatch) + offset, matchUri.length, wrappedURL);
// total growth of result so far for this pattern; shifts the positions of later matches
offset = result.length() - startLength;
}
}
return result;
}

/**
* @brief Highlights URLs within passed message string
* @param message Where search for URLs
* @return Copy of message with highlighted URLs
*/
QString highlightURI(const QString& message)
{
    // Pass 1 wraps URIs that carry a scheme, using the wrapper whose href is the URI itself.
    const QString schemeLinked = highlight(message, URI_WORD_PATTERNS, HREF_WRAPPER);
    // Pass 2 runs on the output of pass 1 and wraps "www" words with the wrapper that puts
    // "http://" in front of the href.
    return highlight(schemeLinked, WWW_WORD_PATTERN, WWW_WRAPPER);
}

/**
* @brief Checks HTML tags intersection while applying styles to the message text
* @param str Checking string
Expand Down
2 changes: 1 addition & 1 deletion src/chatlog/textformatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

#include <QString>

QString highlightURL(const QString& message);
QString highlightURI(const QString& message);

QString applyMarkdown(const QString& message, bool showFormattingSymbols);

Expand Down
27 changes: 24 additions & 3 deletions test/chatlog/textformatter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,12 @@ static const QVector<StringPair> MIXED_FORMATTING_SPECIAL_CASES {
};

#define MAKE_LINK(url) "<a href=\"" url "\">" url "</a>"
#define MAKE_WWW_LINK(url) "<a href=\"http://" url "\">" url "</a>"

static const QVector<QPair<QString, QString>> URL_CASES {
PAIR_FORMAT("https://github.com/qTox/qTox/issues/4233",
MAKE_LINK("https://github.com/qTox/qTox/issues/4233")),
PAIR_FORMAT("www.youtube.com", MAKE_LINK("www.youtube.com")),
PAIR_FORMAT("www.youtube.com", MAKE_WWW_LINK("www.youtube.com")),
PAIR_FORMAT("https://url.com/some*url/some*more*url/",
MAKE_LINK("https://url.com/some*url/some*more*url/")),
PAIR_FORMAT("https://url.com/some_url/some_more_url/",
Expand All @@ -191,7 +192,7 @@ static const QVector<QPair<QString, QString>> URL_CASES {
"www.site.com/part1/part2",
MAKE_LINK("http://site.com/part1/part2") " "
MAKE_LINK("http://site.com/part3") " and one more time "
MAKE_LINK("www.site.com/part1/part2")),
MAKE_WWW_LINK("www.site.com/part1/part2")),
PAIR_FORMAT("https://127.0.0.1/asd\n"
"https://ABCD:EF01:2345:6789:ABCD:EF01:2345:6789/\n"
"ftp://2001:DB8::8:800:200C:417A/\n"
Expand All @@ -213,6 +214,26 @@ static const QVector<QPair<QString, QString>> URL_CASES {
MAKE_LINK("http://[::1]:22/") " "
MAKE_LINK("http://[::]:20/") " "
),
// Test case from issue #4853 (include unicode, ending brackets that are part of URL)
PAIR_FORMAT("https://ja.wikipedia.org/wiki/印章",
MAKE_LINK("https://ja.wikipedia.org/wiki/印章")),
PAIR_FORMAT("https://en.wikipedia.org/wiki/Seal_(East_Asia)",
MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)")),
// Test cases from issue #4295 (exclude surrounding quotes, brackets, ending punctuation)
PAIR_FORMAT("(http://www.google.com)",
"(" MAKE_LINK("http://www.google.com") ")"),
PAIR_FORMAT("&quot;http://www.google.com&quot;",
"&quot;" MAKE_LINK("http://www.google.com") "&quot;"),
PAIR_FORMAT("http://www.google.com.",
MAKE_LINK("http://www.google.com") "."),
PAIR_FORMAT("http://www.google.com,",
MAKE_LINK("http://www.google.com") ","),
PAIR_FORMAT("http://www.google.com?",
MAKE_LINK("http://www.google.com") "?"),
PAIR_FORMAT("https://google.com?gfe_rd=cr",
MAKE_LINK("https://google.com?gfe_rd=cr")),
PAIR_FORMAT("[&quot;https://en.wikipedia.org/wiki/Seal_(East_Asia)&quot;]?",
"[&quot;" MAKE_LINK("https://en.wikipedia.org/wiki/Seal_(East_Asia)") "&quot;]?")
};

#undef PAIR_FORMAT
Expand Down Expand Up @@ -325,7 +346,7 @@ private slots:
void urlTest();
private:
const MarkdownFunction markdownFunction = applyMarkdown;
UrlHighlightFunction urlHighlightFunction = highlightURL;
UrlHighlightFunction urlHighlightFunction = highlightURI;
};

static QString commonWorkCasesProcessInput(const QString& str, const MarkdownToTags& mtt)
Expand Down

0 comments on commit e564b85

Please sign in to comment.