Skip to content
Permalink
Browse files

refactor: initial cleanup of MarkupSanitizer. Added setup script

  • Loading branch information...
Chris S
Chris S committed Sep 19, 2016
1 parent 832944f commit 297f4a2ba961892f11cf1c2e55d020bb8118fb06
@@ -0,0 +1,9 @@
# Developer machine setup script.

# Download the Chocolatey install script and run it in the current session.
Invoke-WebRequest https://chocolatey.org/install.ps1 -UseBasicParsing | Invoke-Expression

# Install Ruby and ChromeDriver through Chocolatey (-y answers yes to choco's prompts).
choco install -y ruby
choco install -y chromedriver

# MongoDb and Postgres are not installed by this script; print manual instructions instead.
Write-Output "You will need to install MongoDb and Postgres for some tests."
Write-Output "The best way to do this is via Docker for Windows."
Write-Output ""
Write-Output "Docker requires Windows 10, build 10586 upwards."
Write-Output "You can install it by typing choco install docker."
@@ -4,6 +4,8 @@
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using AngleSharp.Dom.Html;
using AngleSharp.Parser.Html;
using HtmlAgilityPack;
using Roadkill.Core.Configuration;
using HapHtmlAttribute = HtmlAgilityPack.HtmlAttribute;
@@ -95,60 +97,68 @@ private HtmlWhiteList GetCachedWhiteList()
/// <returns>Html text after sanitize.</returns>
public string SanitizeHtml(string htmlText)
{
// Create Html document
HtmlDocument html = new HtmlDocument();
if (string.IsNullOrEmpty(htmlText))
return "";

var parserOptions = new HtmlParserOptions()
{
IsStrictMode = false
};
var parser = new HtmlParser(parserOptions);
IHtmlDocument document = parser.Parse(htmlText);

// Create Html document
HtmlDocument html = new HtmlDocument();
html.OptionFixNestedTags = true;
html.OptionAutoCloseOnEnd = true;
html.OptionDefaultStreamEncoding = Encoding.UTF8;
html.LoadHtml(htmlText);

if (html == null)
return string.Empty;

HtmlNode allNodes = html.DocumentNode;

if (UseWhiteList)
{
string[] tagNames = GetCachedWhiteList().ElementWhiteList.Select(x => x.Name).ToArray();
CleanNodes(allNodes, tagNames);
CleanNoneWhiteListedAttributes(allNodes);
}

// TODO: make this neater
if (UseWhiteList)
else
{
// Filter the attributes of the remaining
foreach (HtmlElement whiteListTag in GetCachedWhiteList().ElementWhiteList)
{
IEnumerable<HtmlNode> nodes = (from n in allNodes.DescendantsAndSelf()
where n.Name == whiteListTag.Name
select n);
CleanAllTagAttributes(allNodes);
}

if (nodes == null)
continue;
return allNodes.InnerHtml;
}

foreach (HtmlNode node in nodes)
{
if (!node.HasAttributes) continue;
private void CleanAllTagAttributes(HtmlNode allNodes)
{
IEnumerable<HtmlNode> nodes = allNodes.DescendantsAndSelf();

// Get all the allowed attributes for this tag
HapHtmlAttribute[] attributes = node.Attributes.ToArray();
foreach (HapHtmlAttribute attribute in attributes)
{
if (!whiteListTag.ContainsAttribute(attribute.Name))
{
attribute.Remove(); // Wasn't in the list
}
else
{
CleanAttributeValues(attribute);
}
}
}
foreach (HtmlNode node in nodes)
{
if (!node.HasAttributes) continue;

// Get all the allowed attributes for this tag
HapHtmlAttribute[] attributes = node.Attributes.ToArray();
foreach (HapHtmlAttribute attribute in attributes)
{
CleanAttributeValues(attribute);
}
}
else
}

private void CleanNoneWhiteListedAttributes(HtmlNode allNodes)
{
string[] tagNames = GetCachedWhiteList().ElementWhiteList.Select(x => x.Name).ToArray();
CleanNodes(allNodes, tagNames);

// Filter the attributes of the remaining
foreach (HtmlElement whiteListTag in GetCachedWhiteList().ElementWhiteList)
{
IEnumerable<HtmlNode> nodes = allNodes.DescendantsAndSelf();
IEnumerable<HtmlNode> nodes = (from n in allNodes.DescendantsAndSelf()
where n.Name == whiteListTag.Name
select n);

if (nodes == null)
continue;

foreach (HtmlNode node in nodes)
{
@@ -158,20 +168,25 @@ public string SanitizeHtml(string htmlText)
HapHtmlAttribute[] attributes = node.Attributes.ToArray();
foreach (HapHtmlAttribute attribute in attributes)
{
CleanAttributeValues(attribute);
if (!whiteListTag.ContainsAttribute(attribute.Name))
{
attribute.Remove(); // Wasn't in the list
}
else
{
CleanAttributeValues(attribute);
}
}
}
}
}

return allNodes.InnerHtml;
}

/// <summary>
/// This removes the current node tags and its child nodes if these are not in whitelist.
/// </summary>
/// <param name="node"></param>
/// <param name="tagWhiteList"></param>
private void CleanNodes(HtmlNode node, string[] tagWhiteList)
/// <summary>
/// This removes the current node tags and its child nodes if these are not in whitelist.
/// </summary>
/// <param name="node"></param>
/// <param name="tagWhiteList"></param>
private void CleanNodes(HtmlNode node, string[] tagWhiteList)
{
// remove node that is not in the whitelist.
if (node.NodeType == HtmlNodeType.Element)
@@ -12,10 +12,6 @@

namespace Roadkill.Tests.Unit.Text
{
/// <summary>
///This is a test class for HtmlAgilityPackSanitizerProviderTest and is intended
///to contain all HtmlAgilityPackSanitizerProviderTest Unit Tests
///</summary>
[TestFixture]
[Category("Unit")]
public class MarkupSanitizerTests

0 comments on commit 297f4a2

Please sign in to comment.
You can’t perform that action at this time.