Skip to content

Commit

Permalink
Fix charset issues.
Browse files Browse the repository at this point in the history
  • Loading branch information
coronag committed Jun 29, 2016
1 parent 995554f commit 321ce50
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 23 deletions.
18 changes: 16 additions & 2 deletions src/OpenGraph-Net.Tests/OpenGraphTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -189,15 +189,29 @@ public void ParseUrl_AmazonUrl_Test()
[Test]
public void ParseUrl_Vk_Test()
{
OpenGraph graph = OpenGraph.ParseUrl("https://vk.com/wall-41600377_66756");
var graph = ParseUrl("https://vk.com/wall-41600377_66756");
Assert.AreEqual(graph["description"], "Создайте себе горное настроение с нашим первым фан-китом по игре #SteepGame! -> http://ubi.li/u8w9n");
}

[Test]
public void ParseUrl_Periscope_Encoded_Test()
{
OpenGraph graph = OpenGraph.ParseUrl("https://www.periscope.tv/w/1DXxyZZZVykKM");
var graph = ParseUrl("https://www.periscope.tv/w/1DXxyZZZVykKM");
Assert.AreEqual(graph["image"], "https://tn.periscope.tv/lXc5gSh6UPaWdc37LtVCb3UdtSfvj2QNutojPK2du5YWrNchfI4wXpwwHKTyfDhmfT2ibsBZV4doQeWlhSvI4A==/chunk_314.jpg?Expires=1781852253&Signature=U5OY3Y2HRb4ETmakQAPwMcv~bqu6KygIxriooa41rk64RcDfjww~qpVgMR-T1iX4S9NxfvXHLMT3pEckBDEOicsNO7oUAo4NieH9GRB2Sv0EA7swxLojD~Zn98ThNWTF5fSzv6SSPjyvctsqBiRmvAN6x7fmMH6l3vzx8ePSCgdEm8-31lUAz7lReBNZQjYSi~C8AwqZVI0Mx6y8lNKklL~m0e6RTGdvr~-KIDewU3wpjSdX7AgpaXXjahk4x-ceUUKcH3T1j--ZjaY7nqPO9fbMZFNPs502A32mrcmaZCzvaD~AuoH~u3y44mJVjzHRrpTxHIBklqHxAgc7dzverg__&Key-Pair-Id=APKAIHCXHHQVRTVSFRWQ");
}

/// <summary>
/// Regression test for charset handling: parses a page whose Open Graph title and
/// description contain accented characters and checks that they come back intact.
/// NOTE(review): this appears to hit the live site, so it depends on network access
/// and on the remote page content staying unchanged.
/// </summary>
[Test]
public void ParseUrl_EncodingError_Test()
{
var ogs = ParseUrl("http://www.telerama.fr/cinema/realite-virtuelle-360-de-bonheur-a-ameliorer,144339.php?utm_medium=Social&utm_source=Twitter&utm_campaign=Echobox&utm_term=Autofeed#link_time=1466595239");

// NUnit's Assert.AreEqual takes (expected, actual); keeping that order makes
// failure messages report the expected and actual values correctly.
Assert.AreEqual("Réalité virtuelle : 360° de bonheur à améliorer", ogs["title"]);
Assert.AreEqual("Le cinéma à 360° a désormais son festival. Organisé par le Forum des images, le premier Paris Virtual Film Festival a donc vu le jour....", ogs["description"]);
}


/// <summary>
/// Single entry point the tests use to obtain a parsed graph; forwards the URL
/// to <c>OpenGraph.ParseUrl</c> and returns its result unchanged.
/// </summary>
/// <param name="u">The URL to parse.</param>
/// <returns>Whatever <c>OpenGraph.ParseUrl</c> returns for <paramref name="u"/>.</returns>
protected OpenGraph ParseUrl(string u)
{
OpenGraph parsed = OpenGraph.ParseUrl(u);
return parsed;
}
}
}
49 changes: 30 additions & 19 deletions src/OpenGraph-Net/HttpDownloader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -136,32 +136,43 @@ private void SetEncodingFromHeader(HttpWebResponse response)

private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
{
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*(?<charset>[A-Za-z0-9_-]+)", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
if (m.Success)
try
{
string charset = m.Groups["charset"].Value.ToLower() ?? "iso-8859-1";
if ((charset == "unicode") || (charset == "utf-16"))
{
charset = "utf-8";
}
var m = new Regex(@"<meta\s+.*?charset\s*=\s*(?<charset>[A-Za-z0-9_-]+)", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
var charset = m.Success ? m.Groups["charset"].Value.ToLower() : GetCharsetFrom(html);

try
{
Encoding metaEncoding = Encoding.GetEncoding(charset);
if (Encoding != metaEncoding)
{
memStream.Position = 0L;
StreamReader recodeReader = new StreamReader(memStream, metaEncoding);
html = recodeReader.ReadToEnd().Trim();
recodeReader.Close();
}
}
catch (ArgumentException)
if (string.IsNullOrWhiteSpace(charset)) return html;

if ((charset == "unicode") || (charset == "utf-16")) charset = "utf-8";

var metaEncoding = Encoding.GetEncoding(charset);
if (Encoding != metaEncoding)
{
memStream.Position = 0L;
var recodeReader = new StreamReader(memStream, metaEncoding);
html = recodeReader.ReadToEnd().Trim();
recodeReader.Close();
}
}
catch (ArgumentException)
{
}

return html;
}

/// <summary>
/// Looks for the first double-quoted <c>charset="..."</c> attribute in the page text
/// and returns its value in lower case.
/// </summary>
/// <param name="strWebPage">The page text to scan; may be null.</param>
/// <returns>
/// The lower-cased charset value, or null when the input is null, no
/// <c>charset="</c> key is present, or the value is never terminated.
/// </returns>
private string GetCharsetFrom(string strWebPage)
{
if (strWebPage == null) return null;

const string charsetSearchedKey = "charset=\"";

// Attribute names are case-insensitive in HTML, so match CHARSET="..." as well.
var charsetStart = strWebPage.IndexOf(charsetSearchedKey, StringComparison.OrdinalIgnoreCase);

// IndexOf reports "not found" as -1; index 0 is a legitimate match position.
if (charsetStart < 0) return null;

charsetStart += charsetSearchedKey.Length;
var charsetEnd = strWebPage.IndexOfAny(new[] { ' ', '\"', ';' }, charsetStart);

// Unterminated value (truncated page): without this guard Substring would be
// called with a negative length and throw.
if (charsetEnd < 0) return null;

// Lower-case the result so that comparisons against lower-case charset names
// (e.g. "utf-16") behave the same regardless of how the page spelled it.
return strWebPage.Substring(charsetStart, charsetEnd - charsetStart).ToLowerInvariant();
}
}
}
4 changes: 2 additions & 2 deletions src/OpenGraph-Net/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.2.0.3")]
[assembly: AssemblyFileVersion("1.2.0.3")]
[assembly: AssemblyVersion("1.2.0.4")]
[assembly: AssemblyFileVersion("1.2.0.4")]

0 comments on commit 321ce50

Please sign in to comment.