Set the default content_type

polyrabbit · Oct 29, 2015 · 342c8ce · 342c8ce
1 parent 5a2449d
commit 342c8ce
Showing 1 changed file with 2 additions and 1 deletion.
diff --git a/page_content_extractor/__init__.py b/page_content_extractor/__init__.py
@@ -33,7 +33,8 @@ def legendary_parser_factory(url):
         except Exception as e:
             logger.info('%s is not an embeddable, try another(%s)', resp.url, e)
 
-    ct = resp.headers.get('content-type', '').lower()
+    # If no content-type is provided, Chrome treats the response as HTML
+    ct = resp.headers.get('content-type', 'text').lower()
     if ct.startswith('text'):
         logger.info('Get an %s to parse', ct)
         return HtmlContentExtractor(resp.text, resp.url)