Merge pull request #2 from JJ/master

raiben · web-flow · commit 053be6026d30 · 2019-06-09T20:55:21.000+02:00
Some changes in docs and file name
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+tropescraper.egg-info/
+__pycache__
+scraper_cache/
+*~
+.cache
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,10 @@
+dist: xenial
+language: python
+python:
+  - "3.6"
+  - "3.7"
+  - "3.6-dev"  # 3.6 development branch
+  - "3.7-dev"  # 3.7 development branch
+install:
+  - pip install -e .
+script: pytest
diff --git a/README.md b/README.md
@@ -1,2 +1,20 @@
 # tropescraper
-A tropes scraper
+
+A scraper for the TV Tropes website.
+
+
+## Install
+
+Install all dependencies with:
+
+    pip install -e .
+    
+(pip should be installed and available).
+
+## Run
+
+Run it with:
+
+    bin/scrape-tvtropes
+    
+It will take a good while, as it scrapes ~12k films.
diff --git a/bin/scrape-tvtropes b/bin/scrape-tvtropes
@@ -7,11 +7,10 @@ import os
 from tropescraper.tvtropes_scraper import TVTropesScraper
 
 if len(sys.argv) != 2:
-    command = sys.argv[0].split(os.sep)[-1]
-    print(f'Error: Invalid usage\nPlease execute \'{command} <target_file.json>\'')
-    sys.exit(1)
-
-file_name = sys.argv[1]
+    file_name = "tvtropes.json"
+else:
+    
+    file_name = sys.argv[1]
 
 logging.basicConfig(level=logging.INFO)
 scraper = TVTropesScraper()
diff --git a/tropescraper/cache_information.py b/tropescraper/cache_information.py
diff --git a/tropescraper/test_web_page_retriever.py b/tropescraper/test_web_page_retriever.py
@@ -0,0 +1,16 @@
+import unittest
+from tropescraper.web_page_retriever import WebPageRetriever
+
+class TestWebPageRetriever(unittest.TestCase):
+
+    def setUp(self):
+        self.scraper = WebPageRetriever(0.5,"https://tvtropes.org/pmwiki/pmwiki.php/Film/FantasticBeastsAndWhereToFindThem","/tmp")
+
+    def test_class(self):
+        self.assertIsInstance( self.scraper, WebPageRetriever, "Correct class" )
+
+    def test_retrieve(self):
+        content = self.scraper.retrieve()
+        self.assertNotEqual( content, "",  "Retrieves something")
+        content2 = self.scraper.retrieve()
+        self.assertEqual( content, content2, "Retrieves from cache")
diff --git a/tropescraper/web_page_retriever.py b/tropescraper/web_page_retriever.py
@@ -4,10 +4,11 @@
 import os
 from datetime import datetime
 from time import sleep, ctime
+from collections import namedtuple
 
 import requests
 
-from tropescraper.cache_information import CacheInformation
+CacheInformation = namedtuple('CacheInformation', ['size', 'files_count', 'creation_date'])
 
 
 class WebPageRetriever(object):