Commit

Merge pull request #23 from kamidipreetham/develop
Release v0.5.2
Preetham Kamidi committed Feb 18, 2020
2 parents 443cf00 + 4f399ea commit 6521fcb
Showing 14 changed files with 99 additions and 1,328 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -109,4 +109,5 @@ venv.bak/

#data folders
dataset
+test-dataset
.DS_Store
1 change: 1 addition & 0 deletions .gitignore
@@ -109,4 +109,5 @@ venv.bak/

#data folders
dataset
+test-dataset
.DS_Store
59 changes: 0 additions & 59 deletions Pipfile

This file was deleted.

1,245 changes: 0 additions & 1,245 deletions Pipfile.lock

This file was deleted.

12 changes: 10 additions & 2 deletions requirements-dev.txt
@@ -3,37 +3,45 @@ alabaster==0.7.12
atomicwrites==1.3.0
attrs==19.1.0
autopep8==1.4.4
+awscli==1.16.196
babel==2.7.0
bandit==1.6.2
bleach==3.1.0
+botocore==1.12.186
certifi==2019.6.16
chardet==3.0.4
+colorama==0.3.9
coverage==4.5.3
+docutils==0.14
gitdb2==2.0.5
gitpython==2.1.11
-hypothesis==4.26.4
+hypothesis==4.27.0
idna==2.8
imagesize==1.1.0
importlib-metadata==0.18
jinja2==2.10.1
+jmespath==0.9.4
markupsafe==1.1.1
more-itertools==7.1.0
packaging==19.0
pbr==5.4.0
pkginfo==1.5.0.1
pluggy==0.12.0
py==1.8.0
+pyasn1==0.4.5
pycodestyle==2.5.0
pygments==2.4.2
pyparsing==2.4.0
pytest-cov==2.7.1
pytest==5.0.1
python-dateutil==2.8.0
pytz==2019.1
-pyyaml==5.1.1
+pyyaml==5.1 ; python_version != '2.6'
readme-renderer==24.0
requests-toolbelt==0.9.1
requests==2.22.0
+rsa==3.4.2
+s3transfer==0.2.1
six==1.12.0
smmap2==2.0.5
snowballstemmer==1.9.0
8 changes: 4 additions & 4 deletions requirements.txt
@@ -1,5 +1,4 @@
-i https://pypi.org/simple
--e git+https://github.com/twintproject/twint.git@ad27650fbc0bf8c3f2c78449088a5ede7239f53a#egg=twint
aiodns==2.0.0
aiohttp-socks==0.2.2
aiohttp==3.5.4
@@ -18,7 +17,7 @@ elasticsearch==7.0.2
eventlet==0.25.0
fake-useragent==0.1.11
flask-cors==3.0.8
-flask==1.1.0
+flask==1.1.1
geographiclib==1.49
geopy==1.20.0
greenlet==0.4.15
@@ -34,10 +33,10 @@ markupsafe==1.1.1
monotonic==1.5
multidict==4.5.2
networkx==2.3
-nltk==3.4.4
+nltk>=3.4.5
numpy==1.16.4
pandas==0.24.2
-pillow==6.1.0
+pillow>=6.2.0
pycares==3.0.0
pycodestyle==2.5.0
pycparser==2.19
@@ -54,6 +53,7 @@ scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
soupsieve==1.9.2
+twint==2.1.13
typing-extensions==3.7.4 ; python_version < '3.7'
typing==3.7.4 ; python_version < '3.7'
urllib3==1.25.3
28 changes: 28 additions & 0 deletions tests/conftest.py
@@ -36,3 +36,31 @@ def tweet_data():
for row in csvreader:
tweet_list.append(row[10])
return tweet_list[1:]


@pytest.fixture
def test_data():
import subprocess
from verifytweet import util

local_dir_path = os.path.abspath('./tests/static/test-dataset')
cmd_process = subprocess.run([
'aws', 's3', 'sync', 's3://verifytweet-dataset',
os.path.abspath(local_dir_path)
])
cmd_process.check_returncode()
dataset = list()
for (dirpath, dirnames, filenames) in os.walk(local_dir_path):
if filenames:
subset = dict()
subset_type = dirpath.split('/')[-1]
subset_truth = True if dirpath.split('/')[-2] == "real" else False
file_paths = list()
for filename in filenames:
if util.uploader.allowed_file(filename):
file_paths.append(os.path.join(os.path.abspath(dirpath), filename))
subset['expected_value'] = subset_truth
subset['type'] = subset_type
subset['files'] = file_paths
dataset.append(subset)
return dataset
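
For orientation: the fixture above assumes the synced bucket is laid out as <real|fake>/<subset type>/<image files>, so the leaf directory gives the subset type and its parent decides the expected truth value. The layout and values below are illustrative only, not taken from the repository.

# Hypothetical layout under ./tests/static/test-dataset after the aws s3 sync:
#   real/twitter-web/shot-001.png
#   fake/tweet-generator/shot-042.jpg
# The walk over fake/tweet-generator/ would then produce one entry such as:
subset = {
    'expected_value': False,    # parent directory is "fake", not "real"
    'type': 'tweet-generator',  # leaf directory name
    'files': ['/abs/path/tests/static/test-dataset/fake/tweet-generator/shot-042.jpg'],
}
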
4 changes: 1 addition & 3 deletions tests/test_search_service.py
@@ -59,12 +59,10 @@ def test_search_valid_input():
"""Test search for valid input
"""
test_user_id = 'elonmusk'
-    test_datetime = datetime.datetime.strptime('2019-07-06', '%Y-%m-%d')
test_tweet_snippet = 'Sharknado'
test_tweet = 'Sharknado is real'
module_result, module_status = twint_search.search(test_user_id,
-                                                        test_tweet_snippet,
-                                                        test_datetime)
+                                                        test_tweet_snippet)
assert module_status == result.ResultStatus.ALL_OKAY
assert len(module_result) > 0
assert isinstance(module_result[0].tweet, str)
8 changes: 0 additions & 8 deletions tests/test_validator.py
@@ -42,14 +42,6 @@ def test_validator_invalid_type_input():
validator.verify_validity([[]])


-def test_validator_invalid_input():
-    """Test verify validity for invalid input
-    """
-    test_numpy_array = numpy.array([[None, None], [None, None]])
-    with pytest.raises(ValueError):
-        validator.verify_validity(test_numpy_array)


def test_validator_valid_similarity_matrix():
"""Test verfiy validity for valid similarity matrix
"""
50 changes: 50 additions & 0 deletions tests/test_verification_accuracy.py
@@ -0,0 +1,50 @@
# Verify Tweet verifies tweets of a public user
# from tweet screenshots: real or generated from
# tweet generators.
# Copyright (C) 2019 Preetham Kamidi

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import pytest

from sklearn.metrics import accuracy_score

from verifytweet import controller
from verifytweet import result

def test_overall_accuracy(test_data):
overall_expected_output = list()
overall_actual_output = list()
subset_accuracy = list()
non_api_controller = controller.NonAPIApproach()
for subset in test_data:
subset_expected_output = [subset['expected_value']] * len(
subset['files'])
overall_expected_output.extend(subset_expected_output)
actual_output = list()
accuracy_dict = dict()
for file_path in subset['files']:
module_result, module_status = non_api_controller.exec(
file_path)
validity = True if module_status == result.ResultStatus.ALL_OKAY else False
actual_output.append(validity)
overall_actual_output.append(validity)
accuracy_dict['type'] = subset['type']
accuracy_dict['expected'] = subset['expected_value']
accuracy_dict['accuracy'] = accuracy_score(subset_expected_output, actual_output)
subset_accuracy.append(accuracy_dict)
accuracy = accuracy_score(overall_expected_output, overall_actual_output)
print(f'Subset Accuracy: {subset_accuracy}')
print(f'Overall Accuracy: {accuracy}')
assert accuracy > 0.7
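
As a quick illustration of how accuracy_score is used here (plain boolean labels, unweighted), a minimal example with made-up verdicts:

from sklearn.metrics import accuracy_score

expected = [True, True, False, False]    # ground truth for one subset
actual = [True, False, False, False]     # verifier verdicts for the same files
print(accuracy_score(expected, actual))  # 0.75: the fraction of matching entries
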
5 changes: 2 additions & 3 deletions verifytweet/config/settings.py
@@ -39,11 +39,11 @@ class Config(object):

IMAGEMAGICK_PATH = os.getenv('IMAGEMAGICK_PATH') if os.getenv(
'IMAGEMAGICK_PATH') else "convert"
-    UPSCALE_RESOLUTION = "300x300"
FILE_DIRECTORY = tempfile.mkdtemp()
TWEET_MAX_STORE = 150
RUN_METHOD = "cli"
-    LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG') else logging.INFO
+    LOG_LEVEL = logging.DEBUG if os.getenv('VERIFYTWEET_DEBUG') else logging.INFO
ALLOWED_EXTENSIONS = set(["png", "jpg", "jpeg"])
SIMILARITY_THRESHOLD = 0.6


@@ -74,7 +74,6 @@ class WebConfig(Config):
MAX_CONTENT_LENGTH = 2097152
WORKER_COUNT = no_of_workers()
WORKER_CLASS = "eventlet"
-    ALLOWED_EXTENSIONS = set(["png", "jpg", "jpeg"])


run_method = "web" if "VERIFYTWEET_RUN_FOR_WEB" in os.environ else "cli"
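
A minimal sketch of the renamed debug switch; it assumes the variable only needs to be non-empty, since the result of os.getenv is truth-tested:

import logging
import os

os.environ['VERIFYTWEET_DEBUG'] = '1'  # illustrative; any non-empty value enables DEBUG
log_level = logging.DEBUG if os.getenv('VERIFYTWEET_DEBUG') else logging.INFO
print(log_level == logging.DEBUG)  # True
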
2 changes: 1 addition & 1 deletion verifytweet/services/image.py
@@ -74,7 +74,7 @@ def rescale(file_path):
new_file_path = os.path.join(app_config.FILE_DIRECTORY,
str(uuid.uuid1()) + '.png')
cmd = [
-        'convert', file_path, '-resample', app_config.UPSCALE_RESOLUTION,
+        'convert', file_path, '-resample', '300x300',
'-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%',
'-density', '300x300', '-units', 'PixelsPerCentimeter', '-blur',
'1x65000', '-level', '50x100%', new_file_path
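
A rough sketch of how the assembled cmd list might be executed; the actual invocation in image.py is outside this hunk, so the paths and the subprocess call below are assumptions rather than the project's code:

import subprocess

cmd = ['convert', 'input.png', '-resample', '300x300',
       '-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%',
       '-density', '300x300', '-units', 'PixelsPerCentimeter', '-blur',
       '1x65000', '-level', '50x100%', 'output.png']
subprocess.run(cmd, check=True)  # raises CalledProcessError if ImageMagick fails
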
2 changes: 1 addition & 1 deletion verifytweet/services/search.py
@@ -156,7 +156,7 @@ def search(self, user_id: str, tweet_snippet: str,
twint_config.Username = user_id
if date:
twint_config.Since = date_checker.format_for_date(date)
-            twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=1))
+            twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=2))
else:
twint_config.Search = tweet_snippet
twint_config.Limit = app_config.TWEET_MAX_STORE
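
To make the widened window concrete, a small sketch of the date arithmetic; format_for_date is assumed to render dates the way twint expects, and only the offset below is the point:

import datetime

date = datetime.datetime(2019, 7, 6)       # illustrative tweet date
since = date                               # Since: 2019-07-06
until = date + datetime.timedelta(days=2)  # Until: 2019-07-08 (previously 2019-07-07)
print(since.date(), until.date())
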
2 changes: 0 additions & 2 deletions verifytweet/util/validator.py
@@ -35,8 +35,6 @@ def verify_validity(similarity_matrix: ndarray):
"""
if not isinstance(similarity_matrix, ndarray):
raise TypeError('Similarity matrix must type numpy.ndarray')
-    if not similarity_matrix.all():
-        raise ValueError('Similarity matrix must be a valid numpy array')
row = similarity_matrix[0]
for column_index in range(1, row.shape[0]):
if row[column_index] > app_config.SIMILARITY_THRESHOLD:
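
With the ValueError guard gone, verify_validity relies only on the threshold scan shown above. A small illustrative run, assuming row 0 holds the similarity of the extracted tweet text to each candidate tweet and SIMILARITY_THRESHOLD is 0.6:

import numpy

similarity_matrix = numpy.array([[1.0, 0.41, 0.82]])
row = similarity_matrix[0]
match = any(row[i] > 0.6 for i in range(1, row.shape[0]))
print(match)  # True: the second candidate (0.82) clears the threshold
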
