diff --git a/tests/test_search_service.py b/tests/test_search_service.py
index 32a0187..a54ee72 100644
--- a/tests/test_search_service.py
+++ b/tests/test_search_service.py
@@ -49,7 +49,10 @@ def test_search_invalid_input():
"""Test search for valid type invalid input
"""
with pytest.raises(ValueError):
- twint_search.search('', datetime.datetime.now(), '')
+ twint_search.search(
+ '',
+ '',
+ datetime.datetime.now())
def test_search_valid_input():
@@ -60,8 +63,8 @@ def test_search_valid_input():
test_tweet_snippet = 'Sharknado'
test_tweet = 'Sharknado is real'
module_result, module_status = twint_search.search(test_user_id,
- test_datetime,
- test_tweet_snippet)
+ test_tweet_snippet,
+ test_datetime)
assert module_status == result.ResultStatus.ALL_OKAY
assert len(module_result) > 0
assert isinstance(module_result[0].tweet, str)
diff --git a/tests/test_text_service.py b/tests/test_text_service.py
index 36ce3f3..d45062f 100644
--- a/tests/test_text_service.py
+++ b/tests/test_text_service.py
@@ -112,7 +112,7 @@ def test_clean_text_valid_input():
test_str = "Ms. Tree caught the Falcon fairing!!"
module_result, module_status = data_parser.clean_text(test_str)
assert module_status == result.ResultStatus.ALL_OKAY
- assert module_result == "Ms Tree caught Falcon"
+ assert module_result == "caught Falcon fairing"
def test_get_similarity_empty_input():
diff --git a/tests/test_validator.py b/tests/test_validator.py
index efe08d1..58324ad 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -53,8 +53,8 @@ def test_validator_invalid_input():
def test_validator_valid_similarity_matrix():
"""Test verfiy validity for valid similarity matrix
"""
- test_numpy_array = numpy.array([[0.7, 0.6], [0.5, 0.1]])
- module_result, result_status = validator.verify_validity(test_numpy_array)
+ test_numpy_array = numpy.array([[1., 0.7, 0.6], [0.5, 0.1, 1.]])
+ module_result, match_index, result_status = validator.verify_validity(test_numpy_array)
assert result_status == result.ResultStatus.ALL_OKAY
assert module_result == True
@@ -63,6 +63,6 @@ def test_validator_invalid_similarity_matrix():
"""Test verfiy validity for valid similarity matrix
"""
test_numpy_array = numpy.array([[0.1, 0.1], [0.1, 0.1]])
- module_result, result_status = validator.verify_validity(test_numpy_array)
+ module_result, match_index, result_status = validator.verify_validity(test_numpy_array)
assert result_status == result.ResultStatus.ALL_OKAY
assert module_result == False
diff --git a/verifytweet/cli.py b/verifytweet/cli.py
index eebd571..46d329a 100644
--- a/verifytweet/cli.py
+++ b/verifytweet/cli.py
@@ -52,14 +52,14 @@ def run_as_command(filepath):
try:
verify_controller = controller.NonAPIApproach()
tweet_obj, controller_status = verify_controller.exec(filepath)
+ if controller_status == ResultStatus.MODULE_FAILURE:
+ print(f"Something went wrong, Please try again!")
+ elif controller_status == ResultStatus.NO_RESULT:
+ print(f"Fake Tweet!")
+ else:
+ print(f"\nVerified Tweet!")
+ print(
+ f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****"
+ )
except Exception as e:
logger.exception(e)
- if controller_status == ResultStatus.MODULE_FAILURE:
- print(f"Something went wrong, Please try again!")
- elif controller_status == ResultStatus.NO_RESULT:
- print(f"Fake Tweet!")
- else:
- print(f"\nVerified Tweet!")
- print(
- f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****"
- )
diff --git a/verifytweet/config/settings.py b/verifytweet/config/settings.py
index 721523f..3d4dcde 100644
--- a/verifytweet/config/settings.py
+++ b/verifytweet/config/settings.py
@@ -44,6 +44,7 @@ class Config(object):
TWEET_MAX_STORE = 150
RUN_METHOD = "cli"
LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG') else logging.INFO
+ SIMILARITY_THRESHOLD = 0.6
class TwitterAPIConfig(Config):
@@ -60,7 +61,6 @@ class TwitterAPIConfig(Config):
TWEET_COUNT_KEY = "count"
TWEET_MAX_OLD = 7
TWEET_TEXT_KEY = "text"
- SIMILARITY_THRESHOLD = 0.6
class WebConfig(Config):
diff --git a/verifytweet/services/controller.py b/verifytweet/services/controller.py
index 7f1134d..0266fa2 100644
--- a/verifytweet/services/controller.py
+++ b/verifytweet/services/controller.py
@@ -16,11 +16,12 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
+import os
+
import verifytweet.services.image as image_service
import verifytweet.services.text as text_service
import verifytweet.services.search as search_service
import verifytweet.util.date_checker as date_checker
-import verifytweet.util.validator as validator
import verifytweet.util.common as common
from verifytweet.util.logging import logger
@@ -69,27 +70,11 @@ def exec(self, file_path: str):
return (None, ResultStatus.MODULE_FAILURE)
if search_status != ResultStatus.ALL_OKAY:
return (None, search_status)
-
- try:
- text_processor = text_service.TextProcessor()
- similarity_matrix, processor_status = text_processor.get_similarity(
- entities['tweet'], same_day_tweets)
- except Exception as e:
- logger.exception(e)
- return (None, ResultStatus.MODULE_FAILURE)
- if processor_status != ResultStatus.ALL_OKAY:
- return (None, processor_status)
-
- try:
- valid_tweet, validator_status = validator.verify_validity(
- similarity_matrix)
- except Exception as e:
- logger.exception(e)
- return (None, ResultStatus.MODULE_FAILURE)
+ validity, match_index, validator_status = common.calculate_and_validate(
+ entities=entities, same_day_tweets=same_day_tweets)
if validator_status != ResultStatus.ALL_OKAY:
- return (None, validator_status)
- logger.info('Tweet Validity: ' + str(valid_tweet))
- return (valid_tweet, ResultStatus.ALL_OKAY)
+ return (None, ResultStatus.MODULE_FAILURE)
+ return (same_day_tweets[match_index], ResultStatus.ALL_OKAY)
class NonAPIApproach(object):
@@ -136,11 +121,19 @@ def exec(self, file_path):
try:
search_controller = search_service.TwintSearch()
search_results, search_status = search_controller.search(
- entities['user_id'], entities['date'], tweet_snippet)
+ entities['user_id'], tweet_snippet, entities['date'])
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
if search_status != ResultStatus.ALL_OKAY:
return (None, search_status)
-
+ if not entities['date']:
+ same_day_tweets = list()
+ for tweet_obj in search_results:
+ same_day_tweets.append(tweet_obj.tweet)
+ validity, match_index, validator_status = common.calculate_and_validate(
+ entities=entities, same_day_tweets=same_day_tweets)
+ if validator_status != ResultStatus.ALL_OKAY:
+ return (None, ResultStatus.MODULE_FAILURE)
+ return (search_results[match_index], ResultStatus.ALL_OKAY)
return (search_results[0], ResultStatus.ALL_OKAY)
diff --git a/verifytweet/services/image.py b/verifytweet/services/image.py
index ac54e17..ccf098c 100644
--- a/verifytweet/services/image.py
+++ b/verifytweet/services/image.py
@@ -16,7 +16,9 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
+import os
import subprocess
+import uuid
import PIL
import pytesseract
@@ -51,6 +53,10 @@ def get_text(self, file_path: str):
logger.info('Extracting text from rescaled image...')
img = PIL.Image.open(new_file_path)
text = pytesseract.image_to_string(image=img)
+ try:
+ os.remove(new_file_path)
+ except Exception as e:
+ logger.exception(e)
if not text:
return (None, ResultStatus.NO_RESULT)
return (text, ResultStatus.ALL_OKAY)
@@ -65,7 +71,8 @@ def rescale(file_path):
if not file_path:
raise ValueError('File path cannot be empty')
logger.info('Rescaling Image to 300 dpi...')
- new_file_path = file_path.rsplit('.', 1)[0] + '.png'
+ new_file_path = os.path.join(app_config.FILE_DIRECTORY,
+ str(uuid.uuid1()) + '.png')
cmd = [
'convert', file_path, '-resample', app_config.UPSCALE_RESOLUTION,
'-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%',
diff --git a/verifytweet/services/search.py b/verifytweet/services/search.py
index 15f3b25..8db6f8c 100644
--- a/verifytweet/services/search.py
+++ b/verifytweet/services/search.py
@@ -89,7 +89,7 @@ def aggregate_tweets(self, user_id: str, date: datetime.datetime):
date) and date_checker.valid_date(tweet_date):
logger.debug('Tweet found...: ' +
str(entry[app_config.TWEET_TEXT_KEY]))
- same_day_tweets.append(entry[app_config.TWEET_TEXT_KEY])
+ same_day_tweets.append(entry)
if not same_day_tweets:
return (same_day_tweets, ResultStatus.NO_RESULT)
return (same_day_tweets, ResultStatus.ALL_OKAY)
@@ -130,8 +130,8 @@ class TwintSearch(object):
def __init__(self):
pass
- def search(self, user_id: str, date: datetime.datetime,
- tweet_snippet: str):
+ def search(self, user_id: str, tweet_snippet: str,
+ date: datetime.datetime = None):
"""Searches for tweets
Retrieves tweets of given username, date as well as tweet snippet using Twint.
@@ -145,18 +145,20 @@ def search(self, user_id: str, date: datetime.datetime,
([], ResultStatus.ALL_OKAY)
"""
- if not isinstance(user_id, str) or not isinstance(
- date, datetime.datetime) or not (tweet_snippet, str):
+        # A bare (tweet_snippet, str) tuple is always truthy; isinstance is the real check.
+        if not isinstance(user_id, str) or not isinstance(tweet_snippet, str):
raise TypeError(
'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
)
- if not user_id or not date or not tweet_snippet:
+ if not user_id or not tweet_snippet:
raise ValueError('User ID, Tweet or Date cannot be empty')
results = list()
twint_config = twint.Config()
twint_config.Username = user_id
- twint_config.Search = tweet_snippet
- twint_config.Since = date_checker.format_for_date(date)
+ if date:
+ twint_config.Since = date_checker.format_for_date(date)
+ twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=1))
+ else:
+ twint_config.Search = tweet_snippet
twint_config.Limit = app_config.TWEET_MAX_STORE
twint_config.Store_object = True
twint_config.Store_object_tweets_list = results
diff --git a/verifytweet/services/text.py b/verifytweet/services/text.py
index aeef507..07ddcde 100644
--- a/verifytweet/services/text.py
+++ b/verifytweet/services/text.py
@@ -37,6 +37,8 @@
USERNAME_REGEX = r'@(\w{1,15})\b'
DATETIME_REGEX = r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}'
+ALPHANUM_REGEX = r'[^A-Za-z0-9]+'
+
class DataParser(object):
"""Parses data from extracted text
@@ -72,19 +74,27 @@ def get_entities(self, extracted_text: str):
logger.info('Parsing data out of extracted text...')
username_match = re.search(USERNAME_REGEX, extracted_text)
datetime_match = re.search(DATETIME_REGEX, extracted_text)
- if not username_match or not datetime_match:
+ if not username_match:
return (dict({
'user_id': None,
'tweet': None,
'datetime': None
}), ResultStatus.NO_RESULT)
user_id = username_match.group()[1:]
+ tweet_start_index = username_match.end()
+        # Slice ends are exclusive: stop at the date stamp, else at end of text.
+        tweet_end_index = (datetime_match.start()
+                           if datetime_match else len(extracted_text))
+ tweet = extracted_text[tweet_start_index:tweet_end_index].strip()
+ if not datetime_match:
+ return (dict({
+ 'user_id': user_id,
+ 'tweet': tweet,
+ 'date': None
+ }), ResultStatus.ALL_OKAY)
date_str = datetime_match.group().replace('-', '')
processed_datetime = date_parser.parse(date_str).replace(
tzinfo=datetime.timezone.utc)
- username_end_index = username_match.end()
- date_start_index = datetime_match.start()
- tweet = extracted_text[username_end_index:date_start_index].strip()
return (dict({
'user_id': user_id,
'tweet': tweet,
@@ -114,7 +124,7 @@ def clean_text(self, extracted_text: str):
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
filtered_sentence = [w for w in word_tokens if not w in stopwords]
- picked_words = filtered_sentence[0:min([len(filtered_sentence), 4])]
+ picked_words = filtered_sentence[2:min([len(filtered_sentence), 6])]
tweet_snippet = " ".join(picked_words)
if not tweet_snippet:
return (tweet_snippet, ResultStatus.NO_RESULT)
diff --git a/verifytweet/util/common.py b/verifytweet/util/common.py
index 9ebd934..b5c5f80 100644
--- a/verifytweet/util/common.py
+++ b/verifytweet/util/common.py
@@ -19,6 +19,7 @@
import verifytweet.services.image as image_service
import verifytweet.services.text as text_service
+import verifytweet.util.validator as validator
from verifytweet.util.logging import logger
from verifytweet.util.result import ResultStatus
@@ -62,3 +63,41 @@ def extract_and_parse(file_path: str):
return (None, parser_status)
logger.debug('Entities: ' + str(entities))
return (entities, parser_status)
+
+
+def calculate_and_validate(entities: dict, same_day_tweets: list):
+    """Calculates similarity matrix and validates tweet
+
+    Calculates a similarity matrix from same day tweet
+    corpus using text service and validates tweet
+    using validator
+
+    Args:
+        entities: represents dictionary of entities extracted from text;
+            only the 'tweet' entry is read here
+        same_day_tweets: list of strings representing same day tweets
+
+    Returns:
+        valid_tweet: Validity status of tweet (None on failure)
+        match_index: position in same_day_tweets of the matching tweet,
+            or None when nothing matched or a step failed
+        status: Enum ResultStatus representing result status;
+            NO_RESULT when no candidate crossed the similarity threshold
+
+    """
+    try:
+        text_processor = text_service.TextProcessor()
+        similarity_matrix, processor_status = text_processor.get_similarity(
+            entities['tweet'], same_day_tweets)
+    except Exception as e:
+        logger.exception(e)
+        return (None, None, ResultStatus.MODULE_FAILURE)
+    if processor_status != ResultStatus.ALL_OKAY:
+        return (None, None, processor_status)
+
+    try:
+        valid_tweet, match_index, validator_status = validator.verify_validity(
+            similarity_matrix)
+    except Exception as e:
+        logger.exception(e)
+        return (None, None, ResultStatus.MODULE_FAILURE)
+    if validator_status != ResultStatus.ALL_OKAY:
+        return (None, None, validator_status)
+    logger.debug('Tweet Validity: ' + str(valid_tweet))
+    if not valid_tweet or match_index is None:
+        # The validator signals "no match" as (False, None, ALL_OKAY).
+        # Subtracting from None would raise TypeError, and an ALL_OKAY
+        # status would invite callers to index with None, so report
+        # NO_RESULT explicitly instead.
+        return (False, None, ResultStatus.NO_RESULT)
+    # The validator scans columns starting at 1 (column 0 is presumably the
+    # extracted tweet itself - confirm against get_similarity), so shift the
+    # index down by one to address same_day_tweets.
+    return (valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)
diff --git a/verifytweet/util/logging.py b/verifytweet/util/logging.py
index ef0b7e1..63d51ab 100644
--- a/verifytweet/util/logging.py
+++ b/verifytweet/util/logging.py
@@ -21,7 +21,7 @@
from verifytweet.config.settings import app_config
-logger = logging.getLogger('verify_logger')
+logger = logging.getLogger()
logger.setLevel(app_config.LOG_LEVEL)
handler = logging.StreamHandler(sys.stdout)
diff --git a/verifytweet/util/validator.py b/verifytweet/util/validator.py
index c2eb457..5473234 100644
--- a/verifytweet/util/validator.py
+++ b/verifytweet/util/validator.py
@@ -37,8 +37,8 @@ def verify_validity(similarity_matrix: ndarray):
raise TypeError('Similarity matrix must type numpy.ndarray')
if not similarity_matrix.all():
raise ValueError('Similarity matrix must be a valid numpy array')
- for row in similarity_matrix:
- for column in row:
- if column > app_config.SIMILARITY_THRESHOLD:
- return (True, ResultStatus.ALL_OKAY)
- return (False, ResultStatus.ALL_OKAY)
+ row = similarity_matrix[0]
+ for column_index in range(1, row.shape[0]):
+ if row[column_index] > app_config.SIMILARITY_THRESHOLD:
+ return (True, column_index, ResultStatus.ALL_OKAY)
+ return (False, None, ResultStatus.ALL_OKAY)