diff --git a/tests/test_search_service.py b/tests/test_search_service.py index 32a0187..a54ee72 100644 --- a/tests/test_search_service.py +++ b/tests/test_search_service.py @@ -49,7 +49,10 @@ def test_search_invalid_input(): """Test search for valid type invalid input """ with pytest.raises(ValueError): - twint_search.search('', datetime.datetime.now(), '') + twint_search.search( + '', + '', + datetime.datetime.now()) def test_search_valid_input(): @@ -60,8 +63,8 @@ def test_search_valid_input(): test_tweet_snippet = 'Sharknado' test_tweet = 'Sharknado is real' module_result, module_status = twint_search.search(test_user_id, - test_datetime, - test_tweet_snippet) + test_tweet_snippet, + test_datetime) assert module_status == result.ResultStatus.ALL_OKAY assert len(module_result) > 0 assert isinstance(module_result[0].tweet, str) diff --git a/tests/test_text_service.py b/tests/test_text_service.py index 36ce3f3..d45062f 100644 --- a/tests/test_text_service.py +++ b/tests/test_text_service.py @@ -112,7 +112,7 @@ def test_clean_text_valid_input(): test_str = "Ms. Tree caught the Falcon fairing!!" module_result, module_status = data_parser.clean_text(test_str) assert module_status == result.ResultStatus.ALL_OKAY - assert module_result == "Ms Tree caught Falcon" + assert module_result == "caught Falcon fairing" def test_get_similarity_empty_input(): diff --git a/tests/test_validator.py b/tests/test_validator.py index efe08d1..58324ad 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -53,8 +53,8 @@ def test_validator_invalid_input(): def test_validator_valid_similarity_matrix(): """Test verfiy validity for valid similarity matrix """ - test_numpy_array = numpy.array([[0.7, 0.6], [0.5, 0.1]]) - module_result, result_status = validator.verify_validity(test_numpy_array) + test_numpy_array = numpy.array([[1., 0.7, 0.6], [0.5, 0.1, 1.]]) + module_result, match_index, result_status = validator.verify_validity(test_numpy_array) assert result_status == result.ResultStatus.ALL_OKAY assert module_result == True @@ -63,6 +63,6 @@ def test_validator_invalid_similarity_matrix(): """Test verfiy validity for valid similarity matrix """ test_numpy_array = numpy.array([[0.1, 0.1], [0.1, 0.1]]) - module_result, result_status = validator.verify_validity(test_numpy_array) + module_result, match_index, result_status = validator.verify_validity(test_numpy_array) assert result_status == result.ResultStatus.ALL_OKAY assert module_result == False diff --git a/verifytweet/cli.py b/verifytweet/cli.py index eebd571..46d329a 100644 --- a/verifytweet/cli.py +++ b/verifytweet/cli.py @@ -52,14 +52,14 @@ def run_as_command(filepath): try: verify_controller = controller.NonAPIApproach() tweet_obj, controller_status = verify_controller.exec(filepath) + if controller_status == ResultStatus.MODULE_FAILURE: + print(f"Something went wrong, Please try again!") + elif controller_status == ResultStatus.NO_RESULT: + print(f"Fake Tweet!") + else: + print(f"\nVerified Tweet!") + print( + f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****" + ) except Exception as e: logger.exception(e) - if controller_status == ResultStatus.MODULE_FAILURE: - print(f"Something went wrong, Please try again!") - elif controller_status == ResultStatus.NO_RESULT: - print(f"Fake Tweet!") - else: - print(f"\nVerified Tweet!") - print( - f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****" - ) diff --git a/verifytweet/config/settings.py b/verifytweet/config/settings.py index 721523f..3d4dcde 100644 --- a/verifytweet/config/settings.py +++ b/verifytweet/config/settings.py @@ -44,6 +44,7 @@ class Config(object): TWEET_MAX_STORE = 150 RUN_METHOD = "cli" LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG') else logging.INFO + SIMILARITY_THRESHOLD = 0.6 class TwitterAPIConfig(Config): @@ -60,7 +61,6 @@ class TwitterAPIConfig(Config): TWEET_COUNT_KEY = "count" TWEET_MAX_OLD = 7 TWEET_TEXT_KEY = "text" - SIMILARITY_THRESHOLD = 0.6 class WebConfig(Config): diff --git a/verifytweet/services/controller.py b/verifytweet/services/controller.py index 7f1134d..0266fa2 100644 --- a/verifytweet/services/controller.py +++ b/verifytweet/services/controller.py @@ -16,11 +16,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import os + import verifytweet.services.image as image_service import verifytweet.services.text as text_service import verifytweet.services.search as search_service import verifytweet.util.date_checker as date_checker -import verifytweet.util.validator as validator import verifytweet.util.common as common from verifytweet.util.logging import logger @@ -69,27 +70,11 @@ def exec(self, file_path: str): return (None, ResultStatus.MODULE_FAILURE) if search_status != ResultStatus.ALL_OKAY: return (None, search_status) - - try: - text_processor = text_service.TextProcessor() - similarity_matrix, processor_status = text_processor.get_similarity( - entities['tweet'], same_day_tweets) - except Exception as e: - logger.exception(e) - return (None, ResultStatus.MODULE_FAILURE) - if processor_status != ResultStatus.ALL_OKAY: - return (None, processor_status) - - try: - valid_tweet, validator_status = validator.verify_validity( - similarity_matrix) - except Exception as e: - logger.exception(e) - return (None, ResultStatus.MODULE_FAILURE) + validity, match_index, validator_status = common.calculate_and_validate( + entities=entities, same_day_tweets=same_day_tweets) if validator_status != ResultStatus.ALL_OKAY: - return (None, validator_status) - logger.info('Tweet Validity: ' + str(valid_tweet)) - return (valid_tweet, ResultStatus.ALL_OKAY) + return (None, ResultStatus.MODULE_FAILURE) + return (same_day_tweets[match_index], ResultStatus.ALL_OKAY) class NonAPIApproach(object): @@ -136,11 +121,19 @@ def exec(self, file_path): try: search_controller = search_service.TwintSearch() search_results, search_status = search_controller.search( - entities['user_id'], entities['date'], tweet_snippet) + entities['user_id'], tweet_snippet, entities['date']) except Exception as e: logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) if search_status != ResultStatus.ALL_OKAY: return (None, search_status) - + if not entities['date']: + same_day_tweets = list() + for tweet_obj in search_results: + same_day_tweets.append(tweet_obj.tweet) + validity, match_index, validator_status = common.calculate_and_validate( + entities=entities, same_day_tweets=same_day_tweets) + if validator_status != ResultStatus.ALL_OKAY: + return (None, ResultStatus.MODULE_FAILURE) + return (search_results[match_index], ResultStatus.ALL_OKAY) return (search_results[0], ResultStatus.ALL_OKAY) diff --git a/verifytweet/services/image.py b/verifytweet/services/image.py index ac54e17..ccf098c 100644 --- a/verifytweet/services/image.py +++ b/verifytweet/services/image.py @@ -16,7 +16,9 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import os import subprocess +import uuid import PIL import pytesseract @@ -51,6 +53,10 @@ def get_text(self, file_path: str): logger.info('Extracting text from rescaled image...') img = PIL.Image.open(new_file_path) text = pytesseract.image_to_string(image=img) + try: + os.remove(new_file_path) + except Exception as e: + logger.exception(e) if not text: return (None, ResultStatus.NO_RESULT) return (text, ResultStatus.ALL_OKAY) @@ -65,7 +71,8 @@ def rescale(file_path): if not file_path: raise ValueError('File path cannot be empty') logger.info('Rescaling Image to 300 dpi...') - new_file_path = file_path.rsplit('.', 1)[0] + '.png' + new_file_path = os.path.join(app_config.FILE_DIRECTORY, + str(uuid.uuid1()) + '.png') cmd = [ 'convert', file_path, '-resample', app_config.UPSCALE_RESOLUTION, '-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%', diff --git a/verifytweet/services/search.py b/verifytweet/services/search.py index 15f3b25..8db6f8c 100644 --- a/verifytweet/services/search.py +++ b/verifytweet/services/search.py @@ -89,7 +89,7 @@ def aggregate_tweets(self, user_id: str, date: datetime.datetime): date) and date_checker.valid_date(tweet_date): logger.debug('Tweet found...: ' + str(entry[app_config.TWEET_TEXT_KEY])) - same_day_tweets.append(entry[app_config.TWEET_TEXT_KEY]) + same_day_tweets.append(entry) if not same_day_tweets: return (same_day_tweets, ResultStatus.NO_RESULT) return (same_day_tweets, ResultStatus.ALL_OKAY) @@ -130,8 +130,8 @@ class TwintSearch(object): def __init__(self): pass - def search(self, user_id: str, date: datetime.datetime, - tweet_snippet: str): + def search(self, user_id: str, tweet_snippet: str, + date: datetime.datetime = None): """Searches for tweets Retrieves tweets of given username, date as well as tweet snippet using Twint. @@ -145,18 +145,20 @@ def search(self, user_id: str, date: datetime.datetime, ([], ResultStatus.ALL_OKAY) """ - if not isinstance(user_id, str) or not isinstance( - date, datetime.datetime) or not (tweet_snippet, str): + if not isinstance(user_id, str) or not (tweet_snippet, str): raise TypeError( 'User ID and tweet_snippet must be type string, date must be type datetime.datetime' ) - if not user_id or not date or not tweet_snippet: + if not user_id or not tweet_snippet: raise ValueError('User ID, Tweet or Date cannot be empty') results = list() twint_config = twint.Config() twint_config.Username = user_id - twint_config.Search = tweet_snippet - twint_config.Since = date_checker.format_for_date(date) + if date: + twint_config.Since = date_checker.format_for_date(date) + twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=1)) + else: + twint_config.Search = tweet_snippet twint_config.Limit = app_config.TWEET_MAX_STORE twint_config.Store_object = True twint_config.Store_object_tweets_list = results diff --git a/verifytweet/services/text.py b/verifytweet/services/text.py index aeef507..07ddcde 100644 --- a/verifytweet/services/text.py +++ b/verifytweet/services/text.py @@ -37,6 +37,8 @@ USERNAME_REGEX = r'@(\w{1,15})\b' DATETIME_REGEX = r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}' +ALPHANUM_REGEX = r'[^A-Za-z0-9]+' + class DataParser(object): """Parses data from extracted text @@ -72,19 +74,27 @@ def get_entities(self, extracted_text: str): logger.info('Parsing data out of extracted text...') username_match = re.search(USERNAME_REGEX, extracted_text) datetime_match = re.search(DATETIME_REGEX, extracted_text) - if not username_match or not datetime_match: + if not username_match: return (dict({ 'user_id': None, 'tweet': None, 'datetime': None }), ResultStatus.NO_RESULT) user_id = username_match.group()[1:] + tweet_start_index = username_match.end() + tweet_end_index = len( + extracted_text + ) - 1 if not datetime_match else datetime_match.start() + tweet = extracted_text[tweet_start_index:tweet_end_index].strip() + if not datetime_match: + return (dict({ + 'user_id': user_id, + 'tweet': tweet, + 'date': None + }), ResultStatus.ALL_OKAY) date_str = datetime_match.group().replace('-', '') processed_datetime = date_parser.parse(date_str).replace( tzinfo=datetime.timezone.utc) - username_end_index = username_match.end() - date_start_index = datetime_match.start() - tweet = extracted_text[username_end_index:date_start_index].strip() return (dict({ 'user_id': user_id, 'tweet': tweet, @@ -114,7 +124,7 @@ def clean_text(self, extracted_text: str): logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) filtered_sentence = [w for w in word_tokens if not w in stopwords] - picked_words = filtered_sentence[0:min([len(filtered_sentence), 4])] + picked_words = filtered_sentence[2:min([len(filtered_sentence), 6])] tweet_snippet = " ".join(picked_words) if not tweet_snippet: return (tweet_snippet, ResultStatus.NO_RESULT) diff --git a/verifytweet/util/common.py b/verifytweet/util/common.py index 9ebd934..b5c5f80 100644 --- a/verifytweet/util/common.py +++ b/verifytweet/util/common.py @@ -19,6 +19,7 @@ import verifytweet.services.image as image_service import verifytweet.services.text as text_service +import verifytweet.util.validator as validator from verifytweet.util.logging import logger from verifytweet.util.result import ResultStatus @@ -62,3 +63,41 @@ def extract_and_parse(file_path: str): return (None, parser_status) logger.debug('Entities: ' + str(entities)) return (entities, parser_status) + + +def calculate_and_validate(entities: dict, same_day_tweets: list): + """Calculates similarity matrix and validates tweet + + Calculates a similarity matrix from same day tweet + corpus using text service and validates tweet + using validator + + Args: + entities: represents dictionary of entities extracted from text + same_day_tweets: list of strings representing same day tweets + + Returns: + valid_tweet: Validity status of tweet + status: Enum ResultStatus representing result status + + """ + try: + text_processor = text_service.TextProcessor() + similarity_matrix, processor_status = text_processor.get_similarity( + entities['tweet'], same_day_tweets) + except Exception as e: + logger.exception(e) + return (None, None, ResultStatus.MODULE_FAILURE) + if processor_status != ResultStatus.ALL_OKAY: + return (None, None, processor_status) + + try: + valid_tweet, match_index, validator_status = validator.verify_validity( + similarity_matrix) + except Exception as e: + logger.exception(e) + return (None, None, ResultStatus.MODULE_FAILURE) + if validator_status != ResultStatus.ALL_OKAY: + return (None, None, validator_status) + logger.debug('Tweet Validity: ' + str(valid_tweet)) + return (valid_tweet, match_index-1, ResultStatus.ALL_OKAY) diff --git a/verifytweet/util/logging.py b/verifytweet/util/logging.py index ef0b7e1..63d51ab 100644 --- a/verifytweet/util/logging.py +++ b/verifytweet/util/logging.py @@ -21,7 +21,7 @@ from verifytweet.config.settings import app_config -logger = logging.getLogger('verify_logger') +logger = logging.getLogger() logger.setLevel(app_config.LOG_LEVEL) handler = logging.StreamHandler(sys.stdout) diff --git a/verifytweet/util/validator.py b/verifytweet/util/validator.py index c2eb457..5473234 100644 --- a/verifytweet/util/validator.py +++ b/verifytweet/util/validator.py @@ -37,8 +37,8 @@ def verify_validity(similarity_matrix: ndarray): raise TypeError('Similarity matrix must type numpy.ndarray') if not similarity_matrix.all(): raise ValueError('Similarity matrix must be a valid numpy array') - for row in similarity_matrix: - for column in row: - if column > app_config.SIMILARITY_THRESHOLD: - return (True, ResultStatus.ALL_OKAY) - return (False, ResultStatus.ALL_OKAY) + row = similarity_matrix[0] + for column_index in range(1, row.shape[0]): + if row[column_index] > app_config.SIMILARITY_THRESHOLD: + return (True, column_index, ResultStatus.ALL_OKAY) + return (False, None, ResultStatus.ALL_OKAY)