Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update: Search without date #17

Merged
merged 1 commit into from
Jul 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions tests/test_search_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def test_search_invalid_input():
"""Test search for valid type invalid input
"""
with pytest.raises(ValueError):
twint_search.search('', datetime.datetime.now(), '')
twint_search.search(
'',
'',
datetime.datetime.now())


def test_search_valid_input():
Expand All @@ -60,8 +63,8 @@ def test_search_valid_input():
test_tweet_snippet = 'Sharknado'
test_tweet = 'Sharknado is real'
module_result, module_status = twint_search.search(test_user_id,
test_datetime,
test_tweet_snippet)
test_tweet_snippet,
test_datetime)
assert module_status == result.ResultStatus.ALL_OKAY
assert len(module_result) > 0
assert isinstance(module_result[0].tweet, str)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_text_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def test_clean_text_valid_input():
test_str = "Ms. Tree caught the Falcon fairing!!"
module_result, module_status = data_parser.clean_text(test_str)
assert module_status == result.ResultStatus.ALL_OKAY
assert module_result == "Ms Tree caught Falcon"
assert module_result == "caught Falcon fairing"


def test_get_similarity_empty_input():
Expand Down
6 changes: 3 additions & 3 deletions tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def test_validator_invalid_input():
def test_validator_valid_similarity_matrix():
"""Test verfiy validity for valid similarity matrix
"""
test_numpy_array = numpy.array([[0.7, 0.6], [0.5, 0.1]])
module_result, result_status = validator.verify_validity(test_numpy_array)
test_numpy_array = numpy.array([[1., 0.7, 0.6], [0.5, 0.1, 1.]])
module_result, match_index, result_status = validator.verify_validity(test_numpy_array)
assert result_status == result.ResultStatus.ALL_OKAY
assert module_result == True

Expand All @@ -63,6 +63,6 @@ def test_validator_invalid_similarity_matrix():
"""Test verfiy validity for valid similarity matrix
"""
test_numpy_array = numpy.array([[0.1, 0.1], [0.1, 0.1]])
module_result, result_status = validator.verify_validity(test_numpy_array)
module_result, match_index, result_status = validator.verify_validity(test_numpy_array)
assert result_status == result.ResultStatus.ALL_OKAY
assert module_result == False
18 changes: 9 additions & 9 deletions verifytweet/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ def run_as_command(filepath):
try:
verify_controller = controller.NonAPIApproach()
tweet_obj, controller_status = verify_controller.exec(filepath)
if controller_status == ResultStatus.MODULE_FAILURE:
print(f"Something went wrong, Please try again!")
elif controller_status == ResultStatus.NO_RESULT:
print(f"Fake Tweet!")
else:
print(f"\nVerified Tweet!")
print(
f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****"
)
except Exception as e:
logger.exception(e)
if controller_status == ResultStatus.MODULE_FAILURE:
print(f"Something went wrong, Please try again!")
elif controller_status == ResultStatus.NO_RESULT:
print(f"Fake Tweet!")
else:
print(f"\nVerified Tweet!")
print(
f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****"
)
2 changes: 1 addition & 1 deletion verifytweet/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class Config(object):
TWEET_MAX_STORE = 150
RUN_METHOD = "cli"
LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG') else logging.INFO
SIMILARITY_THRESHOLD = 0.6


class TwitterAPIConfig(Config):
Expand All @@ -60,7 +61,6 @@ class TwitterAPIConfig(Config):
TWEET_COUNT_KEY = "count"
TWEET_MAX_OLD = 7
TWEET_TEXT_KEY = "text"
SIMILARITY_THRESHOLD = 0.6


class WebConfig(Config):
Expand Down
39 changes: 16 additions & 23 deletions verifytweet/services/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os

import verifytweet.services.image as image_service
import verifytweet.services.text as text_service
import verifytweet.services.search as search_service
import verifytweet.util.date_checker as date_checker
import verifytweet.util.validator as validator
import verifytweet.util.common as common

from verifytweet.util.logging import logger
Expand Down Expand Up @@ -69,27 +70,11 @@ def exec(self, file_path: str):
return (None, ResultStatus.MODULE_FAILURE)
if search_status != ResultStatus.ALL_OKAY:
return (None, search_status)

try:
text_processor = text_service.TextProcessor()
similarity_matrix, processor_status = text_processor.get_similarity(
entities['tweet'], same_day_tweets)
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
if processor_status != ResultStatus.ALL_OKAY:
return (None, processor_status)

try:
valid_tweet, validator_status = validator.verify_validity(
similarity_matrix)
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
validity, match_index, validator_status = common.calculate_and_validate(
entities=entities, same_day_tweets=same_day_tweets)
if validator_status != ResultStatus.ALL_OKAY:
return (None, validator_status)
logger.info('Tweet Validity: ' + str(valid_tweet))
return (valid_tweet, ResultStatus.ALL_OKAY)
return (None, ResultStatus.MODULE_FAILURE)
return (same_day_tweets[match_index], ResultStatus.ALL_OKAY)


class NonAPIApproach(object):
Expand Down Expand Up @@ -136,11 +121,19 @@ def exec(self, file_path):
try:
search_controller = search_service.TwintSearch()
search_results, search_status = search_controller.search(
entities['user_id'], entities['date'], tweet_snippet)
entities['user_id'], tweet_snippet, entities['date'])
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
if search_status != ResultStatus.ALL_OKAY:
return (None, search_status)

if not entities['date']:
same_day_tweets = list()
for tweet_obj in search_results:
same_day_tweets.append(tweet_obj.tweet)
validity, match_index, validator_status = common.calculate_and_validate(
entities=entities, same_day_tweets=same_day_tweets)
if validator_status != ResultStatus.ALL_OKAY:
return (None, ResultStatus.MODULE_FAILURE)
return (search_results[match_index], ResultStatus.ALL_OKAY)
return (search_results[0], ResultStatus.ALL_OKAY)
9 changes: 8 additions & 1 deletion verifytweet/services/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import subprocess
import uuid

import PIL
import pytesseract
Expand Down Expand Up @@ -51,6 +53,10 @@ def get_text(self, file_path: str):
logger.info('Extracting text from rescaled image...')
img = PIL.Image.open(new_file_path)
text = pytesseract.image_to_string(image=img)
try:
os.remove(new_file_path)
except Exception as e:
logger.exception(e)
if not text:
return (None, ResultStatus.NO_RESULT)
return (text, ResultStatus.ALL_OKAY)
Expand All @@ -65,7 +71,8 @@ def rescale(file_path):
if not file_path:
raise ValueError('File path cannot be empty')
logger.info('Rescaling Image to 300 dpi...')
new_file_path = file_path.rsplit('.', 1)[0] + '.png'
new_file_path = os.path.join(app_config.FILE_DIRECTORY,
str(uuid.uuid1()) + '.png')
cmd = [
'convert', file_path, '-resample', app_config.UPSCALE_RESOLUTION,
'-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%',
Expand Down
18 changes: 10 additions & 8 deletions verifytweet/services/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def aggregate_tweets(self, user_id: str, date: datetime.datetime):
date) and date_checker.valid_date(tweet_date):
logger.debug('Tweet found...: ' +
str(entry[app_config.TWEET_TEXT_KEY]))
same_day_tweets.append(entry[app_config.TWEET_TEXT_KEY])
same_day_tweets.append(entry)
if not same_day_tweets:
return (same_day_tweets, ResultStatus.NO_RESULT)
return (same_day_tweets, ResultStatus.ALL_OKAY)
Expand Down Expand Up @@ -130,8 +130,8 @@ class TwintSearch(object):
def __init__(self):
pass

def search(self, user_id: str, date: datetime.datetime,
tweet_snippet: str):
def search(self, user_id: str, tweet_snippet: str,
date: datetime.datetime = None):
"""Searches for tweets

Retrieves tweets of given username, date as well as tweet snippet using Twint.
Expand All @@ -145,18 +145,20 @@ def search(self, user_id: str, date: datetime.datetime,
([<tweet_obj>], ResultStatus.ALL_OKAY)

"""
if not isinstance(user_id, str) or not isinstance(
date, datetime.datetime) or not (tweet_snippet, str):
if not isinstance(user_id, str) or not (tweet_snippet, str):
raise TypeError(
'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
)
if not user_id or not date or not tweet_snippet:
if not user_id or not tweet_snippet:
raise ValueError('User ID, Tweet or Date cannot be empty')
results = list()
twint_config = twint.Config()
twint_config.Username = user_id
twint_config.Search = tweet_snippet
twint_config.Since = date_checker.format_for_date(date)
if date:
twint_config.Since = date_checker.format_for_date(date)
twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=1))
else:
twint_config.Search = tweet_snippet
twint_config.Limit = app_config.TWEET_MAX_STORE
twint_config.Store_object = True
twint_config.Store_object_tweets_list = results
Expand Down
20 changes: 15 additions & 5 deletions verifytweet/services/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

USERNAME_REGEX = r'@(\w{1,15})\b'
DATETIME_REGEX = r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}'
ALPHANUM_REGEX = r'[^A-Za-z0-9]+'


class DataParser(object):
"""Parses data from extracted text
Expand Down Expand Up @@ -72,19 +74,27 @@ def get_entities(self, extracted_text: str):
logger.info('Parsing data out of extracted text...')
username_match = re.search(USERNAME_REGEX, extracted_text)
datetime_match = re.search(DATETIME_REGEX, extracted_text)
if not username_match or not datetime_match:
if not username_match:
return (dict({
'user_id': None,
'tweet': None,
'datetime': None
}), ResultStatus.NO_RESULT)
user_id = username_match.group()[1:]
tweet_start_index = username_match.end()
tweet_end_index = len(
extracted_text
) - 1 if not datetime_match else datetime_match.start()
tweet = extracted_text[tweet_start_index:tweet_end_index].strip()
if not datetime_match:
return (dict({
'user_id': user_id,
'tweet': tweet,
'date': None
}), ResultStatus.ALL_OKAY)
date_str = datetime_match.group().replace('-', '')
processed_datetime = date_parser.parse(date_str).replace(
tzinfo=datetime.timezone.utc)
username_end_index = username_match.end()
date_start_index = datetime_match.start()
tweet = extracted_text[username_end_index:date_start_index].strip()
return (dict({
'user_id': user_id,
'tweet': tweet,
Expand Down Expand Up @@ -114,7 +124,7 @@ def clean_text(self, extracted_text: str):
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
filtered_sentence = [w for w in word_tokens if not w in stopwords]
picked_words = filtered_sentence[0:min([len(filtered_sentence), 4])]
picked_words = filtered_sentence[2:min([len(filtered_sentence), 6])]
tweet_snippet = " ".join(picked_words)
if not tweet_snippet:
return (tweet_snippet, ResultStatus.NO_RESULT)
Expand Down
39 changes: 39 additions & 0 deletions verifytweet/util/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import verifytweet.services.image as image_service
import verifytweet.services.text as text_service
import verifytweet.util.validator as validator

from verifytweet.util.logging import logger
from verifytweet.util.result import ResultStatus
Expand Down Expand Up @@ -62,3 +63,41 @@ def extract_and_parse(file_path: str):
return (None, parser_status)
logger.debug('Entities: ' + str(entities))
return (entities, parser_status)


def calculate_and_validate(entities: dict, same_day_tweets: list):
    """Calculates similarity matrix and validates tweet

    Calculates a similarity matrix from same day tweet
    corpus using text service and validates tweet
    using validator

    Args:
        entities: represents dictionary of entities extracted from text;
            only the 'tweet' entry is read here
        same_day_tweets: list of strings representing same day tweets

    Returns:
        valid_tweet: Validity status of tweet (None on failure)
        match_index: index into same_day_tweets of the matching tweet,
            None when no tweet matched or a step failed
        status: Enum ResultStatus representing result status

        For example: (True, 0, ResultStatus.ALL_OKAY)

    """
    try:
        text_processor = text_service.TextProcessor()
        similarity_matrix, processor_status = text_processor.get_similarity(
            entities['tweet'], same_day_tweets)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if processor_status != ResultStatus.ALL_OKAY:
        return (None, None, processor_status)

    try:
        valid_tweet, match_index, validator_status = validator.verify_validity(
            similarity_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if validator_status != ResultStatus.ALL_OKAY:
        return (None, None, validator_status)
    logger.debug('Tweet Validity: ' + str(valid_tweet))
    # The validator reports "no match" as (False, None, ALL_OKAY). Subtracting
    # from None would raise TypeError, and callers index same_day_tweets with
    # match_index whenever the status is ALL_OKAY, so signal NO_RESULT instead.
    if not valid_tweet or match_index is None:
        return (valid_tweet, None, ResultStatus.NO_RESULT)
    # The validator scans the first matrix row starting at column 1, so the
    # column index is shifted by one relative to same_day_tweets.
    # NOTE(review): presumably column 0 is the extracted tweet itself - confirm
    # against TextProcessor.get_similarity.
    return (valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)
2 changes: 1 addition & 1 deletion verifytweet/util/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from verifytweet.config.settings import app_config

logger = logging.getLogger('verify_logger')
logger = logging.getLogger()
logger.setLevel(app_config.LOG_LEVEL)

handler = logging.StreamHandler(sys.stdout)
Expand Down
10 changes: 5 additions & 5 deletions verifytweet/util/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def verify_validity(similarity_matrix: ndarray):
raise TypeError('Similarity matrix must type numpy.ndarray')
if not similarity_matrix.all():
raise ValueError('Similarity matrix must be a valid numpy array')
for row in similarity_matrix:
for column in row:
if column > app_config.SIMILARITY_THRESHOLD:
return (True, ResultStatus.ALL_OKAY)
return (False, ResultStatus.ALL_OKAY)
row = similarity_matrix[0]
for column_index in range(1, row.shape[0]):
if row[column_index] > app_config.SIMILARITY_THRESHOLD:
return (True, column_index, ResultStatus.ALL_OKAY)
return (False, None, ResultStatus.ALL_OKAY)