Skip to content

Commit

Permalink
Merge pull request #17 from kamidipreetham/feature/text-search
Browse files Browse the repository at this point in the history
Update: Search without date
  • Loading branch information
Preetham Kamidi committed Jul 9, 2019
2 parents 0d18dd2 + 211cd56 commit 65f497b
Show file tree
Hide file tree
Showing 12 changed files with 114 additions and 60 deletions.
9 changes: 6 additions & 3 deletions tests/test_search_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def test_search_invalid_input():
"""Test search for valid type invalid input
"""
with pytest.raises(ValueError):
twint_search.search('', datetime.datetime.now(), '')
twint_search.search(
'',
'',
datetime.datetime.now())


def test_search_valid_input():
Expand All @@ -60,8 +63,8 @@ def test_search_valid_input():
test_tweet_snippet = 'Sharknado'
test_tweet = 'Sharknado is real'
module_result, module_status = twint_search.search(test_user_id,
test_datetime,
test_tweet_snippet)
test_tweet_snippet,
test_datetime)
assert module_status == result.ResultStatus.ALL_OKAY
assert len(module_result) > 0
assert isinstance(module_result[0].tweet, str)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_text_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def test_clean_text_valid_input():
test_str = "Ms. Tree caught the Falcon fairing!!"
module_result, module_status = data_parser.clean_text(test_str)
assert module_status == result.ResultStatus.ALL_OKAY
assert module_result == "Ms Tree caught Falcon"
assert module_result == "caught Falcon fairing"


def test_get_similarity_empty_input():
Expand Down
6 changes: 3 additions & 3 deletions tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def test_validator_invalid_input():
def test_validator_valid_similarity_matrix():
"""Test verify validity for valid similarity matrix
"""
test_numpy_array = numpy.array([[0.7, 0.6], [0.5, 0.1]])
module_result, result_status = validator.verify_validity(test_numpy_array)
test_numpy_array = numpy.array([[1., 0.7, 0.6], [0.5, 0.1, 1.]])
module_result, match_index, result_status = validator.verify_validity(test_numpy_array)
assert result_status == result.ResultStatus.ALL_OKAY
assert module_result == True

Expand All @@ -63,6 +63,6 @@ def test_validator_invalid_similarity_matrix():
"""Test verify validity for invalid similarity matrix
"""
test_numpy_array = numpy.array([[0.1, 0.1], [0.1, 0.1]])
module_result, result_status = validator.verify_validity(test_numpy_array)
module_result, match_index, result_status = validator.verify_validity(test_numpy_array)
assert result_status == result.ResultStatus.ALL_OKAY
assert module_result == False
18 changes: 9 additions & 9 deletions verifytweet/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ def run_as_command(filepath):
try:
verify_controller = controller.NonAPIApproach()
tweet_obj, controller_status = verify_controller.exec(filepath)
if controller_status == ResultStatus.MODULE_FAILURE:
print(f"Something went wrong, Please try again!")
elif controller_status == ResultStatus.NO_RESULT:
print(f"Fake Tweet!")
else:
print(f"\nVerified Tweet!")
print(
f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****"
)
except Exception as e:
logger.exception(e)
if controller_status == ResultStatus.MODULE_FAILURE:
print(f"Something went wrong, Please try again!")
elif controller_status == ResultStatus.NO_RESULT:
print(f"Fake Tweet!")
else:
print(f"\nVerified Tweet!")
print(
f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****"
)
2 changes: 1 addition & 1 deletion verifytweet/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class Config(object):
TWEET_MAX_STORE = 150
RUN_METHOD = "cli"
LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG') else logging.INFO
SIMILARITY_THRESHOLD = 0.6


class TwitterAPIConfig(Config):
Expand All @@ -60,7 +61,6 @@ class TwitterAPIConfig(Config):
TWEET_COUNT_KEY = "count"
TWEET_MAX_OLD = 7
TWEET_TEXT_KEY = "text"
SIMILARITY_THRESHOLD = 0.6


class WebConfig(Config):
Expand Down
39 changes: 16 additions & 23 deletions verifytweet/services/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os

import verifytweet.services.image as image_service
import verifytweet.services.text as text_service
import verifytweet.services.search as search_service
import verifytweet.util.date_checker as date_checker
import verifytweet.util.validator as validator
import verifytweet.util.common as common

from verifytweet.util.logging import logger
Expand Down Expand Up @@ -69,27 +70,11 @@ def exec(self, file_path: str):
return (None, ResultStatus.MODULE_FAILURE)
if search_status != ResultStatus.ALL_OKAY:
return (None, search_status)

try:
text_processor = text_service.TextProcessor()
similarity_matrix, processor_status = text_processor.get_similarity(
entities['tweet'], same_day_tweets)
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
if processor_status != ResultStatus.ALL_OKAY:
return (None, processor_status)

try:
valid_tweet, validator_status = validator.verify_validity(
similarity_matrix)
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
validity, match_index, validator_status = common.calculate_and_validate(
entities=entities, same_day_tweets=same_day_tweets)
if validator_status != ResultStatus.ALL_OKAY:
return (None, validator_status)
logger.info('Tweet Validity: ' + str(valid_tweet))
return (valid_tweet, ResultStatus.ALL_OKAY)
return (None, ResultStatus.MODULE_FAILURE)
return (same_day_tweets[match_index], ResultStatus.ALL_OKAY)


class NonAPIApproach(object):
Expand Down Expand Up @@ -136,11 +121,19 @@ def exec(self, file_path):
try:
search_controller = search_service.TwintSearch()
search_results, search_status = search_controller.search(
entities['user_id'], entities['date'], tweet_snippet)
entities['user_id'], tweet_snippet, entities['date'])
except Exception as e:
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
if search_status != ResultStatus.ALL_OKAY:
return (None, search_status)

if not entities['date']:
same_day_tweets = list()
for tweet_obj in search_results:
same_day_tweets.append(tweet_obj.tweet)
validity, match_index, validator_status = common.calculate_and_validate(
entities=entities, same_day_tweets=same_day_tweets)
if validator_status != ResultStatus.ALL_OKAY:
return (None, ResultStatus.MODULE_FAILURE)
return (search_results[match_index], ResultStatus.ALL_OKAY)
return (search_results[0], ResultStatus.ALL_OKAY)
9 changes: 8 additions & 1 deletion verifytweet/services/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import subprocess
import uuid

import PIL
import pytesseract
Expand Down Expand Up @@ -51,6 +53,10 @@ def get_text(self, file_path: str):
logger.info('Extracting text from rescaled image...')
img = PIL.Image.open(new_file_path)
text = pytesseract.image_to_string(image=img)
try:
os.remove(new_file_path)
except Exception as e:
logger.exception(e)
if not text:
return (None, ResultStatus.NO_RESULT)
return (text, ResultStatus.ALL_OKAY)
Expand All @@ -65,7 +71,8 @@ def rescale(file_path):
if not file_path:
raise ValueError('File path cannot be empty')
logger.info('Rescaling Image to 300 dpi...')
new_file_path = file_path.rsplit('.', 1)[0] + '.png'
new_file_path = os.path.join(app_config.FILE_DIRECTORY,
str(uuid.uuid1()) + '.png')
cmd = [
'convert', file_path, '-resample', app_config.UPSCALE_RESOLUTION,
'-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%',
Expand Down
18 changes: 10 additions & 8 deletions verifytweet/services/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def aggregate_tweets(self, user_id: str, date: datetime.datetime):
date) and date_checker.valid_date(tweet_date):
logger.debug('Tweet found...: ' +
str(entry[app_config.TWEET_TEXT_KEY]))
same_day_tweets.append(entry[app_config.TWEET_TEXT_KEY])
same_day_tweets.append(entry)
if not same_day_tweets:
return (same_day_tweets, ResultStatus.NO_RESULT)
return (same_day_tweets, ResultStatus.ALL_OKAY)
Expand Down Expand Up @@ -130,8 +130,8 @@ class TwintSearch(object):
def __init__(self):
pass

def search(self, user_id: str, date: datetime.datetime,
tweet_snippet: str):
def search(self, user_id: str, tweet_snippet: str,
date: datetime.datetime = None):
"""Searches for tweets
Retrieves tweets of given username, date as well as tweet snippet using Twint.
Expand All @@ -145,18 +145,20 @@ def search(self, user_id: str, date: datetime.datetime,
([<tweet_obj>], ResultStatus.ALL_OKAY)
"""
if not isinstance(user_id, str) or not isinstance(
date, datetime.datetime) or not (tweet_snippet, str):
if not isinstance(user_id, str) or not (tweet_snippet, str):
raise TypeError(
'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
)
if not user_id or not date or not tweet_snippet:
if not user_id or not tweet_snippet:
raise ValueError('User ID, Tweet or Date cannot be empty')
results = list()
twint_config = twint.Config()
twint_config.Username = user_id
twint_config.Search = tweet_snippet
twint_config.Since = date_checker.format_for_date(date)
if date:
twint_config.Since = date_checker.format_for_date(date)
twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=1))
else:
twint_config.Search = tweet_snippet
twint_config.Limit = app_config.TWEET_MAX_STORE
twint_config.Store_object = True
twint_config.Store_object_tweets_list = results
Expand Down
20 changes: 15 additions & 5 deletions verifytweet/services/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

USERNAME_REGEX = r'@(\w{1,15})\b'
DATETIME_REGEX = r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}'
ALPHANUM_REGEX = r'[^A-Za-z0-9]+'


class DataParser(object):
"""Parses data from extracted text
Expand Down Expand Up @@ -72,19 +74,27 @@ def get_entities(self, extracted_text: str):
logger.info('Parsing data out of extracted text...')
username_match = re.search(USERNAME_REGEX, extracted_text)
datetime_match = re.search(DATETIME_REGEX, extracted_text)
if not username_match or not datetime_match:
if not username_match:
return (dict({
'user_id': None,
'tweet': None,
'datetime': None
}), ResultStatus.NO_RESULT)
user_id = username_match.group()[1:]
tweet_start_index = username_match.end()
tweet_end_index = len(
extracted_text
) - 1 if not datetime_match else datetime_match.start()
tweet = extracted_text[tweet_start_index:tweet_end_index].strip()
if not datetime_match:
return (dict({
'user_id': user_id,
'tweet': tweet,
'date': None
}), ResultStatus.ALL_OKAY)
date_str = datetime_match.group().replace('-', '')
processed_datetime = date_parser.parse(date_str).replace(
tzinfo=datetime.timezone.utc)
username_end_index = username_match.end()
date_start_index = datetime_match.start()
tweet = extracted_text[username_end_index:date_start_index].strip()
return (dict({
'user_id': user_id,
'tweet': tweet,
Expand Down Expand Up @@ -114,7 +124,7 @@ def clean_text(self, extracted_text: str):
logger.exception(e)
return (None, ResultStatus.MODULE_FAILURE)
filtered_sentence = [w for w in word_tokens if not w in stopwords]
picked_words = filtered_sentence[0:min([len(filtered_sentence), 4])]
picked_words = filtered_sentence[2:min([len(filtered_sentence), 6])]
tweet_snippet = " ".join(picked_words)
if not tweet_snippet:
return (tweet_snippet, ResultStatus.NO_RESULT)
Expand Down
39 changes: 39 additions & 0 deletions verifytweet/util/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import verifytweet.services.image as image_service
import verifytweet.services.text as text_service
import verifytweet.util.validator as validator

from verifytweet.util.logging import logger
from verifytweet.util.result import ResultStatus
Expand Down Expand Up @@ -62,3 +63,41 @@ def extract_and_parse(file_path: str):
return (None, parser_status)
logger.debug('Entities: ' + str(entities))
return (entities, parser_status)


def calculate_and_validate(entities: dict, same_day_tweets: list):
    """Calculates similarity matrix and validates tweet

    Calculates a similarity matrix from the same day tweet corpus
    using the text service and validates the tweet using the validator.

    Args:
        entities: represents dictionary of entities extracted from text;
            the value under the 'tweet' key is compared with the corpus
        same_day_tweets: list of strings representing same day tweets

    Returns:
        valid_tweet: Validity status of tweet (None when a step failed)
        match_index: index into same_day_tweets of the matching tweet,
            or None when no tweet matched or a step failed
        status: Enum ResultStatus representing result status;
            NO_RESULT when the validator ran but found no match
    """
    try:
        text_processor = text_service.TextProcessor()
        similarity_matrix, processor_status = text_processor.get_similarity(
            entities['tweet'], same_day_tweets)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if processor_status != ResultStatus.ALL_OKAY:
        return (None, None, processor_status)

    try:
        valid_tweet, match_index, validator_status = validator.verify_validity(
            similarity_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if validator_status != ResultStatus.ALL_OKAY:
        return (None, None, validator_status)
    logger.debug('Tweet Validity: ' + str(valid_tweet))
    # The validator reports no match as a None index; subtracting from it
    # would raise TypeError, so surface the miss as NO_RESULT instead of
    # handing callers an index they would go on to subscript with.
    if match_index is None:
        return (valid_tweet, None, ResultStatus.NO_RESULT)
    # The validator's index is shifted down by one to address
    # same_day_tweets (presumably column 0 of the matrix is the extracted
    # tweet itself - TODO confirm against TextProcessor.get_similarity).
    return (valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)
2 changes: 1 addition & 1 deletion verifytweet/util/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from verifytweet.config.settings import app_config

logger = logging.getLogger('verify_logger')
logger = logging.getLogger()
logger.setLevel(app_config.LOG_LEVEL)

handler = logging.StreamHandler(sys.stdout)
Expand Down
10 changes: 5 additions & 5 deletions verifytweet/util/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def verify_validity(similarity_matrix: ndarray):
raise TypeError('Similarity matrix must type numpy.ndarray')
if not similarity_matrix.all():
raise ValueError('Similarity matrix must be a valid numpy array')
for row in similarity_matrix:
for column in row:
if column > app_config.SIMILARITY_THRESHOLD:
return (True, ResultStatus.ALL_OKAY)
return (False, ResultStatus.ALL_OKAY)
row = similarity_matrix[0]
for column_index in range(1, row.shape[0]):
if row[column_index] > app_config.SIMILARITY_THRESHOLD:
return (True, column_index, ResultStatus.ALL_OKAY)
return (False, None, ResultStatus.ALL_OKAY)

0 comments on commit 65f497b

Please sign in to comment.