
Commit

Choice of parsers, some fixes
soxoj committed Dec 10, 2021
1 parent 5421f14 commit d33fc0a
Showing 3 changed files with 64 additions and 17 deletions.
14 changes: 13 additions & 1 deletion README.md
@@ -51,6 +51,13 @@ Social OSINT fundamentals - Codeby.net
Links: total collected 111 / unique with username in URL 97 / reliable 38 / documents 3
```

Advanced usage:
```
./marple.py soxoj --plugins metadata
./marple.py smirnov --engines google baidu -v
```

## Installation

All you need is Python3. And pip. And requirements, of course.
@@ -80,8 +87,13 @@ Other options:
--results-count RESULTS_COUNT
Count of results parsed from each search engine
--no-url-filter Disable filtering results by usernames in URLs
--plugin {socid_extractor,metadata,maigret}
--engines {baidu,dogpile,google,bing,ask,aol,torch,yandex,naver,paginated,yahoo,startpage,duckduckgo,qwant}
Engines to run (you can choose more than one)
--plugins {socid_extractor,metadata,maigret} [{socid_extractor,metadata,maigret} ...]
Additional plugins to analyze links
-v, --verbose Display junk score for each result
-d, --debug Display all the results from sources and debug messages
-l, --list Display only list of all the URLs
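The new `--engines` and `--plugins` options documented above both accept several values. A hedged usage sketch combining them with the verbose flag (the engine selection and username are illustrative):
```
./marple.py soxoj --engines google yandex --plugins maigret metadata -v
```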
64 changes: 49 additions & 15 deletions marple.py
@@ -6,7 +6,7 @@
import re
import os
from typing import List
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from argparse import ArgumentParser as Arguments, RawDescriptionHelpFormatter
import urllib.parse

import aiohttp
@@ -32,6 +32,7 @@

links_blacklist = [
'books.google.ru',
'/search?q=',
]


@@ -110,7 +111,7 @@ def merge_links(links: List[Link], name: str, filter_by_urls: bool = True) -> Li

if filter_by_urls:
for l in links:
if name not in l.url.lower():
if name.lower() not in l.url.lower():
l.filtered = True

links = list(filter(blacklist_filter, links))
@@ -152,8 +153,11 @@ async def request(self, url, proxy=None):

async def run(self, storage, username, count=100, lang='en', proxy=None):
url = self.make_url(username, count, lang)
html = await self.request(url)
results = await self.parse(html, username)
try:
html = await self.request(url)
results = await self.parse(html, username)
except Exception as e:
return (self.name, f'Error of type "{type(e)}": {e}')

if not results:
return (self.name, f'Got no results')
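The try/except introduced above makes each parser's `run()` report failures as a `(name, message)` tuple instead of letting the exception escape the event loop. A minimal sketch of consuming that convention, assuming a caller gathers the coroutines and treats any non-None return value as an error (the aggregation below is illustrative, not the project's exact code):
```python
import asyncio

async def fake_run(name, fail=False):
    # Stand-in for Parser.run(): return a (parser_name, error_message)
    # tuple on failure, None on success (results go to shared storage).
    if fail:
        exc = TimeoutError('request timed out')
        return (name, f'Error of type "{type(exc)}": {exc}')
    return None

async def collect():
    outcomes = await asyncio.gather(fake_run('Google'), fake_run('Yandex', fail=True))
    errors = [o for o in outcomes if o]  # non-None values are error tuples
    for parser_name, message in errors:
        print(f'{parser_name}: {message}')

asyncio.run(collect())
```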
@@ -174,6 +178,8 @@ async def run(self, storage, username, count=100, lang='en', proxy=None):
try:
yandex = yandex_search.Yandex()
results = yandex.search(username).items
except KeyError as e:
return (self.name, f'Not found env variable {str(e)}')
except Exception as e:
return (self.name, str(e))

@@ -210,7 +216,7 @@ async def parse(self, html, username):


# old unused parser
class DuckParser(Parser):
class DuckParserOld(Parser):
name = 'DuckDuckGo scraping'

def make_url(self, username, count, lang):
@@ -348,6 +354,8 @@ async def run(self, storage, username, count=100, lang='en', proxy=None):
search = SerpGoogle(params)
results = search.get_dict()
organic_results = results['organic_results']
except KeyError as e:
return (self.name, f'Not found env variable {str(e)}')
except Exception as e:
return (self.name, str(e))

@@ -376,6 +384,8 @@ async def run(self, storage, username, count=100, lang='en', proxy=None):
search = SerpBaidu(params)
results = search.get_dict()
organic_results = results['organic_results']
except KeyError as e:
return (self.name, f'Not found env variable {str(e)}')
except Exception as e:
return (self.name, str(e))

@@ -403,7 +413,8 @@ def __init__(self, results, links, errors, warnings):
self.warnings = warnings


async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None):
async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None,
custom_engines=None):
parsers = [
GoogleParser(),
YandexParser(),
@@ -420,6 +431,11 @@ async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=
BaiduParser(),
]

if custom_engines:
parsers = [
globals()[e.capitalize() + 'Parser']() for e in custom_engines
]

results = []
errors = []
warnings = []
@@ -451,8 +467,16 @@ async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=
)


def get_engines_names():
names = set()
for k in globals().keys():
if k.lower().endswith('parser') and k != 'Parser':
names.add(k.split('Parser')[0].lower())
return names


def main():
parser = ArgumentParser(
parser = Arguments(
formatter_class=RawDescriptionHelpFormatter,
description='Marple v0.0.1\n'
'Collect links to profiles by username through search engines',
@@ -484,8 +508,16 @@ def main():
help='Disable filtering results by usernames in URLs',
)
parser.add_argument(
'--plugin',
'--engines',
dest='engines',
nargs='+',
choices=get_engines_names(),
help=f'Engines to run (you can choose more than one)',
)
parser.add_argument(
'--plugins',
dest='plugins',
nargs='+',
default='',
choices={'maigret', 'socid_extractor', 'metadata'},
help='Additional plugins to analyze links',
@@ -534,7 +566,9 @@ def main():

loop = asyncio.get_event_loop()

result = loop.run_until_complete(marple(username, args.results_count, args.url_filter, is_debug=args.debug, proxy=args.proxy))
result = loop.run_until_complete(marple(username, args.results_count, args.url_filter,
is_debug=args.debug, proxy=args.proxy,
custom_engines=args.engines))

total_collected_count = len(result.all_links)
uniq_count = len(result.unique_links)
@@ -543,7 +577,7 @@
for r in result.all_links:
print(f'{r.url}\n{r.title}\n')

if args.plugins == 'maigret':
if 'maigret' in args.plugins:
try:
import maigret
db = maigret.MaigretDatabase().load_from_file(maigret.__path__[0]+'/resources/data.json')
@@ -553,7 +587,7 @@
print('\tpip3 install maigret')
exit()

if args.plugins == 'socid_extractor':
if 'socid_extractor' in args.plugins:
try:
import socid_extractor
except ImportError:
@@ -582,13 +616,13 @@ def is_likely_profile(r):
message = colored(f'[{r.junk_score}]', 'magenta') + ' ' + \
colored(f'[{r.source}]', 'green') + ' ' + message

if args.plugins == 'maigret' and maigret.db:
if 'maigret' in args.plugins and maigret.db:
if maigret.db.extract_ids_from_url(r.url):
message += colored(' [v] Maigret', 'green')
else:
message += colored(' [ ] Maigret', 'yellow')

if args.plugins == 'socid_extractor':
if 'socid_extractor' in args.plugins:
try:
req = requests.get(r.url)
extract_items = socid_extractor.extract(req.text)
@@ -620,12 +654,12 @@ def is_pdf_file(url):

print(f'{message}\n{r.title}')

if args.plugins == 'metadata':
if 'metadata' in args.plugins:
filename = r.url.split('/')[-1]

try:
if not os.path.exists(filename):
print(colored(f'Downloading {r.url} to file {filename}...', 'cyan'))
print(colored(f'Downloading {r.url} to file {filename} ...', 'cyan'))
req = requests.get(r.url)
with open(filename, 'wb') as f:
f.write(req.content)
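The `--engines` flow added in this commit relies on a naming convention: `get_engines_names()` turns every `*Parser` class into a lowercase engine name, and `marple()` maps the chosen names back to classes through `globals()`. A minimal standalone sketch of that pattern, with placeholder parser classes instead of the real implementations:
```python
class Parser:
    name = 'base'

# Placeholder engines; the real project defines many more *Parser classes.
class GoogleParser(Parser):
    name = 'Google'

class YandexParser(Parser):
    name = 'Yandex'

def get_engines_names():
    # Same logic as the commit: any class ending in 'Parser' (except the
    # base class) contributes a lowercase engine name, e.g. 'google'.
    names = set()
    for k in globals().keys():
        if k.lower().endswith('parser') and k != 'Parser':
            names.add(k.split('Parser')[0].lower())
    return names

def build_parsers(custom_engines):
    # Reverse mapping used for --engines: 'google' -> GoogleParser().
    return [globals()[e.capitalize() + 'Parser']() for e in custom_engines]

print(get_engines_names())                          # {'google', 'yandex'} (order may vary)
print([p.name for p in build_parsers(['yandex'])])  # ['Yandex']
```
Note that `str.capitalize()` lowercases everything after the first character, so the reverse lookup assumes class names of the form `DuckduckgoParser` rather than `DuckDuckGoParser`.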
3 changes: 2 additions & 1 deletion requirements.txt
@@ -9,4 +9,5 @@ maigret
aiohttp_socks
search_engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip
tqdm
google-search-results
google-search-results
mock
