
Commit

Choice of parsers, some fixes
soxoj committed Dec 10, 2021
1 parent 5421f14 commit d33fc0a
Showing 3 changed files with 64 additions and 17 deletions.
14 changes: 13 additions & 1 deletion README.md
@@ -51,6 +51,13 @@ Social OSINT fundamentals - Codeby.net
Links: total collected 111 / unique with username in URL 97 / reliable 38 / documents 3
```

Advanced usage:
```
./marple.py soxoj --plugins metadata
./marple.py smirnov --engines google baidu -v
```

## Installation

All you need is Python3. And pip. And requirements, of course.
@@ -80,8 +87,13 @@ Other options:
--results-count RESULTS_COUNT
Count of results parsed from each search engine
--no-url-filter Disable filtering results by usernames in URLs
--plugin {socid_extractor,metadata,maigret}
--engines {baidu,dogpile,google,bing,ask,aol,torch,yandex,naver,paginated,yahoo,startpage,duckduckgo,qwant}
Engines to run (you can choose more than one)
--plugins {socid_extractor,metadata,maigret} [{socid_extractor,metadata,maigret} ...]
Additional plugins to analyze links
-v, --verbose Display junk score for each result
-d, --debug Display all the results from sources and debug messages
-l, --list Display only list of all the URLs
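The new `--engines` and `--plugins` options documented above both accept several values. A hedged usage sketch combining them with the verbose flag (the engine selection and username are illustrative):
```
./marple.py soxoj --engines google yandex --plugins maigret metadata -v
```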
64 changes: 49 additions & 15 deletions marple.py
@@ -6,7 +6,7 @@
import re
import os
from typing import List
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from argparse import ArgumentParser as Arguments, RawDescriptionHelpFormatter
import urllib.parse

import aiohttp
@@ -32,6 +32,7 @@

links_blacklist = [
'books.google.ru',
'/search?q=',
]


@@ -110,7 +111,7 @@ def merge_links(links: List[Link], name: str, filter_by_urls: bool = True) -> Li

if filter_by_urls:
for l in links:
if name not in l.url.lower():
if name.lower() not in l.url.lower():
l.filtered = True

links = list(filter(blacklist_filter, links))
@@ -152,8 +153,11 @@ async def request(self, url, proxy=None):

async def run(self, storage, username, count=100, lang='en', proxy=None):
url = self.make_url(username, count, lang)
html = await self.request(url)
results = await self.parse(html, username)
try:
html = await self.request(url)
results = await self.parse(html, username)
except Exception as e:
return (self.name, f'Error of type "{type(e)}": {e}')

if not results:
return (self.name, f'Got no results')
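The try/except introduced above makes each parser's `run()` report failures as a `(name, message)` tuple instead of letting the exception escape the event loop. A minimal sketch of consuming that convention, assuming a caller gathers the coroutines and treats any non-None return value as an error (the aggregation below is illustrative, not the project's exact code):
```python
import asyncio

async def fake_run(name, fail=False):
    # Stand-in for Parser.run(): return a (parser_name, error_message)
    # tuple on failure, None on success (results go to shared storage).
    if fail:
        exc = TimeoutError('request timed out')
        return (name, f'Error of type "{type(exc)}": {exc}')
    return None

async def collect():
    outcomes = await asyncio.gather(fake_run('Google'), fake_run('Yandex', fail=True))
    errors = [o for o in outcomes if o]  # non-None values are error tuples
    for parser_name, message in errors:
        print(f'{parser_name}: {message}')

asyncio.run(collect())
```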
@@ -174,6 +178,8 @@ async def run(self, storage, username, count=100, lang='en', proxy=None):
try:
yandex = yandex_search.Yandex()
results = yandex.search(username).items
except KeyError as e:
return (self.name, f'Not found env variable {str(e)}')
except Exception as e:
return (self.name, str(e))

@@ -210,7 +216,7 @@ async def parse(self, html, username):


# old unused parser
class DuckParser(Parser):
class DuckParserOld(Parser):
name = 'DuckDuckGo scraping'

def make_url(self, username, count, lang):
@@ -348,6 +354,8 @@ async def run(self, storage, username, count=100, lang='en', proxy=None):
search = SerpGoogle(params)
results = search.get_dict()
organic_results = results['organic_results']
except KeyError as e:
return (self.name, f'Not found env variable {str(e)}')
except Exception as e:
return (self.name, str(e))

@@ -376,6 +384,8 @@ async def run(self, storage, username, count=100, lang='en', proxy=None):
search = SerpBaidu(params)
results = search.get_dict()
organic_results = results['organic_results']
except KeyError as e:
return (self.name, f'Not found env variable {str(e)}')
except Exception as e:
return (self.name, str(e))

@@ -403,7 +413,8 @@ def __init__(self, results, links, errors, warnings):
self.warnings = warnings


async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None):
async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None,
custom_engines=None):
parsers = [
GoogleParser(),
YandexParser(),
@@ -420,6 +431,11 @@ async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=
BaiduParser(),
]

if custom_engines:
parsers = [
globals()[e.capitalize() + 'Parser']() for e in custom_engines
]

results = []
errors = []
warnings = []
@@ -451,8 +467,16 @@ async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=
)


def get_engines_names():
names = set()
for k in globals().keys():
if k.lower().endswith('parser') and k != 'Parser':
names.add(k.split('Parser')[0].lower())
return names


def main():
parser = ArgumentParser(
parser = Arguments(
formatter_class=RawDescriptionHelpFormatter,
description='Marple v0.0.1\n'
'Collect links to profiles by username through search engines',
@@ -484,8 +508,16 @@ def main():
help='Disable filtering results by usernames in URLs',
)
parser.add_argument(
'--plugin',
'--engines',
dest='engines',
nargs='+',
choices=get_engines_names(),
help=f'Engines to run (you can choose more than one)',
)
parser.add_argument(
'--plugins',
dest='plugins',
nargs='+',
default='',
choices={'maigret', 'socid_extractor', 'metadata'},
help='Additional plugins to analyze links',
@@ -534,7 +566,9 @@ def main():

loop = asyncio.get_event_loop()

result = loop.run_until_complete(marple(username, args.results_count, args.url_filter, is_debug=args.debug, proxy=args.proxy))
result = loop.run_until_complete(marple(username, args.results_count, args.url_filter,
is_debug=args.debug, proxy=args.proxy,
custom_engines=args.engines))

total_collected_count = len(result.all_links)
uniq_count = len(result.unique_links)
@@ -543,7 +577,7 @@
for r in result.all_links:
print(f'{r.url}\n{r.title}\n')

if args.plugins == 'maigret':
if 'maigret' in args.plugins:
try:
import maigret
db = maigret.MaigretDatabase().load_from_file(maigret.__path__[0]+'/resources/data.json')
@@ -553,7 +587,7 @@
print('\tpip3 install maigret')
exit()

if args.plugins == 'socid_extractor':
if 'socid_extractor' in args.plugins:
try:
import socid_extractor
except ImportError:
@@ -582,13 +616,13 @@ def is_likely_profile(r):
message = colored(f'[{r.junk_score}]', 'magenta') + ' ' + \
colored(f'[{r.source}]', 'green') + ' ' + message

if args.plugins == 'maigret' and maigret.db:
if 'maigret' in args.plugins and maigret.db:
if maigret.db.extract_ids_from_url(r.url):
message += colored(' [v] Maigret', 'green')
else:
message += colored(' [ ] Maigret', 'yellow')

if args.plugins == 'socid_extractor':
if 'socid_extractor' in args.plugins:
try:
req = requests.get(r.url)
extract_items = socid_extractor.extract(req.text)
@@ -620,12 +654,12 @@ def is_pdf_file(url):

print(f'{message}\n{r.title}')

if args.plugins == 'metadata':
if 'metadata' in args.plugins:
filename = r.url.split('/')[-1]

try:
if not os.path.exists(filename):
print(colored(f'Downloading {r.url} to file {filename}...', 'cyan'))
print(colored(f'Downloading {r.url} to file {filename} ...', 'cyan'))
req = requests.get(r.url)
with open(filename, 'wb') as f:
f.write(req.content)
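The `--engines` flow added in this commit relies on a naming convention: `get_engines_names()` turns every `*Parser` class into a lowercase engine name, and `marple()` maps the chosen names back to classes through `globals()`. A minimal standalone sketch of that pattern, with placeholder parser classes instead of the real implementations:
```python
class Parser:
    name = 'base'

# Placeholder engines; the real project defines many more *Parser classes.
class GoogleParser(Parser):
    name = 'Google'

class YandexParser(Parser):
    name = 'Yandex'

def get_engines_names():
    # Same logic as the commit: any class ending in 'Parser' (except the
    # base class) contributes a lowercase engine name, e.g. 'google'.
    names = set()
    for k in globals().keys():
        if k.lower().endswith('parser') and k != 'Parser':
            names.add(k.split('Parser')[0].lower())
    return names

def build_parsers(custom_engines):
    # Reverse mapping used for --engines: 'google' -> GoogleParser().
    return [globals()[e.capitalize() + 'Parser']() for e in custom_engines]

print(get_engines_names())                          # {'google', 'yandex'} (order may vary)
print([p.name for p in build_parsers(['yandex'])])  # ['Yandex']
```
Note that `str.capitalize()` lowercases everything after the first character, so the reverse lookup assumes class names of the form `DuckduckgoParser` rather than `DuckDuckGoParser`.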
3 changes: 2 additions & 1 deletion requirements.txt
@@ -9,4 +9,5 @@ maigret
aiohttp_socks
search_engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip
tqdm
google-search-results
google-search-results
mock
