CSV export, Yahoo
soxoj committed Dec 7, 2021
1 parent 23e7551 commit 60df5b7
Showing 2 changed files with 64 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -7,6 +7,7 @@ Collect links to profiles by username through search engines (see the full list
Features:
- multiple engines
- proxy support
- CSV file export
- plugins
- pdf metadata extraction
- social media info [extraction](socid_extractor)
@@ -73,6 +74,7 @@ Other options:
-d, --debug Display all the results from sources and debug messages
-l, --list Display only list of all the URLs
--proxy PROXY Proxy string (e.g. https://user:pass@1.2.3.4:8080)
--csv CSV Save results to the CSV file
```

## Supported sources
@@ -82,6 +84,7 @@ Other options:
| Google | scraping | None, works out of the box; frequent captcha |
| DuckDuckGo | scraping | None, works out of the box |
| Yandex | XML API | [Register and get USER/API tokens](https://github.com/fluquid/yandex-search) |
| Yahoo | scraping | None, works out of the box; only the first page of results for now |

## Development & testing

64 changes: 61 additions & 3 deletions marple.py
@@ -1,10 +1,12 @@
#!/usr/bin/env python3
import asyncio
import csv
import json
import re
import os
from typing import List
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import urllib.parse

import aiohttp
import requests
@@ -218,6 +220,35 @@ async def parse(self, html, username):
        return results


class YahooParser(Parser):
    name = 'Yahoo scraping'

    def make_url(self, username, count, lang):
        return 'https://search.yahoo.com/search?p={}&ei=UTF-8&nojs=1'.format(username)

    async def parse(self, html, username):
        results = []

        soup = bs(html, 'html.parser')
        result_block = soup.find_all('div', class_='compTitle')

        for result in result_block:
            header = result.find('h3', class_='title')
            if not header or not header.find('a'):
                continue

            # the span holds the displayed URL; strip it from the anchor text to get the title
            pseudo_link = header.find('span').text
            title = header.find('a').text[len(pseudo_link):]
            link = header.find('a')['href']

            # Yahoo wraps result URLs in a redirect; the target is URL-encoded between /RU= and /RK=2
            link = urllib.parse.unquote(link.split('/RU=')[1].split('/RK=2')[0])

            if link and title:
                results.append(Link(link, title, username))

        return results
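
As a quick illustration of what the link unwrapping above does, here is a minimal standalone sketch; the wrapped URL is a made-up example that only mimics the general `/RU=…/RK=2` shape of Yahoo's redirect links.

```python
import urllib.parse

# Hypothetical Yahoo redirect link (real ones are longer); the target URL
# is percent-encoded between the /RU= and /RK=2 markers.
wrapped = 'https://r.search.yahoo.com/_ylt=AbCd/RU=https%3A%2F%2Fgithub.com%2Fsoxoj/RK=2/RS=XyZ'

target = urllib.parse.unquote(wrapped.split('/RU=')[1].split('/RK=2')[0])
print(target)  # -> https://github.com/soxoj
```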


class MarpleResult:
    def __init__(self, results, links, errors, warnings):
        self.all_links = results
@@ -231,6 +262,7 @@ def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None):
        GoogleParser(),
        DuckParser(),
        YandexParser(),
        YahooParser(),
    ]

    results = []
@@ -335,6 +367,12 @@ def main():
        default="",
        help="Proxy string (e.g. https://user:pass@1.2.3.4:8080)",
    )
    parser.add_argument(
        '--csv',
        type=str,
        default="",
        help="Save results to the CSV file",
    )
    args = parser.parse_args()

    username = args.username
@@ -378,9 +416,12 @@ def main():

    displayed_count = 0

    def is_likely_profile(r):
        return r.is_it_likely_username_profile() and r.junk_score <= args.threshold and not r.filtered

    # reliable links section
    for r in result.unique_links:
        if r.is_it_likely_username_profile() and r.junk_score <= args.threshold and not r.filtered:
        if is_likely_profile(r):
            displayed_count += 1

            message = r.url
@@ -404,9 +445,12 @@ def main():

    pdf_count = 0

    def is_pdf_file(url):
        return url.endswith('pdf') or '-pdf.' in url

    # pdf links section
    for r in result.unique_links:
        if r.url.endswith('pdf') or '-pdf.' in r.url:
        if is_pdf_file(r.url):
            if pdf_count == 0:
                print(colored('PDF files', 'cyan'))

@@ -439,7 +483,6 @@ def main():

    print()


    # show status
    status_msg = f'Links: total collected {total_collected_count} / unique with username in URL {uniq_count} / reliable {displayed_count} / documents {pdf_count}'

@@ -452,6 +495,21 @@ def main():

    print(f"{colored(status_msg, 'cyan')}\n{colored(error_msg, 'yellow')}")

    if args.csv:
        with open(args.csv, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
            writer.writerow(['URL', 'Title', 'Score', 'Is profile page', 'Is PDF'])

            def write_links(condition):
                for r in result.unique_links:
                    if not condition(r):
                        continue
                    writer.writerow([r.url, r.title, r.junk_score, is_likely_profile(r), is_pdf_file(r.url)])

            write_links(lambda x: is_likely_profile(x))
            write_links(lambda x: not is_likely_profile(x))

        print(colored(f'Results were saved to CSV file {args.csv}', 'red'))
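
For completeness, a minimal sketch of reading the exported file back with the standard csv module; the `results.csv` path is just a stand-in for whatever was passed to `--csv`.

```python
import csv

# Column names come from the header row written above.
with open('results.csv', newline='') as f:  # stand-in for the --csv path
    for row in csv.DictReader(f):
        if row['Is profile page'] == 'True':  # booleans are serialized as strings
            print(row['URL'], row['Score'])
```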

if __name__ == '__main__':
    main()
