In [None]:
import requests, json
import newspaper
from lxml import html
import pandas as pd
from tqdm import tqdm
import time


# Landing page of the news section (the page whose navigation menu lists the categories).
base_url = 'https://cyware.com/cyber-security-news-articles'
# Site root; the relative category hrefs scraped from the menu are appended to it.
categories_url = 'https://cyware.com'


In [None]:
# Fetch the news landing page and collect the category links from its navigation menu.
resp = requests.get(base_url)
# Fail loudly on an HTTP error instead of parsing an error page into an empty result.
resp.raise_for_status()
tree = html.fromstring(resp.content)
categories_elements = tree.xpath("/html/body/div/section/div/nav/ul/li/ul/div/li/a[@*]")

# Maps category display name -> relative href of the category page.
categories = {}

for el in categories_elements:
    name_nodes = el.xpath('text()')
    href = el.get('href')
    # The XPath matches anchors with *any* attribute, so an anchor may lack a
    # text node or an href; skip those rather than raising IndexError/KeyError.
    if not name_nodes or href is None:
        continue
    categories[str(name_nodes[0])] = href

In [None]:
# Scrape one listing page per category.
# Result shape: {category name: {article title: article URL}}
articles = {}
for category_name, category_path in categories.items():
    listing_url = "{}{}".format(categories_url, category_path)
    # NOTE(review): the meaning of the "?p=15" query parameter is not visible here
    # (page number or page size) - confirm against the site.
    resp = requests.get("{}{}".format(listing_url, "?p=15"))
    tree = html.fromstring(resp.content)
    articles_elements = tree.xpath("/html/body/div[2]/main/section/div/div[1]/div[2]/div[1]/div/div/div/div/h2/a")

    # Anchors without a direct text node or without an href are skipped instead of
    # aborting the whole scrape with IndexError/KeyError.
    category_articles = {}
    for el in articles_elements:
        title_nodes = el.xpath('text()')
        if title_nodes and 'href' in el.attrib:
            category_articles[title_nodes[0]] = el.attrib['href']
    articles[category_name] = category_articles

In [None]:
# Stand-alone probe of a single category listing page; it issues the same kind
# of request and applies the same XPath as the per-category scraping loop.
resp = requests.get(
    url="https://cyware.com/category/breaches-and-incidents-news?p=15",
)
tree = html.fromstring(resp.content)
# Title anchors of the articles listed on that page.
articles_elements = tree.xpath(
    "/html/body/div[2]/main/section/div/div[1]/div[2]/div[1]/div/div/div/div/h2/a"
)

In [None]:
# Total number of (category, title) pairs; sizes the progress bar.
count = sum(len(titles) for titles in articles.values())

# Start from an empty list so this cell runs on a fresh kernel (previously
# `rows` was never defined before the first append) and so that re-running
# the cell does not append duplicate rows.
rows = []

with tqdm(total=count) as pbar:
    for category, titles in articles.items():
        for title, article_url in titles.items():
            article = newspaper.Article(article_url)
            article.download()
            # NOTE(review): state 0 presumably means "download not started";
            # the pause gives a slow download a moment before parsing - confirm.
            if article.download_state == 0:
                time.sleep(3)
            try:
                article.parse()
            except newspaper.ArticleException:
                print("Article failed to download\nUrl:{}".format(article_url))
            # The row is still recorded after a failed parse, with whatever
            # text the article object holds at that point.
            text = article.text
            rows.append([category, categories[category], title, article_url, text])
            pbar.update(1)

In [None]:
# Request settings for the public story-listing API endpoint.
method = 'POST'
# l_time is the current Unix time shifted 7200 s (two hours) into the future.
timestamp = int(time.time()) + 7200
url = 'https://api.cyware.com/public/cyuserallstory/?l_time={}'.format(timestamp)
# NOTE(review): this header dump embeds a session cookie and a Postman token;
# consider loading them from the environment instead of committing them.
raw_headers = "User-Agent:PostmanRuntime/7.13.0\nAccept:*/*\nCache-Control:no-cache\nPostman-Token:4ae3e87e-7195-4654-9bd9-ebe230359fb7\ncookie:2fa_sessionid=fkk1a2qxxh0bc5cuxjmafezqjcjfgx6y\ncontent-length:0\nConnection:close\nHost:api.cyware.com"

# Split every "Name:value" line at its first colon to build the header mapping.
headers = {
    name: value
    for name, _sep, value in (line.partition(':') for line in raw_headers.split('\n'))
}

In [None]:
import numpy as np
import multiprocessing as mp
from joblib import Parallel, delayed
import datetime


def fetch_and_analyze(el, api_page_url, next_api_page_url):
    """Build one output row for an API story entry, fetching its text if needed.

    When the entry's ``url`` plus a trailing slash equals its ``web_sp_link``,
    the text shipped in the entry (``el['text']``) is used as-is. Otherwise the
    page at ``web_sp_link`` is downloaded and parsed with ``newspaper``.

    Any ordinary error while obtaining the text is reported on stdout and the
    row is returned with an empty text. ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately NOT caught, so a run can still be cancelled.

    Args:
        el: Mapping for one story; reads the keys ``url``, ``web_sp_link``,
            ``text``, ``title``, ``image``, ``category``, ``category_slug``
            and ``p_time``.
        api_page_url: URL of the API page this entry came from.
        next_api_page_url: URL of the following API page, as given by the API.

    Returns:
        list: ``[text, title, image, category, category_slug, web_sp_link,
        p_time, api_page_url, next_api_page_url]``.

    Raises:
        KeyError: If one of the row fields (``title``, ``image``, ...) is
            missing from ``el``.
    """
    text = ""

    try:
        if "{}{}".format(el['url'], "/") == el['web_sp_link']:
            text = el['text']
        else:
            article = newspaper.Article(el['web_sp_link'])
            article.download()

            # Wait up to roughly two seconds while the download is still
            # reported as not started (state 0).
            waited = 0.0
            while article.download_state == 0 and waited < 2:
                time.sleep(0.1)
                waited += 0.1

            article.parse()
            text = article.text
    except Exception as exc:
        # Best effort: keep the entry with an empty text, but say what went
        # wrong. Previously a `return` inside `finally` hid every exception,
        # including KeyboardInterrupt.
        print("Article failed to download.\nUrl:{}\nTime: {}\nError: {!r}".format(
            el.get('web_sp_link'), str(datetime.datetime.now()), exc))

    return [text, el['title'],
                  el['image'],
                  el['category'],
                  el['category_slug'],
                  el['web_sp_link'],
                  el['p_time'],
                  api_page_url,
                  next_api_page_url]

# Accumulates one row (see fetch_and_analyze) per story returned by the API.
articles_api = []

# First request: only used to learn the total number of stories.
resp = requests.request(method, url, headers=headers)
json_content = json.loads(resp.text)
count = int(json_content['count'])
# For a quick trial run, cap the total instead, e.g.: count = 100
num_cores = mp.cpu_count()

with tqdm(total=count) as pbar:
    # Stop when enough rows were collected OR the API reports no next page.
    # The former test `(url != "") | (url != None)` was always true, so an
    # exhausted `next` link led to a request against a None/empty URL.
    while len(articles_api) < count and url:
        resp = requests.request(method, url, headers=headers)
        json_content = json.loads(resp.text)

        api_page_url = url
        next_api_page_url = json_content['links']['next']
        url = next_api_page_url

        inputs = json_content['results']

        # Fetch and parse the stories of this page in parallel, one job per core.
        rows = Parallel(n_jobs=num_cores)(delayed(fetch_and_analyze)(i, api_page_url, next_api_page_url) for i in inputs)
        articles_api.extend(rows)

        pbar.update(len(rows))

In [None]:
# Column order matches the list returned by fetch_and_analyze.
columns = ['text', 'title', 'image_url', 'category', 'category_slug', 'url', 'p_time', 'api_page_url', 'next_api_page_url']
df = pd.DataFrame(articles_api, columns=columns)

# The context manager closes the file even if to_csv raises. newline="" is what
# pandas asks for when it is handed an open text file, so that line endings are
# not translated a second time.
with open('entries-cyware-api-2.csv', "w", newline="") as f:
    df.to_csv(path_or_buf=f)

In [None]:
# Reload the scraped entries, using the first CSV column as the index.
# NOTE(review): the "-cleaned" file is not written anywhere in the visible
# notebook; presumably it is produced outside of it - confirm.
# The context manager closes the file even if parsing fails.
with open('entries-cyware-api-2-cleaned.csv', 'r') as f:
    df = pd.read_csv(f, index_col=0)

In [2]:
# Solution without parallelization

def howmany_within_range(row, minimum, maximum):
    """Count the values of `row` that fall inside the closed range [`minimum`, `maximum`]."""
    return sum(1 for value in row if minimum <= value <= maximum)

# For every row of `data`, count the values between 4 and 8 (both inclusive).
# NOTE(review): `data` is not defined in the visible part of this notebook;
# it has to exist in the kernel before this cell is run.
results = [howmany_within_range(row, minimum=4, maximum=8) for row in data]

# Show the counts of the first ten rows.
print(results[:10])

[2, 3, 2, 3, 2, 3, 2, 2, 4, 3]
