In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
import time

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA1
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as SIA2
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sankrandanloke/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sankrandanloke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def find_btc_webpage_appearances(btc_address):
  '''
  Look up a BTC address on bitcoinwhoswho.com and collect the URLs of
  the web pages listed there for that address.

  Parameters:
    btc_address: BTC address string; it is appended verbatim to the
      lookup URL.

  Returns:
    A list with the href of every anchor found inside the first
    <div id="url_bitcoin_found_table"> of the lookup page. The list is
    empty when the lookup page cannot be fetched or when no such div
    is present.
  '''
  search_url = "https://www.bitcoinwhoswho.com/address/" + btc_address

  raw_content = get_content(search_url)
  # get_content reports a failed request with the sentinel value 0.
  if raw_content == 0:
    print("Unable to reach ", search_url, " !!!")
    # Return an empty list (not None) so callers can safely iterate
    # over or take len() of the result.
    return []

  soup = BeautifulSoup(raw_content, 'html.parser')
  found_tables = soup.find_all('div', id="url_bitcoin_found_table")
  if not found_tables:
    return []

  # Only the first matching div is inspected.
  return [anchor['href'] for anchor in found_tables[0].find_all('a', href=True)]

def remove_tags(html):
  '''
  Reduce an HTML document to its visible text.

  The markup is parsed, every <style> and <script> element is removed
  from the tree, and the remaining text fragments (each stripped of
  surrounding whitespace) are joined with single spaces.
  '''
  document = BeautifulSoup(html, "html.parser")

  # Drop elements whose content is code/CSS rather than readable text.
  for element in document.find_all(['style', 'script']):
    element.decompose()

  text_fragments = document.stripped_strings
  return ' '.join(text_fragments)

def title_SA(webpage_appearances, btc_address):
  '''
  Fetch each listed web page and score it.

  Parameters:
    webpage_appearances: iterable of page URLs. None is accepted and
      treated as "no pages".
    btc_address: BTC address whose occurrences are counted in each page.

  Returns:
    A 4-tuple of parallel lists (freq_count, title_sa, body_sa, urls)
    with one entry per page that could be fetched:
      freq_count - result of btc_addr_frequency on the raw page content
      title_sa   - sentimentAnalyzer3 scores of the <title> text
                   (an empty string is scored when there is no title)
      body_sa    - sentimentAnalyzer3 scores of the page text
      urls       - the page URL
    Pages for which get_content returns 0 are skipped entirely.
  '''
  freq_count = []
  body_sa = []
  title_sa = []
  urls = []
  # "or []" lets a failed lookup (None) flow through as an empty result
  # instead of raising TypeError.
  for url in webpage_appearances or []:
    content = get_content(url)
    # get_content reports a failed request with the sentinel value 0.
    if content == 0:
      continue
    soup = BeautifulSoup(content, 'html.parser')
    title = soup.title.text if soup.title is not None else ""
    freq_count.append(btc_addr_frequency(content, btc_address))
    title_sa.append(sentimentAnalyzer3(title))
    body_sa.append(sentimentAnalyzer3(soup.get_text()))
    urls.append(url)
  return freq_count, title_sa, body_sa, urls

def list_websites_SA(btc_address):
  '''
  Run the full pipeline for one BTC address.

  Looks up the pages associated with the address, scores each page via
  title_SA, and returns a list of dicts, one per scored page, with the
  keys 'url', 'title_sa', 'body_sa' and 'freq'. The elapsed wall-clock
  time is printed before returning.
  '''
  started_at = time.time()
  pages = find_btc_webpage_appearances(btc_address)
  freqs, title_scores, body_scores, urls = title_SA(pages, btc_address)
  # The four lists returned by title_SA are parallel, so zip pairs up
  # the values belonging to the same page.
  records = [
    {'url': url, 'title_sa': title_score, 'body_sa': body_score, 'freq': freq}
    for url, title_score, body_score, freq
    in zip(urls, title_scores, body_scores, freqs)
  ]
  print("\n\nTotal time: ", time.time()-started_at, " seconds")
  return records

def list_neg_websites(result, title_sa_threshold, body_sa_threshold):
  '''
  Filter a result set down to the negatively scored pages.

  An entry is kept when its title 'neg' score is strictly greater than
  title_sa_threshold, or its body 'neg' score is strictly greater than
  body_sa_threshold. Kept entries are returned in a new list, in their
  original order.
  '''
  return [
    entry for entry in result
    if entry['title_sa']['neg'] > title_sa_threshold
    or entry['body_sa']['neg'] > body_sa_threshold
  ]
    
def sentimentAnalyzer1(sentence):
  '''
  Score a sentence with the SentimentIntensityAnalyzer from
  nltk.sentiment.vader and return the polarity scores it produces.
  '''
  return SIA1().polarity_scores(sentence)
def sentimentAnalyzer2(sentence):
  '''
  Score a sentence with the SentimentIntensityAnalyzer from
  vaderSentiment.vaderSentiment and return the polarity scores it
  produces.
  '''
  return SIA2().polarity_scores(sentence)
def sentimentAnalyzer3(blob):
  '''
  Compute document-level negative and positive sentiment for a text blob.

  Runs of newlines (optionally preceded by a period) are replaced with
  '. ' before the text is split into sentences. Every sentence is
  scored separately; the 'neg' result is the mean of the non-zero
  sentence 'neg' scores and the 'pos' result is the mean of the
  non-zero sentence 'pos' scores. A category with no non-zero score
  yields 0.

  Returns a dict with the keys 'neg' and 'pos'.
  '''
  normalized = re.sub(r'(\n+)|(\.\n+)', '. ', blob)
  analyzer = SIA1()
  categories = ('neg', 'pos')
  totals = {'neg': 0, 'pos': 0}
  hits = {'neg': 0, 'pos': 0}
  for sentence in nltk.sent_tokenize(normalized):
    scores = analyzer.polarity_scores(sentence)
    for category in categories:
      # Zero scores are excluded from both the sum and the count.
      if scores[category] != 0:
        totals[category] = totals[category] + scores[category]
        hits[category] = hits[category] + 1
  # A count of 0 is replaced by 1 so the division is always defined.
  return {category: totals[category] / (hits[category] or 1) for category in categories}

def btc_addr_frequency(html, btc_address):
  '''
  Count how many times a BTC address occurs in a page.

  Parameters:
    html: page content. Bytes are decoded as ISO-8859-1 (which never
      fails, whatever the real page encoding is); a str is used as is.
    btc_address: address string to look for.

  Returns:
    The number of (possibly overlapping) occurrences of btc_address in
    the raw content, markup included. An empty btc_address matches
    nothing and yields 0.
  '''
  if not btc_address:
    return 0

  if isinstance(html, (bytes, bytearray)):
    text = html.decode('ISO-8859-1')
  else:
    text = html

  occurrences = 0
  position = text.find(btc_address)
  while position != -1:
    occurrences += 1
    # Resume one character further on so overlapping matches are
    # still counted.
    position = text.find(btc_address, position + 1)
  return occurrences

def get_content(link, timeout=30):
  '''
  Fetch a URL with a GET request and return the raw response body.

  Parameters:
    link: URL to fetch.
    timeout: seconds to wait on blocking network operations before
      giving up, so one unresponsive host cannot stall a whole run.

  Returns:
    The response body as bytes, or the sentinel 0 when the request
    fails for any reason (the exception and the link are printed).
  '''
  try:
    req = Request(link, headers={'User-Agent':'Mozilla/5.0'})
    # The with-block closes the response (and its socket) as soon as
    # the body has been read.
    with urlopen(req, timeout=timeout) as response:
      return response.read()
  except Exception as e:
    # Best-effort fetch: callers test for 0 and skip the page.
    print('Exception details: ', e, '\nLink: ', link, '\n')
    return 0

In [3]:
# BTC address to investigate.
btc_address = '12t9YDPgwueZ9NyMgw519p7AA8isjr6SMw'
# Alternative address, kept for quick experimentation:
# btc_address = '3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy'
# Look up the pages for the address and score each one; this performs
# network requests and prints the total elapsed time.
result = list_websites_SA(btc_address)
# Bare expression so the notebook displays the result list.
result

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Exception details:  <urlopen error [Errno 61] Connection refused> 
Link:  https://arhivach.net/thread/261173/ 

Exception details:  <urlopen error [Errno 61] Connection refused> 
Link:  https://arhivach.net/thread/261420/ 

Exception details:  <urlopen error [Errno 61] Connection refused> 
Link:  https://arhivach.net/thread/261563/ 

Exception details:  <urlopen error [Errno 61] Connection refused> 
Link:  https://arhivach.net/thread/261857/ 

Exception details:  Remote end closed connection without response 
Link:  https://github.com/keithcollins/actual_ransom/blob/master/tx.json 

Exception details:  HTTP Error 403: Forbidden 
Link:  https://www.cryptocoinsnews.com/nearly-53k-bitcoin-ransoms-paid-wannacry-ransomware/ 

Exception details:  HTTP Error 503: Service Temporarily Unavailable 
Link:  https://bitcointalk.org/index.php?topic=1916199.0;all 



Total time:  47.50879621505737  seconds


[{'url': 'https://www.pcrisk.com/removal-guides/20077-niros-ransomware',
  'title_sa': {'neg': 0.0, 'pos': 0.0},
  'body_sa': {'neg': 0.22553164556962024, 'pos': 0.22033620689655178},
  'freq': 2},
 {'url': 'https://www.pcrisk.com/removal-guides/19843-fbi-screenlocker',
  'title_sa': {'neg': 0.0, 'pos': 0.0},
  'body_sa': {'neg': 0.2916666666666667, 'pos': 0.2308767123287671},
  'freq': 2},
 {'url': 'https://wanna-cry-profits.herokuapp.com/',
  'title_sa': {'neg': 0.0, 'pos': 0.744},
  'body_sa': {'neg': 0.315, 'pos': 0.5004},
  'freq': 2},
 {'url': 'https://www.recordedfuture.com/wannacry-ransomware-analysis/',
  'title_sa': {'neg': 0.437, 'pos': 0.0},
  'body_sa': {'neg': 0.24189705882352938, 'pos': 0.3169871794871795},
  'freq': 2},
 {'url': 'https://www.crowdstrike.com/blog/falcon-intelligence-report-wanna-ransomware-spreads-rapidly-continually-encrypts-victim-files/',
  'title_sa': {'neg': 0.33, 'pos': 0.138},
  'body_sa': {'neg': 0.17304545454545456, 'pos': 0.2062142857142857},
 

In [4]:
# Cut-offs for the 'neg' scores: an entry is kept when its title score
# or its body score is strictly greater than the respective threshold.
title_sa_threshold = 0.1
body_sa_threshold = 0.3
# Bare expression so the notebook displays the filtered entries.
list_neg_websites(result, title_sa_threshold, body_sa_threshold)

[{'url': 'https://wanna-cry-profits.herokuapp.com/',
  'title_sa': {'neg': 0.0, 'pos': 0.744},
  'body_sa': {'neg': 0.315, 'pos': 0.5004},
  'freq': 2},
 {'url': 'https://www.recordedfuture.com/wannacry-ransomware-analysis/',
  'title_sa': {'neg': 0.437, 'pos': 0.0},
  'body_sa': {'neg': 0.24189705882352938, 'pos': 0.3169871794871795},
  'freq': 2},
 {'url': 'https://www.crowdstrike.com/blog/falcon-intelligence-report-wanna-ransomware-spreads-rapidly-continually-encrypts-victim-files/',
  'title_sa': {'neg': 0.33, 'pos': 0.138},
  'body_sa': {'neg': 0.17304545454545456, 'pos': 0.2062142857142857},
  'freq': 1},
 {'url': 'https://unit42.paloaltonetworks.com/unit42-threat-brief-wanacrypt0r-know/',
  'title_sa': {'neg': 0.405, 'pos': 0.0},
  'body_sa': {'neg': 0.21133333333333323, 'pos': 0.21663157894736845},
  'freq': 2},
 {'url': 'https://www.fortinet.com/blog/threat-research/wannacry-evolving-history-from-beta-to-2-0.html',
  'title_sa': {'neg': 0.0, 'pos': 0.0},
  'body_sa': {'neg': 0