In [None]:
import os
import time
from collections import Counter

import numpy as np
import pandas as pd
import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess

In [None]:
# Load the node table; its "index" column is used below as the node id
# that gets appended to the 1ml.com node URL.
pop_df = pd.read_csv("/mnt/idms/fberes/data/bitcoin_ln_research/most_pop_nodes.csv")

In [None]:
# Number of nodes to take from pop_df (currently all rows).
N = len(pop_df)  # presumably an earlier run capped this at 50 -- confirm

In [None]:
# Ids of the first N nodes listed in pop_df.
nodes = list(pop_df["index"][:N])

# NOTE(review): this second assignment replaces the list built above with two
# hard-coded node ids, so every later cell only sees these two nodes while N
# still equals len(pop_df). Looks like a debugging override -- confirm whether
# it is meant to stay active.
nodes = [
            "03021c5f5f57322740e4ee6936452add19dc7ea7ccf90635f95119ab82a62ae268",
            "03c2abfa93eacec04721c019644584424aab2ba4dff3ac9bdab4e9c97007491dda"
        ]

In [None]:
class LNNodeParser(scrapy.Spider):
    """Spider that downloads the 1ml.com page of every id in the module-level
    ``nodes`` list and stores the raw response body on disk as ``<id>.html``.
    """

    name = "ln_node_parser"
    idx = 0

    def start_requests(self):
        """Yield one request per node id, each handled by :meth:`parse`."""
        # Snapshot the ids first so the request list is fixed once crawling starts.
        for node_id in list(nodes):
            node_url = "https://1ml.com/node/%s" % node_id
            yield scrapy.Request(url=node_url, callback=self.parse)

    def parse(self, response):
        """Write the response body to ``<last URL segment>.html`` and log it."""
        # The last path segment of the URL is the node id that was requested.
        node_id = response.url.rsplit("/", 1)[-1]
        filename = '%s.html' % node_id
        target_path = "/mnt/idms/fberes/data/bitcoin_ln_research/1ml/%s" % filename
        with open(target_path, 'wb') as out_file:
            out_file.write(response.body)
        self.log('Saved file %s' % filename)

# Run the LNNodeParser spider from inside the notebook process.
# NOTE(review): presumably process.start() blocks until the crawl has finished
# and the underlying reactor cannot be started twice in one kernel -- confirm
# against the Scrapy documentation before re-running this cell.
process = CrawlerProcess()
process.crawl(LNNodeParser)
process.start()

# NOTE(review): if start() has already returned, this stop() call is probably
# redundant -- confirm.
process.stop()

In [None]:
def get_info_type(li_item):
    """Classify one ``<li>`` entry of the node info list.

    The type is taken from the first CSS class of the entry's ``<span>`` that
    contains ``"icon-"`` (with that prefix removed). Entries without a
    ``<span>``, without a ``class`` attribute, or without an icon class are
    reported as ``"other"`` instead of raising.

    Parameters
    ----------
    li_item : element exposing ``.span`` and ``.get_text()``
        (a BeautifulSoup ``<li>`` tag in this notebook).

    Returns
    -------
    tuple
        ``(info_type, text)`` where ``text`` is ``li_item.get_text()``.
    """
    info_type = "other"
    span = li_item.span
    # A missing <span> or a <span> without a class attribute simply has no
    # icon class to inspect.
    css_classes = span.attrs.get("class", []) if span is not None else []
    for css_class in css_classes:
        if "icon-" in css_class:
            info_type = css_class.replace("icon-", "")
            break
    return (info_type, li_item.get_text())
    
def get_node_info(info_part):
    """Turn the node info list into a ``{info_type: text}`` dictionary.

    Parameters
    ----------
    info_part : element exposing ``find_all`` or None
        The ``<ul>`` holding the info entries; None when the page has none.

    Returns
    -------
    dict
        One entry per info type; empty when ``info_part`` is None.
    """
    if info_part is None:
        return {}
    # The info type is the dict key, so when several entries share a type
    # (e.g. more than one "other") only the last one is kept.
    return dict(get_info_type(item) for item in info_part.find_all("li"))

def extract_labels(labels_part):
    """Collect the text of every ``<a>`` element inside the tag list.

    Parameters
    ----------
    labels_part : element exposing ``find_all`` or None
        The ``<ul>`` holding the label links; None when the page has none.

    Returns
    -------
    list of str or None
        The label texts in document order, or None when ``labels_part`` is
        None (callers use this to tell "no tag list" from "empty tag list").
    """
    if labels_part is None:
        return None
    return [anchor.get_text() for anchor in labels_part.find_all("a")]
    
def extract_node_meta_data(node_id, html_dir="/mnt/idms/fberes/data/bitcoin_ln_research/1ml"):
    """Parse the saved HTML page of one node into a metadata record.

    Parameters
    ----------
    node_id : str
        Node id; the page is read from ``<html_dir>/<node_id>.html``.
    html_dir : str, optional
        Directory holding the downloaded pages. Defaults to the directory the
        spider writes to.

    Returns
    -------
    dict
        The info entries found in ``<ul class="wordwrap">`` plus two extra
        keys: ``"labels"`` (list of label texts from ``<ul class="tags">``,
        or None when that list is absent) and ``"pub_key"`` (``node_id``).

    Raises
    ------
    OSError
        If the page file cannot be opened (e.g. it was never downloaded).
    """
    html_path = os.path.join(html_dir, "%s.html" % node_id)
    # The spider stored the raw response bytes, so read bytes back and let
    # BeautifulSoup work out the encoding instead of decoding with whatever
    # codec happens to be the platform default.
    with open(html_path, "rb") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    info_part = soup.find('ul', {"class": "wordwrap"})
    labels_part = soup.find('ul', {"class": "tags"})
    meta_data = get_node_info(info_part)
    meta_data["labels"] = extract_labels(labels_part)
    meta_data["pub_key"] = node_id
    return meta_data

In [None]:
# Parse the saved page of every node; the printed position serves as a
# simple progress indicator.
meta_records = []
for i, n in enumerate(nodes):
    record = extract_node_meta_data(n)
    meta_records.append(record)
    print(i)

In [None]:
# Sanity check: one record is appended per entry of `nodes`.
len(meta_records)

# Shows every parsed record in full.
meta_records

In [None]:
# One row per node; columns are the union of keys found across all records.
meta_df = pd.DataFrame(meta_records)

In [None]:
# Preview of the first rows of the metadata table.
meta_df.head()

# Spot check of a single node id; one .loc lookup replaces the chained
# row-then-column indexing. Requires a row labelled 8 to exist.
meta_df.loc[8, "pub_key"]

In [None]:
# Fraction of missing values per column, smallest first. `.mean()` divides by
# the actual number of rows in meta_df; dividing by N (= len(pop_df)) gives
# wrong ratios whenever `nodes` does not contain exactly N entries.
meta_df.isnull().mean().sort_values()

In [None]:
# Persist the full metadata table (row index omitted).
# NOTE(review): the "labels" column holds Python lists or None, so it is
# presumably written as its string representation -- confirm how readers of
# this CSV parse it back.
meta_df.to_csv("/mnt/idms/fberes/data/bitcoin_ln_research/node_meta.csv", index=False)

In [None]:
# Keep only nodes whose page had a tag list. The explicit copy makes this an
# independent frame, so adding columns to it later neither touches meta_df
# nor triggers pandas' SettingWithCopyWarning.
meta_with_labels_df = meta_df[meta_df["labels"].notnull()].copy()

In [None]:
# Number of labels per node. `assign` returns a new frame instead of writing
# into what may be a slice of meta_df, which avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
meta_with_labels_df = meta_with_labels_df.assign(
    num_labels=meta_with_labels_df["labels"].apply(len)
)

In [None]:
# Distribution of the per-node label count (how many nodes have 0, 1, 2, ... labels).
meta_with_labels_df["num_labels"].value_counts()

In [None]:
# Number of nodes that have a (possibly empty) label list.
len(meta_with_labels_df)

In [None]:
# Persist only the labelled nodes (row index omitted).
# NOTE(review): the list-valued "labels" column is presumably stored as its
# string representation -- confirm how readers of this CSV parse it back.
meta_with_labels_df.to_csv("/mnt/idms/fberes/data/bitcoin_ln_research/node_meta_with_labels.csv", index=False)

In [None]:
# Flatten the per-node label lists into one list, keeping node order and
# within-node order (duplicates across nodes are kept on purpose for counting).
all_labels = []
for lab_list in meta_with_labels_df["labels"]:
    all_labels.extend(lab_list)

In [None]:
# Occurrence count per label; the key order (first appearance in all_labels)
# later fixes the column order of the one-hot matrix.
cnt = Counter(all_labels)

In [None]:
# Preview of the first 20 labelled nodes.
meta_with_labels_df.head(20)

In [None]:
# (number of distinct labels, number of nodes in the full metadata table)
len(cnt), len(meta_df)

### Label merging ideas:
  
- G1: every kind of games
- G2: Banking, Exchange (not G1)
- G3: Wallet (not G1 or G2)
- G4: every kind of Store (not G1, G2, G3)
- G5: all remaining service providers (not G1, G2, G3, G4)

Nodes whose only labels are 'Open Source' or 'Testing' should be ignored.

In [None]:
# All labels with their counts, most frequent first.
cnt.most_common()

In [None]:
# Column position of each label in the one-hot matrix, following cnt's key order.
label_pos = {label: position for position, label in enumerate(cnt)}

In [None]:
# Build two views of the node/label relation in one pass:
#   * onehot_labels[row, col] == 1 when the node at that row carries the label
#     mapped to `col` by label_pos;
#   * pivot_records holds one [row, label] pair per assignment.
# `idx` is the row position within meta_with_labels_df, not its index label.
label_lists = meta_with_labels_df["labels"].tolist()
onehot_labels = np.zeros((len(label_lists), len(cnt)))
pivot_records = []
for idx, labels in enumerate(label_lists):
    for lab in labels:
        onehot_labels[idx, label_pos[lab]] = 1
        pivot_records.append([idx, lab])

In [None]:
# Wrap the one-hot matrix in a frame whose columns are the label names, in
# the same order that label_pos was derived from.
onehot_labels_df = pd.DataFrame(data=onehot_labels, columns=list(cnt))

In [None]:
# Long-format (item, label) table built from the pair list.
# NOTE: this rebinds `pivot_records` from a list of pairs to a DataFrame.
pivot_records = pd.DataFrame(pivot_records, columns=["item","label"])
pivot_records.head()

In [None]:
# For nodes labelled 'Internet of Things': how often each label occurs among them.
iot_mask = onehot_labels_df['Internet of Things'] == 1
onehot_labels_df[iot_mask].sum()

In [None]:
# For nodes labelled 'Banking': how often each label occurs among them.
banking_mask = onehot_labels_df['Banking'] == 1
onehot_labels_df[banking_mask].sum()