# Crawler

In [1]:
import requests
import os

from urllib.parse import quote, urlsplit

class Document:
    def __init__(self, url):
        self.url = url
        
    def get(self):
        if not self.load():
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()
    
    def __get_filename(self):
        name = 'data/' + str(hash(self.url))  # use the hash as a file name
        return name
    
    def download(self):
        try:
            r = requests.get(self.url)
            if r.status_code // 100 not in (2, 3):  # either 2.. or 3..
                return False
            self.content = r.content
            return True
        except Exception:
            return False
        
    def persist(self):
        if self.content is None:  # If there is nothing to save
            return False
        
        file_name = self.__get_filename()
        file = open(file_name, "wb")
        file.write(self.content)
        return True
            
    def load(self):
        #TODO load content from hard drive, store it in self.content and return True in case of success
        path = self.__get_filename()
        folder = "/".join(path.split("/")[:-1])
        file = path.split("/")[-1]
        
        if file not in os.listdir(folder):  # if there is no such file the folder
            return False
        
        file = open(path, "rb")
        self.content = file.read()
        return True


In [2]:
doc = Document('http://sprotasov.ru/data/iu.txt')

doc.get()
assert doc.content, "Document download failed"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

doc.get()
assert doc.load(), "Load should return true for saved document"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"

In [3]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.parse
import re

class HtmlDocument(Document):
    
    def __tag_visible(self, element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True
    
    def parse(self):
        #TODO exctact plain text, images and links from the document
        self.anchors = []
        self.images = []
        self.text = ""
        
        html_page = self.content
        soup = BeautifulSoup(html_page)
        
        for link in soup.findAll('a', href=True):
            if re.search("(tel)|(sms)|(mailto):", link['href']) is None:  # if link contains neither 'tel:' nor 'sms:' nor 'mailto:'
                full_link = urllib.parse.urljoin(self.url, link['href'])  # completes the link if it is relative
                self.anchors.append((link.text, full_link))        
            
        for img in soup.findAll('img'):
            link = urllib.parse.urljoin(self.url, img.get('src'))
            self.images.append(link)
        
        for txt in soup.findAll(text=True):
            if self.__tag_visible(txt):  # is tag is visible to user
                self.text += txt
        

In [4]:
doc = HtmlDocument("http://sprotasov.ru")
doc.get()
doc.parse()

assert "тестирующий сервер codetest" in doc.text, "Error parsing text"
assert "http://sprotasov.ru/images/phone.png" in doc.images, "Error parsing images"
assert any(p[1] == "http://university.innopolis.ru/" for p in doc.anchors), "Error parsing links"

In [5]:
from collections import Counter
import re


class HtmlDocumentTextData:
    
    def __init__(self, url):
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()
    
    def get_sentences(self, lower=False):
        txt = self.doc.text
        txt = re.sub('[!@#$.\-+*—,\(\):]', ' ', txt)  # replace all punctuation signs with spaces
        txt = re.sub('[0-9]', ' ', txt)               # replace all digits with spaces
        if lower:
            result = [x.lower() for x in txt.split()]     # lower all letters and delete all doubled spaces
        else:
            result = txt.split()     # delete all doubled spaces
        return result
    
    def get_word_stats(self):
        return Counter(self.get_sentences())

In [6]:
doc = HtmlDocumentTextData("https://university.innopolis.ru")

print(doc.get_word_stats().most_common(10))
assert [x for x in doc.get_word_stats().most_common(10) if x[0] in ('иннополис', 'Иннополис')], 'иннополис sould be among most common'

[('и', 58), ('по', 32), ('Иннополис', 30), ('в', 28), ('ул', 25), ('на', 24), ('Февраля', 17), ('ост', 16), ('со', 12), ('стороны', 12)]


# Person entity extraction

In [7]:
class Speaker:
    def __init__(self, first, second):
        self.first = first
        self.second = second
        self.full = f"{self.first} {self.second}" 
    
    def __str__(self):
        return f"Person ({self.full})"    
    
    def __repr__(self):
        return self.__str__()
    
    def get_id(self):
        return hash(self.full)
    
    def __ne__(self, other):
        return self.full != other.full


class Event:
    def __init__(self, name, url):
        self.name = name
        self.url = url
        
    def __str__(self):
        return f"Event ({self.name} | {self.url})"
    
    def __repr__(self):
        return self.__str__()
    
    def get_id(self):
        return self.url.split("/")[-2]
        

In [8]:
from natasha import NamesExtractor

def extract_persons(html_doc_text):
    text = " ".join(html_doc_text.get_sentences())
    extractor = NamesExtractor()
    matches = extractor(text)
    names = []
    
    for match in matches:
        fact = match.fact
        if fact.first and fact.last:
            person = Speaker(fact.first, fact.last)
            names.append(person) 
    return names

t = HtmlDocumentTextData('https://itiskfu.timepad.ru/event/804451/')
persons = extract_persons(t)
print(persons)

[Person (вячеслав благирёв), Person (сергей шабанов), Person (роман доронин)]


# Sites parsing

In [9]:
from bs4 import BeautifulSoup

SITE = 'https://timepad.ru/afisha/moscow/search/it/all/{}'

def get_pages_count(initial):
    doc = HtmlDocument(initial)
    doc.get()
    doc.parse()
    link_to_last = [x[1] for x in doc.anchors if x[0] == "Последняя"][0]
    print(link_to_last)
    last_num = int(link_to_last.split("/")[-2])
    return last_num



def get_events_pages(initial):
    page_max = get_pages_count(initial.format(''))
    events_pages = list()
    print(page_max)
    for page_num in range(1, page_max + 1):
        if page_num == 1:
            url = initial.format("")
        else:
            url = initial.format(str(page_num) + '/')
        doc = HtmlDocument(url)
        doc.get()
        doc.parse()
        links = [anchor[1] for anchor in doc.anchors if anchor[0] in ('Билеты', 'Бесплатно')]
        
        print(url, len(links))
        for link in links:
            #print(link)
            events_pages.append(link)
    return events_pages 


if "events_pages.txt" in os.listdir():
    f = open("events_pages.txt", "r")
    events_pages = [x.strip() for x in f.readlines()]
else:
    print("started")
    events_pages = get_events_pages(SITE)
    f = open("events_pages.txt", "w")
    for link in events_pages:
        print(link, file=f)
    f.close()
print(len(events_pages), len(set(events_pages)))

7756 7756


In [10]:
ph = dict()
node_from = []
node_to = []
for events_page in events_pages:
    try:
        hdtd = HtmlDocumentTextData(events_page)
    except:
        continue
    persons = extract_persons(hdtd)
    c = 0
    
    for p1 in persons:
        for p2 in persons:
            if p1 != p2:
                node_from.append(p1)
                node_to.append(p2)
                c += 1
    print(events_page, "|", c)
    
print("Done with parsing sites")

https://abct.timepad.ru/event/1246150/ | 8
https://arcadia.timepad.ru/event/1249879/ | 0
https://ux-marafon.timepad.ru/event/1217997/ | 0
https://apd.timepad.ru/event/644155/ | 0
https://ux-marafon.timepad.ru/event/1043530/ | 0
https://ux-marafon.timepad.ru/event/985526/ | 0
https://ux-marafon.timepad.ru/event/899615/ | 0
https://ux-marafon.timepad.ru/event/864040/ | 0
https://ux-marafon.timepad.ru/event/762740/ | 0
https://ux-marafon.timepad.ru/event/734904/ | 30
https://ux-marafon.timepad.ru/event/732665/ | 30
https://ux-marafon.timepad.ru/event/555320/ | 56
https://ux-marafon.timepad.ru/event/457616/ | 84
https://rsds.timepad.ru/event/353907/ | 42
https://realme-russia.timepad.ru/event/1254318/ | 0
https://digitalbanana.timepad.ru/event/1257022/ | 0
https://pao-megafon--org.timepad.ru/event/1247322/ | 52
https://jedi-university.timepad.ru/event/1119828/ | 0
https://gdg-kaliningrad.timepad.ru/event/1253022/ | 0
https://kompaniya-gendalf-event.timepad.ru/event/1250060/ | 0
https://dig

In [11]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

new_from = node_from.copy()
from collections import Counter
cnt = Counter(node_from)
for x in cnt.most_common(10):
    print(x)
 
df_from = [person.full for person in node_from]
df_to = [person.full for person in node_to] 
"""
df = pd.DataFrame({ 
    'from': df_from, 
    'to': df_to
})
 
# Build your graph

G=nx.from_pandas_edgelist(df, 'from', 'to')
 
# Plot it
nx.draw(G, with_labels=False)
plt.show()
plt.savefig('graph.png')"""


(Person (тэглайна другой), 62)
(Person (дмитрий глазырин), 62)
(Person (георгия квасник), 62)
(Person (антон колесник), 62)
(Person (евгений костылев), 62)
(Person (сергей куньев), 62)
(Person (михаил менделевич), 62)
(Person (алексей соболев), 62)
(Person (алексей андреев), 62)
(Person (артур арсенов), 62)


"\ndf = pd.DataFrame({ \n    'from': df_from, \n    'to': df_to\n})\n \n# Build your graph\n\nG=nx.from_pandas_edgelist(df, 'from', 'to')\n \n# Plot it\nnx.draw(G, with_labels=False)\nplt.show()\nplt.savefig('graph.png')"

In [15]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go


df = pd.DataFrame({ 
    'from': df_from, 
    'to': df_to
})
G=nx.from_pandas_edgelist(df, 'from', 'to')

pos=nx.fruchterman_reingold_layout(G)
print(pos)
Xv=[pos[item][0] for item in pos]
Yv=[pos[item][1] for item in pos]
print(Xv)

edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
for node in G.nodes():
    #print(node)
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line_width=2))


node_adjacencies = []
node_text = [str(node) for node in G.nodes()]
print("txt", node_text)
for node, adjacencies in enumerate(G.adjacency()):
    node_adjacencies.append(len(adjacencies[1]))
    
    txt = '# of connections: '+str(len(adjacencies[1]))
    txt = f"(# {len(adjacencies[1])})"
    node_text[node] = node_text[node] + txt

node_trace.marker.color = node_adjacencies
node_trace.text = node_text

fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='Network graph made with Python',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.show()



print("done")

{'андрей гирин': array([0.1845843 , 0.72202875]), 'айдар зарифуллин': array([0.18737582, 0.7330398 ]), 'дмитрий сатин': array([0.02309747, 0.07626122]), 'дарья пархоменко': array([0.04390203, 0.10628658]), 'александр постовалов': array([0.04998058, 0.11747781]), 'юрий марков': array([0.04391912, 0.10628406]), 'виталий мазуревич': array([0.04356616, 0.1064993 ]), 'анаит бадалян': array([0.04395793, 0.1062847 ]), 'дмитрия сатина': array([0.01159044, 0.06730413]), 'александр кулачиков': array([0.01624415, 0.07316267]), 'виталий черемисинов': array([0.02132312, 0.05383952]), 'елена пяткова': array([0.01018149, 0.06738421]), 'сергей никольский': array([0.01111377, 0.06725233]), 'ольга мазаева': array([0.01063819, 0.06684861]), 'евгений гапон': array([-0.01360462,  0.04370197]), 'мерлена леруа': array([-0.00177803,  0.00844511]), 'алин ермаков': array([0.01607225, 0.05318944]), 'марин суслов': array([0.0241881 , 0.05669491]), 'сергей комарденков': array([0.01340009, 0.04924576]), 'максим коз

In [13]:
print("events", len(events_pages))
print("persons", len(set(node_from)))
print("edges", len(node_from))
print("hello")

events 7756
persons 9488
edges 59368
hello


In [None]:
print("nodes", G.number_of_nodes())
print("edges", G.number_of_edges())
max_degree = max(G.degree, key=lambda x : x[1])
print("max_degreee", max_degree)
print("conn new", len(max(nx.connected_components(G), key = lambda x : len(x))))
print("connected components", max(nx.connected_components(G)))
nx.write_adjlist(G,"test.adjlist")
print("diam", nx.diameter(G))
d = 0
for c in nx.connected_components(G):
    d = max(d, nx.diameter(c))