In [1]:
import re
from xml.etree import ElementTree as ET
import requests
import bs4

In [5]:
class Comment:
    """A single comment built from a parsed HTML element.

    Attributes:
        content: comment text as produced by ``compute_content``.
        id: the ``#<digits>`` marker found in the ``userinfo`` div.
        votes: vote count, negated when the vote span's second CSS class
            is ``negative``.
        total_votes: first integer found in the last ``<small>`` element of
            the ``comment_feedback`` div.
    """

    def __init__(self, raw):
        """Pull text, id and vote figures out of ``raw``."""
        self.content = self.compute_content(raw.find('p'))
        feedback = raw.find('div', attrs={'class': 'comment_feedback'})

        user_info = raw.find('div', attrs={'class': 'userinfo'})
        self.id = re.search(r'#\d+', user_info.text).group(0)

        vote_span = raw.find('span', attrs={'class': 'comment_votes'})
        self.votes = int(vote_span.text)
        # The second CSS class on the span marks a negative score.
        if vote_span['class'][1] == 'negative':
            self.votes *= -1

        last_small = feedback.findAll('small')[-1]
        self.total_votes = int(re.search(r'\d+', last_small.text.strip()).group(0))

    def to_xml(self):
        """Return a ``<comment>`` element holding id, votes and content."""
        root = ET.Element('comment')

        ET.SubElement(root, 'id').text = self.id

        votes_node = ET.SubElement(root, 'votes')
        ET.SubElement(votes_node, 'total_votes').text = str(self.total_votes)
        ET.SubElement(votes_node, 'absolute_votes').text = str(self.votes)

        ET.SubElement(root, 'content').text = self.content

        return root

    def compute_content(self, raw):
        """Join the string children of ``raw`` with single spaces.

        The join starts at the first child whose ``text`` matches
        ``#<n> <name> dijo:`` (or at the first child when none matches);
        children that are not ``str`` instances are left out.
        """
        nodes = list(raw.children)
        start = next(
            (position for position, node in enumerate(nodes)
             if hasattr(node, 'text') and re.match(r'#\d+\s+\w+ dijo:', node.text)),
            0,
        )
        pieces = [node for node in nodes[start:] if isinstance(node, str)]
        return ' '.join(pieces).strip()

    def __str__(self):
        """The comment text."""
        return self.content

    def __repr__(self):
        """Same as ``__str__``: the comment text."""
        return self.content

In [7]:
class ADV:
    """A story (post) built from a parsed HTML element.

    Attributes:
        content: first child of the link inside the
            ``story_content storyTitle`` paragraph (the story text).
        date: date text of the form ``<day> <Capitalised word> <4-digit year>``
            taken from the ``pre`` div; kept as a string.
        category: first child of the first link in the ``pre`` div.
        metrics: dict mapping each metric label found in the ``meta`` div to
            its integer value.
    """

    def __init__(self, raw: "bs4.element.Tag"):
        """Extract story text, date, category, metrics and comments URL from ``raw``."""
        comments = raw.find('div', attrs={'class': 'comment_tag'}).find('a')
        meta = raw.find('div', attrs={'class': 'pre'})
        metrics = raw.find('div', attrs={'class': 'meta'}).findAll('span')

        self.content = raw.find('p', attrs={'class': 'story_content storyTitle'}).find('a').contents[0]

        # Drop the fragment so the URL points at the page itself.
        self._comments_url = comments['href'].split('#')[0]

        # TODO: parse the date string into a real date object.
        # ``\d{1,2}`` keeps two-digit days whole; a single ``\d`` would turn
        # "12 Abr 2020" into "2 Abr 2020".
        self.date = re.search(r'\d{1,2} [A-Z][a-z]+ \d{4}', meta.text.strip()).group(0)
        self.category = meta.find('a').contents[0]

        # Each span holds a label element followed by a text node with the
        # number; strip every non-digit before converting.
        self.metrics = {
            span.contents[0].contents[0]: int(re.sub(r'\D', '', span.contents[1]))
            for span in metrics
        }

    def to_xml(self):
        """Return a ``<post>`` element with ``date``, ``category`` and ``content`` children."""
        top = ET.Element('post')

        date = ET.SubElement(top, 'date')
        date.text = self.date

        # category/content come from ``.contents[0]`` and may not be plain
        # ``str`` objects, so store a plain string in the tree.
        category = ET.SubElement(top, 'category')
        category.text = str(self.category)

        content = ET.SubElement(top, 'content')
        content.text = str(self.content)

        return top

    def get_comments(self, timeout=None):
        """Download the comments page and return its comments.

        Args:
            timeout: forwarded to ``requests.get``. ``None`` (the default)
                keeps the previous behaviour of setting no timeout.

        Returns:
            A list with one ``Comment`` per ``comment_box`` div on the page.
        """
        rc = requests.get(self._comments_url, timeout=timeout)
        comment_soup = bs4.BeautifulSoup(rc.content, 'html5lib')
        comments = comment_soup.findAll('div', attrs={'class': 'comment_box'})

        return [Comment(comment) for comment in comments]

    def __str__(self):
        """The story text."""
        return self.content

    def __repr__(self):
        """Same as ``__str__``: the story text."""
        return self.content

In [8]:
# Download the front page and parse it with the html5lib tree builder.
url = 'https://www.ascodevida.com/'
req = requests.get(url)
soup = bs4.BeautifulSoup(req.content, 'html5lib')

# Select the "box story" divs inside the element whose id is "main".
main_page = soup.find('div', attrs={'id': 'main'})
posts = main_page.findAll('div', attrs={'class': 'box story'})

# Wrap every selected element in an ADV instance.
adv = list(map(ADV, posts))

In [9]:
# Fetch the comments of the post at index 4 (get_comments issues an HTTP request).
comments = adv[4].get_comments()

In [14]:
# Display the post at index 4; its repr is the story text.
adv[4]

Hoy, y desde que empezó la cuarentena lo más imponente que he hecho fue ayudar a una señora en el Mercadona porque no llegaba a por la última caja de empanadillas congeladas y solo se le ocurre llamar a una muchacha igual de bajita que ella. ADV

In [13]:
# NOTE: `prettify` is defined in a later cell of this notebook, so that cell
# must have been executed before this one (a fresh top-to-bottom run fails here).
print(prettify(comments[1].to_xml()))

<?xml version="1.0" ?>
<comment>
  <id>#2</id>
  <votes>
    <total_votes>1</total_votes>
    <absolute_votes>-1</absolute_votes>
  </votes>
  <content>Menudo &quot;Dramon&quot; tiene que ser ayudar a una señora en el super siendo bajita</content>
</comment>



In [12]:
from xml.etree import ElementTree
from xml.dom import minidom

def prettify(elem):
    """Serialise ``elem`` and return it as an indented XML string.

    The element is dumped to UTF-8 bytes, re-read with minidom and
    pretty-printed with two spaces per nesting level.
    """
    utf8_bytes = ElementTree.tostring(elem, encoding='utf-8')
    return minidom.parseString(utf8_bytes).toprettyxml(indent='  ')

In [30]:
# XML elements for the first two comments, printed by the cells below.
x0 = comments[0].to_xml()
x1 = comments[1].to_xml()

In [None]:
# TODO create database

In [36]:
# Pretty-print the XML built from comments[0].
print(prettify(x0))

<?xml version="1.0" ?>
<comment>
  <id>#1</id>
  <votes>
    <total_votes>2</total_votes>
    <absolute_votes>2</absolute_votes>
  </votes>
  <content>Jajajajajaja Qué bueno! Pero, que cabrito el chaval! Jajajajajaja 
Pues, te bañó en leche! No te quejes</content>
</comment>



In [35]:
# Pretty-print the XML built from comments[1].
print(prettify(x1))

<?xml version="1.0" ?>
<comment>
  <id>#2</id>
  <votes>
    <total_votes>2</total_votes>
    <absolute_votes>2</absolute_votes>
  </votes>
  <content>ODIO que la gente desperdicie comida.</content>
</comment>

