Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extract code, text as list #26

Closed
wants to merge 11 commits into from
98 changes: 62 additions & 36 deletions nidaba/core/objects.py
@@ -1,30 +1,76 @@
from pyparsing import makeHTMLTags, SkipTo
from bs4 import BeautifulSoup
from bs4.element import NavigableString

from .parser import strip_tags

class SEObject(object):
    """
    Base Object for SE Objects.

    Holds the raw ``data`` passed to the constructor on ``self._data`` so
    that subclasses do not each have to repeat that assignment.
    """

    def __init__(self, data):
        """
        :param data: raw information describing the object; stored
            unmodified on ``self._data``.
        :return: None
        """
        super(SEObject, self).__init__()
        # ``data`` used to be accepted and then dropped; keep it here so the
        # attribute every subclass relies on is set in exactly one place.
        self._data = data

class Base(object):
def __init__(self):
super(Base, self).__init__()


class Comment(Base):
class User(SEObject):
"""
Stack Overflow Comment object which will hold information for use in Nidaba analysis.
Stack Overflow User object which will hold information for use in Nidaba analysis.
"""

def __init__(self, data):
"""
:param data: dict containing user information.
:return: None
"""
super(SEObject, self).__init__()
self._data = data


class Post(SEObject):
    """
    Base object for Question, Answer, Comments.

    Splits the HTML found under the ``'Body'`` key of ``data`` into two
    lists: the contents of ``<code>`` elements (``self.code``) and the
    remaining text nodes (``self.text``).
    """

    def __init__(self, data):
        """
        :param data: Dict containing post information; the HTML body is read
            from its ``'Body'`` key and defaults to ``''`` when absent.
        :return: None
        """
        # Name this class (not SEObject) in super(): super(SEObject, self)
        # starts the MRO lookup *after* SEObject and so skips its __init__.
        super(Post, self).__init__(data)
        self._data = data
        self.body = self._data.get('Body', '')
        self.text = self._get_text(self.body)
        self.code = self._get_code(self.body)

    @classmethod
    def _get_code(cls, html):
        """
        :param html: HTML markup to scan.
        :return: list with the text content of every ``<code>`` element.
        """
        # NOTE(review): no parser is named, so bs4 picks whichever is
        # installed -- consider pinning one for reproducible parsing.
        return [tag.get_text() for tag in BeautifulSoup(html).find_all('code')]

    @classmethod
    def _get_text(cls, html):
        """
        :param html: HTML markup to scan.
        :return: list of the text nodes left once every ``<code>`` element
            has been removed from the parsed tree.
        """
        soup = BeautifulSoup(html)
        # Detach the code elements first so their contents are not reported
        # as text; a plain loop because only the side effect matters.
        for code_tag in soup('code'):
            code_tag.extract()
        # Exact type match on purpose: NavigableString subclasses are
        # excluded, exactly as the original ``type(i) == NavigableString``.
        return [node for node in soup.recursiveChildGenerator()
                if type(node) is NavigableString]


super(Comment, self).__init__()
class Comment(Post):
"""
Stack Overflow Comment object which will hold information for use in Nidaba analysis.
"""

def __init__(self, data):
"""
:param data: Dict containing comment information.
:return: None
"""
super(SEObject, self).__init__()
self._data = data
self.body = self._data.get('Body', '')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why have you added these lines to Comment when they are already done in the Post class, from which Comment inherits? The same applies to Question and Answer.

self.text = self._get_text(self.body)
self.code = self._get_code(self.body)


class Answer(Base):
class Answer(Post):
"""
Stack Overflow Answer object which will hold information for use in Nidaba analysis.
"""
Expand All @@ -35,12 +81,13 @@ def __init__(self, data):
:return: None
"""

super(Answer, self).__init__()

self._data = data
super(SEObject, self).__init__()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be super(Answer, self).__init__(). Same for the other classes.

self.body = self._data.get('Body', '')
self.text = self._get_text(self.body)
self.code = self._get_code(self.body)


class Question(object):
class Question(Post):
"""
Stack Overflow Question object which will hold information for use in Nidaba analysis
"""
Expand All @@ -53,11 +100,10 @@ def __init__(self, data, answers=None, comments=None):
:return: None
"""

super(Question, self).__init__()

super(SEObject, self).__init__()
self._data = data
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`self._data = data` can be moved into the SEObject class, since every one of its subclasses repeats that assignment.

self.body = self._data.get('Body', '')
self.text = strip_tags(self.body)
self.text = self._get_text(self.body)
self.code = self._get_code(self.body)

if answers is None:
Expand All @@ -69,23 +115,3 @@ def __init__(self, data, answers=None, comments=None):
self.comments = []
else:
self.comments = [Comment(comm) for comm in comments]

@classmethod
def _get_code(cls, html):
code_start, code_end = makeHTMLTags('code')
code = code_start + SkipTo(code_end).setResultsName('body') + code_end
return [token.body for token, start, end in code.scanString(html)]


class User(object):
"""
Stack Overflow User object which will hold information for use in Nidaba analysis.
"""

def __init__(self, data):
"""
:param data: dict containing user information.
:return: None
"""

self._data = data
4 changes: 0 additions & 4 deletions nidaba/core/parser.py

This file was deleted.

1 change: 1 addition & 0 deletions nidaba/core/test/__init__.py
@@ -0,0 +1 @@
# Feature extraction tests
8 changes: 8 additions & 0 deletions nidaba/core/test/test_post_object.py
@@ -0,0 +1,8 @@
from ..objects import Post
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about this filename. For now we could just have test_objects.py and keep all the object tests together. If the tests end up being very long, we can split them up later.



def test_post_object():
    """A Post exposes its <code> contents and its remaining text as lists."""
    post = Post({'Body': '<p>bar</p><code>x=1</code>'})
    assert post.code == ['x=1']
    assert post.text == ['bar']
6 changes: 6 additions & 0 deletions nidaba/features/_util/question.py
Expand Up @@ -26,6 +26,12 @@ def title_capitalisation_percentage(s):

### Body

def code_percentage(code, text):
    """
    Return the share of a post's measured content that is code.

    Each argument is an iterable of items; every item is iterated in turn
    and ``len(piece.strip())`` is summed over its pieces. For an item that
    is a string the pieces are single characters, so this counts its
    non-whitespace characters.

    :param code: iterable of code snippets.
    :param text: iterable of text snippets.
    :return: float ``code_size / (code_size + text_size)``; ``0.0`` when
        neither argument contributes anything.
    """
    code_size = sum(sum(len(j.strip()) for j in i) for i in code)
    text_size = sum(sum(len(j.strip()) for j in i) for i in text)

    total_size = code_size + text_size
    if not total_size:
        # Empty or whitespace-only input: avoid ZeroDivisionError.
        return 0.0
    # float() forces true division so that Python 2 does not truncate to 0.
    return float(code_size) / total_size

### Code

### Tags
Expand Down
6 changes: 6 additions & 0 deletions nidaba/features/test/test_questions_util.py
Expand Up @@ -26,6 +26,12 @@ def test_title_capitalisation_percentage():

### Body

def test_code_percentage():
    """
    code_percentage ignores whitespace when weighing code against text.

    Renamed with the ``test_`` prefix: under pytest's default discovery
    rules a function called ``code_percentage`` is never collected, so this
    check silently never ran.
    """
    code = [' x = 1 \n\n\n ', ' z = 2', ]
    text = ['i want\t \t ', 'o ', ]
    # Six non-whitespace characters on each side, so exactly half is code.
    # The former follow-up ``!= 0.5`` assertion contradicted this one and
    # could never pass alongside it, so it has been dropped.
    assert question.code_percentage(code, text) == 0.5

### Code

### Tags
Expand Down