Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extract code, text as list #26

Closed
wants to merge 11 commits into from
94 changes: 52 additions & 42 deletions nidaba/core/objects.py
@@ -1,46 +1,82 @@
from pyparsing import makeHTMLTags, SkipTo
from bs4 import BeautifulSoup
from bs4.element import NavigableString

from .parser import strip_tags

class SEObject:
    """
    Common root for the SE object classes; keeps the record it was built from.
    """

    def __init__(self, data):
        """
        :param data: record for this object, stored as-is on ``_data``.
        :return: None
        """
        self._data = data


class User(SEObject):
"""
Stack Overflow User object which will hold information for use in Nidaba analysis.
"""

class Base:
    """
    Root class whose initialiser takes no arguments and only chains upward.
    """

    def __init__(self):
        super().__init__()
def __init__(self, data):
"""
Initialise the user by passing ``data`` on to the parent initialiser.

:param data: dict containing user information.
:return: None
"""
super().__init__(data)


class Comment(Base):
class Post(SEObject):
"""
Stack Overflow Comment object which will hold information for use in Nidaba analysis.
Base object for Question, Answer, Comments
"""

def __init__(self, data):
"""
Pass ``data`` to the parent initialiser, then derive ``body``, ``text``
and ``code`` from it.

:param data: Dict containing the post information; its 'Body' entry is
read as the post body.
:return: None
"""
super().__init__(data)
# A record without a 'Body' key is treated as having an empty body.
self.body = self._data.get('Body', '')
self.text = self._get_text(self.body)
self.code = self._get_code(self.body)

super(Comment, self).__init__()
@classmethod
def _get_code(cls, html):
    """
    Return the text of every ``<code>`` element found in ``html``.

    :param html: HTML markup to search.
    :return: list of strings, one per ``<code>`` element, in document order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ from one environment to the next.
    soup = BeautifulSoup(html, 'html.parser')
    return [tag.get_text() for tag in soup.find_all('code')]

self._data = data
@classmethod
def _get_text(cls, html):
    """
    Return the text nodes of ``html`` that lie outside ``<code>`` elements.

    :param html: HTML markup to process.
    :return: list of NavigableString nodes, in document order.
    """
    # Name the parser explicitly so the tree does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')

    # Detach every <code> element so its contents are not reported as text.
    for code_tag in soup.find_all('code'):
        code_tag.extract()

    # Exact type match on purpose: subclasses of NavigableString (such as
    # HTML comments) are left out, exactly as the original comparison did.
    return [node for node in soup.descendants
            if type(node) is NavigableString]


class Answer(Base):
class Comment(Post):
"""
Stack Overflow Comment object which will hold information for use in Nidaba analysis.
"""

def __init__(self, data):
"""
:param data: Dict containing comment information.
:return: None
"""
super().__init__(data)

super(Answer, self).__init__()

self._data = data
class Answer(Post):
    """
    An answer to a Stack Overflow question, kept for use in Nidaba analysis.
    """

    def __init__(self, data):
        """
        :param data: dict containing answer information.
        :return: None
        """
        super(Answer, self).__init__(data)


class Question(object):
class Question(Post):
"""
Stack Overflow Question object which will hold information for use in Nidaba analysis
"""
Expand All @@ -52,13 +88,7 @@ def __init__(self, data, answers=None, comments=None):
:param comments: List of dicts containing comment information
:return: None
"""

super(Question, self).__init__()

self._data = data
self.body = self._data.get('Body', '')
self.text = strip_tags(self.body)
self.code = self._get_code(self.body)
super().__init__(data)

if answers is None:
self.answers = []
Expand All @@ -69,23 +99,3 @@ def __init__(self, data, answers=None, comments=None):
self.comments = []
else:
self.comments = [Comment(comm) for comm in comments]

@classmethod
def _get_code(cls, html):
    """
    Collect the contents of each ``<code>`` ... ``</code>`` span in ``html``.

    :param html: HTML markup to scan.
    :return: list with the captured body of each match, in scan order.
    """
    open_tag, close_tag = makeHTMLTags('code')
    # Everything up to the closing tag is captured under the name 'body'.
    inner = SkipTo(close_tag).setResultsName('body')
    pattern = open_tag + inner + close_tag

    snippets = []
    for match, _start, _end in pattern.scanString(html):
        snippets.append(match.body)
    return snippets


class User:
    """
    Holds a Stack Overflow user's record for use in Nidaba analysis.
    """

    def __init__(self, data):
        """
        :param data: dict containing user information.
        :return: None
        """
        self._data = data
4 changes: 0 additions & 4 deletions nidaba/core/parser.py

This file was deleted.

1 change: 1 addition & 0 deletions nidaba/core/test/__init__.py
@@ -0,0 +1 @@
# Core object tests
36 changes: 36 additions & 0 deletions nidaba/core/test/test_objects.py
@@ -0,0 +1,36 @@
from ..objects import Post, Question, Answer, User, Comment


def test_post_object():
    """A Post exposes the code snippets and the text nodes of its body."""
    post = Post({'Body': '<p>bar</p><code>x=1</code>'})
    assert post.code == ['x=1']
    assert post.text == ['bar']


def test_answer_object():
    """An Answer exposes the code snippets and the text nodes of its body."""
    answer = Answer({'Body': '<p>bar</p><code>x=1</code>'})
    assert answer.code == ['x=1']
    assert answer.text == ['bar']


def test_comment_object():
    """A Comment exposes the code snippets and the text nodes of its body."""
    comment = Comment({'Body': '<p>bar</p><code>x=1</code>'})
    assert comment.code == ['x=1']
    assert comment.text == ['bar']


def test_question_object():
    """A Question parses its own body and wraps the answer dicts it is given."""
    d = {'Body': '<p>bar</p><code>x=1</code>'}
    q = Question(d, answers=[{'a': 1}])
    assert q.code == ['x=1']
    assert q.text == ['bar']

    # The former check, `q.answers != [Answer({'a': 1})]`, could never fail:
    # the object classes define no __eq__, so two distinct instances always
    # compare unequal. Check what was actually built instead.
    # NOTE(review): assumes Question wraps each answer dict in an Answer, the
    # same way it wraps comments in Comment -- confirm against Question.__init__.
    assert len(q.answers) == 1
    assert isinstance(q.answers[0], Answer)
    assert q.answers[0]._data == {'a': 1}


def test_user_object():
    """A User keeps the record it was constructed with."""
    record = {'Body': '<p>bar</p><code>x=1</code>'}
    user = User(record)
    assert user._data == record
6 changes: 6 additions & 0 deletions nidaba/features/_util/question.py
Expand Up @@ -26,6 +26,12 @@ def title_capitalisation_percentage(s):

### Body

def code_percentage(code, text):
    """
    Return the share of the measured size that belongs to code.

    Each argument is an iterable of iterables of strings. For a list of
    strings the inner iteration yields single characters, so the size is the
    number of non-whitespace characters. For a list of lists of strings it is
    the summed length of each stripped string.

    :param code: code fragments.
    :param text: text fragments.
    :return: ``code_size / (code_size + text_size)`` as a float, or ``0.0``
        when both sizes are zero.
    """
    code_size = sum(len(piece.strip()) for item in code for piece in item)
    text_size = sum(len(piece.strip()) for item in text for piece in item)

    total = code_size + text_size
    # Empty or whitespace-only input would otherwise divide by zero.
    if not total:
        return 0.0
    return code_size / total

### Code

### Tags
Expand Down
6 changes: 6 additions & 0 deletions nidaba/features/test/test_questions_util.py
Expand Up @@ -26,6 +26,12 @@ def test_title_capitalisation_percentage():

### Body

def test_code_percentage():
    """code_percentage ignores whitespace when weighing code against text."""
    # Renamed with the ``test_`` prefix so pytest collects it, and the
    # contradictory ``!= 0.5`` assertion is dropped: it could never hold
    # together with the ``== 0.5`` check above it.
    code = [' x = 1 \n\n\n ', ' z = 2', ]
    text = ['i want\t \t ', 'o ', ]
    # 6 non-whitespace characters on each side -> exactly one half.
    assert question.code_percentage(code, text) == 0.5

### Code

### Tags
Expand Down