New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
extract code, text as list #26
Changes from 6 commits
61b933c
de3be50
7e2348d
a1c242b
42d61e7
e1e2ce8
be9f141
59f11da
4f4cdd9
66f294a
a04ef2e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,76 @@ | ||
from pyparsing import makeHTMLTags, SkipTo | ||
from bs4 import BeautifulSoup | ||
from bs4.element import NavigableString | ||
|
||
from .parser import strip_tags | ||
|
||
class SEObject(object): | ||
""" | ||
Base Object for SE Objects | ||
""" | ||
def __init__(self, data): | ||
super(SEObject, self).__init__() | ||
|
||
class Base(object): | ||
def __init__(self): | ||
super(Base, self).__init__() | ||
|
||
|
||
class Comment(Base): | ||
class User(SEObject): | ||
""" | ||
Stack Overflow Comment object which will hold information for use in Nidaba analysis. | ||
Stack Overflow User object which will hold information for use in Nidaba analysis. | ||
""" | ||
|
||
def __init__(self, data): | ||
""" | ||
:param data: dict containing user information. | ||
:return: None | ||
""" | ||
super(SEObject, self).__init__() | ||
self._data = data | ||
|
||
|
||
class Post(SEObject): | ||
""" | ||
Base object for Question, Answer, Comments | ||
""" | ||
|
||
def __init__(self, data): | ||
""" | ||
:param data: Dict containing comment information. | ||
:return: None | ||
""" | ||
super(SEObject, self).__init__() | ||
self._data = data | ||
self.body = self._data.get('Body', '') | ||
self.text = self._get_text(self.body) | ||
self.code = self._get_code(self.body) | ||
|
||
@classmethod | ||
def _get_code(cls, html): | ||
return [i.get_text() for i in BeautifulSoup(html).find_all('code')] | ||
|
||
@classmethod | ||
def _get_text(cls, html): | ||
soup = BeautifulSoup(html) | ||
[s.extract() for s in soup('code')] | ||
return [i for i in soup.recursiveChildGenerator() \ | ||
if type(i) == NavigableString] | ||
|
||
|
||
super(Comment, self).__init__() | ||
class Comment(Post): | ||
""" | ||
Stack Overflow Comment object which will hold information for use in Nidaba analysis. | ||
""" | ||
|
||
def __init__(self, data): | ||
""" | ||
:param data: Dict containing comment information. | ||
:return: None | ||
""" | ||
super(SEObject, self).__init__() | ||
self._data = data | ||
self.body = self._data.get('Body', '') | ||
self.text = self._get_text(self.body) | ||
self.code = self._get_code(self.body) | ||
|
||
|
||
class Answer(Base): | ||
class Answer(Post): | ||
""" | ||
Stack Overflow Answer object which will hold information for use in Nidaba analysis. | ||
""" | ||
|
@@ -35,12 +81,13 @@ def __init__(self, data): | |
:return: None | ||
""" | ||
|
||
super(Answer, self).__init__() | ||
|
||
self._data = data | ||
super(SEObject, self).__init__() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be |
||
self.body = self._data.get('Body', '') | ||
self.text = self._get_text(self.body) | ||
self.code = self._get_code(self.body) | ||
|
||
|
||
class Question(object): | ||
class Question(Post): | ||
""" | ||
Stack Overflow Question object which will hold information for use in Nidaba analysis | ||
""" | ||
|
@@ -53,11 +100,10 @@ def __init__(self, data, answers=None, comments=None): | |
:return: None | ||
""" | ||
|
||
super(Question, self).__init__() | ||
|
||
super(SEObject, self).__init__() | ||
self._data = data | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
self.body = self._data.get('Body', '') | ||
self.text = strip_tags(self.body) | ||
self.text = self._get_text(self.body) | ||
self.code = self._get_code(self.body) | ||
|
||
if answers is None: | ||
|
@@ -69,23 +115,3 @@ def __init__(self, data, answers=None, comments=None): | |
self.comments = [] | ||
else: | ||
self.comments = [Comment(comm) for comm in comments] | ||
|
||
@classmethod | ||
def _get_code(cls, html): | ||
code_start, code_end = makeHTMLTags('code') | ||
code = code_start + SkipTo(code_end).setResultsName('body') + code_end | ||
return [token.body for token, start, end in code.scanString(html)] | ||
|
||
|
||
class User(object): | ||
""" | ||
Stack Overflow User object which will hold information for use in Nidaba analysis. | ||
""" | ||
|
||
def __init__(self, data): | ||
""" | ||
:param data: dict containing user information. | ||
:return: None | ||
""" | ||
|
||
self._data = data |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Feature extraction tests |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from ..objects import Post | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure on this filename. For now we could just have |
||
|
||
|
||
def test_post_object(): | ||
d = {'Body': '<p>bar</p><code>x=1</code>'} | ||
p = Post(d) | ||
assert p.code == ['x=1'] | ||
assert p.text == ['bar'] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why have you added these lines to `Comment` when they're also done in the `Post` class, from which `Comment` inherits? Same for `Question` and `Answer`.