sopython · ChillarAnand · Nov 24, 2014 · Nov 24, 2014 · Nov 25, 2014 · Nov 25, 2014
diff --git a/nidaba/core/objects.py b/nidaba/core/objects.py
@@ -1,30 +1,76 @@
-from pyparsing import makeHTMLTags, SkipTo
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
 
-from .parser import strip_tags
 
+class SEObject(object):
+    """Base object for SE objects; keeps the raw ``data`` it was built from."""
+
+    def __init__(self, data):
+        super(SEObject, self).__init__()
+        self._data = data  # subclasses read their fields from this object
 
-class Base(object):
-    def __init__(self):
-        super(Base, self).__init__()
 
 
-class Comment(Base):
+class User(SEObject):
     """
-    Stack Overflow Comment object which will hold information for use in Nidaba analysis.
+    Stack Overflow User object which will hold information for use in Nidaba analysis.
+    """
+
+    def __init__(self, data):
+        """
+        :param data: dict containing user information.
+        :return: None
+        """
+        super(User, self).__init__(data)  # name this class so SEObject.__init__ runs
+        self._data = data
+
+
+class Post(SEObject):
+    """
+    Base object for Question, Answer, Comments
     """
 
     def __init__(self, data):
         """
         :param data: Dict containing comment information.
         :return: None
         """
+        super(Post, self).__init__(data)  # name this class so SEObject.__init__ runs
+        self._data = data
+        self.body = self._data.get('Body', '')
+        self.text = self._get_text(self.body)  # text nodes outside <code> tags
+        self.code = self._get_code(self.body)  # text of each <code> tag
+
+    @classmethod
+    def _get_code(cls, html):  # returns the text of every <code> element in ``html``
+        return [i.get_text() for i in BeautifulSoup(html, 'html.parser').find_all('code')]
+
+    @classmethod
+    def _get_text(cls, html):
+        """Return the plain NavigableString nodes of ``html`` outside <code> tags."""
+        soup = BeautifulSoup(html, 'html.parser')  # fixed parser: same tree everywhere
+        [s.extract() for s in soup('code')]
+        return [i for i in soup.descendants if type(i) == NavigableString]
+
 
-        super(Comment, self).__init__()
+class Comment(Post):
+    """
+    Stack Overflow Comment object which will hold information for use in Nidaba analysis.
+    """
 
+    def __init__(self, data):
+        """
+        :param data: Dict containing comment information.
+        :return: None
+        """
+        super(Comment, self).__init__(data)
         self._data = data
+        # Post.__init__ above has already filled in body, text and code
+        # from data, so they are not recomputed (and the HTML is not parsed
+        # a second time) here.
 
 
-class Answer(Base):
+class Answer(Post):
     """
     Stack Overflow Answer object which will hold information for use in Nidaba analysis.
     """
@@ -35,12 +81,13 @@ def __init__(self, data):
         :return: None
         """
 
-        super(Answer, self).__init__()
-
-        self._data = data
+        super(Answer, self).__init__(data)
+        # Post.__init__ stores ``data`` on ``self._data`` and fills in
+        # ``body``, ``text`` and ``code``; reading ``self._data`` here
+        # without that call raised AttributeError.
 
 
-class Question(object):
+class Question(Post):
     """
     Stack Overflow Question object which will hold information for use in Nidaba analysis
     """
@@ -53,11 +100,10 @@ def __init__(self, data, answers=None, comments=None):
         :return: None
         """
 
-        super(Question, self).__init__()
-
+        super(SEObject, self).__init__()
         self._data = data
         self.body = self._data.get('Body', '')
-        self.text = strip_tags(self.body)
+        self.text = self._get_text(self.body)
         self.code = self._get_code(self.body)
 
         if answers is None:
@@ -69,23 +115,3 @@ def __init__(self, data, answers=None, comments=None):
             self.comments = []
         else:
             self.comments = [Comment(comm) for comm in comments]
-
-    @classmethod
-    def _get_code(cls, html):
-        code_start, code_end = makeHTMLTags('code')
-        code = code_start + SkipTo(code_end).setResultsName('body') + code_end
-        return [token.body for token, start, end in code.scanString(html)]
-
-
-class User(object):
-    """
-    Stack Overflow User object which will hold information for use in Nidaba analysis.
-    """
-
-    def __init__(self, data):
-        """
-        :param data: dict containing user information.
-        :return: None
-        """
-
-        self._data = data
diff --git a/nidaba/core/parser.py b/nidaba/core/parser.py
diff --git a/nidaba/core/test/__init__.py b/nidaba/core/test/__init__.py
@@ -0,0 +1 @@
+# Feature extraction tests
diff --git a/nidaba/core/test/test_post_object.py b/nidaba/core/test/test_post_object.py
@@ -0,0 +1,8 @@
+from ..objects import Post
+
+
+def test_post_object():
+    """A Post body is split into <code> contents and the remaining text."""
+    post = Post({'Body': '<p>bar</p><code>x=1</code>'})
+    assert post.code == ['x=1']
+    assert post.text == ['bar']
diff --git a/nidaba/features/_util/question.py b/nidaba/features/_util/question.py
@@ -26,6 +26,12 @@ def title_capitalisation_percentage(s):
 
 ### Body
 
+def code_percentage(code, text):
+    """Return code's share (0.0-1.0) of the non-whitespace characters; 0.0 if none."""
+    code_size = sum(sum(len(j.strip()) for j in i) for i in code)
+    total = code_size + sum(sum(len(j.strip()) for j in i) for i in text)
+    return float(code_size) / total if total else 0.0
+
 ### Code
 
 ### Tags

diff --git a/nidaba/features/test/test_questions_util.py b/nidaba/features/test/test_questions_util.py
@@ -26,6 +26,12 @@ def test_title_capitalisation_percentage():
 
 ### Body
 
+def test_code_percentage():
+    code = ['    x = 1    \n\n\n ', '              z = 2', ]
+    text = ['i want\t \t ',  'o       ', ]
+    assert question.code_percentage(code, text) == 0.5  # 6 of 12 non-blank chars
+    assert question.code_percentage([], text) == 0
+
 ### Code
 
 ### Tags