algorithm errors fixed

rounakdatta · Jul 10, 2018 · 0fa3d39 · 0fa3d39
1 parent b2d4913
commit 0fa3d39
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 15 deletions.
diff --git a/research/nlp_tests.ipynb b/research/nlp_tests.ipynb
@@ -387,20 +387,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "        jumps                    \n",
-      "  ________|______________         \n",
-      " |        |             over     \n",
-      " |        |              |        \n",
-      " |       fox            dog      \n",
-      " |    ____|_____      ___|____    \n",
-      " .  The quick brown the      lazy\n",
+      "                                 is                                                  \n",
+      "  _______________________________|_________                                           \n",
+      " |     |                                   is                                        \n",
+      " |     |       ____________________________|________________________                  \n",
+      " |     |      |    |   |   |              are             |         |                \n",
+      " |     |      |    |   |   |      _________|_______       |         |                 \n",
+      " |     |      |    |   |   |     |         |      easy    |         do               \n",
+      " |     |      |    |   |   |     |         |       |      |      ___|_____            \n",
+      " |  downside  |    |   |   |     |      programs  use    easy   |      analysis      \n",
+      " |     |      |    |   |   |     |         |       |      |     |    _____|_______    \n",
+      " .    The    that  ,   ,   it because statistical  to  equally  to the          wrong\n",
       "\n"
      ]
     },
@@ -410,7 +414,7 @@
        "[None]"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -422,7 +426,7 @@
     "\n",
     "en_nlp = spacy.load('en')\n",
     "\n",
-    "doc = en_nlp(\"The quick brown fox jumps over the lazy dog.\")\n",
+    "doc = en_nlp(\"The downside is that, because statistical programs are easy to use, it is equally easy to do the wrong analysis.\")\n",
     "\n",
     "def to_nltk_tree(node):\n",
     "    if node.n_lefts + node.n_rights > 0:\n",

diff --git a/research/sentence_structure.ipynb b/research/sentence_structure.ipynb
@@ -52,25 +52,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "cIrnmjio\n"
+      "nsubjauxrootpunct\n"
      ]
     }
    ],
    "source": [
     "import spacy\n",
     "nlp = spacy.load('en_core_web_sm')\n",
-    "doc = nlp(\"I am going out with my family.\")\n",
+    "doc = nlp(\"We are walking.\")\n",
     "\n",
     "sent_struct = []\n",
     "for token in doc:\n",
-    "    sent_struct.append(deps_dict[token.dep_.lower()])\n",
+    "    sent_struct.append(token.dep_.lower())\n",
     "\n",
     "sentence_code = ''.join(sent_struct)\n",
     "print(sentence_code)"

diff --git a/src/checker.py b/src/checker.py
@@ -45,6 +45,8 @@ def decode_coding(code):
 	return decoded_list
 
 def VB_VB_VB_correction(payload, raw_text, error_count): # correct errors of type has-been-walking
+	if 'been' not in raw_text.split():
+		return raw_text, error_count
 	if(payload.tag_[:2] != 'VB' and payload.tag_[:2] != 'NN'  and payload.tag_[:2] != 'JJ'):
 		return raw_text, error_count
 	for ch in payload.children:
@@ -92,14 +94,37 @@ def VB_VB_VB_correction(payload, raw_text, error_count): # correct errors of typ
 def VB_VB_correction(payload, raw_text, error_count): # correct errors of type is-walking OR has-cooked
 	if(payload.tag_[:2] != 'VB'):
 		return raw_text, error_count
+	nounBeforeVerb = False
+	nounAfterVerb = False
+	verbFound = False
+	if(payload.text == 'is' or payload.text == 'was' or payload.text == 'are' or payload.text == 'were'):
+		return raw_text, error_count
+
 	for ch in payload.children:
+		if(ch.tag_[:2] == 'VB'):
+			verbFound = True
+		if((not verbFound) and (ch.dep_ == 'nsubj')):
+			print(ch.lower_)
+			nounBeforeVerb = True
+		if(verbFound and (ch.dep_ == 'nsubj')):
+			nounAfterVerb = True
+
+		ifHave = False
+		ifBeen = False
 		if(ch.tag_[:2] == 'VB'): # this might need to be removed
 			dummy, error_count = VB_VB_VB_correction(ch, raw_text, error_count)	
 			try:
 				if(ch.lower_ == 'has') or (ch.lower_ == 'have') or (ch.lower_ == 'had'):
+					ifHave = True
+				if(ch.lower_ == 'been' or payload.text == "been"):
+					ifBeen = True
+
+				if(ifHave and ifBeen):
 					x = conjugate(verb=lemma(payload.text), tense=PAST+PARTICIPLE, mood=INDICATIVE, person=1, number=PL)
-				else:
+				elif(nounBeforeVerb and ((ch.lower_ == 'is') or (ch.lower_ == 'are') or (ch.lower_ == 'was') or (ch.lower_ == 'was') or (ch.lower_ == 'were'))):
 					x = conjugate(verb=lemma(payload.text), tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)
+				else:
+					x = payload.text
 
 				if(x != payload.text):
 					error_count += 1

diff --git a/test/sample1.docx b/test/sample1.docx
diff --git a/test/sample2.docx b/test/sample2.docx