diff --git a/research/nlp_tests.ipynb b/research/nlp_tests.ipynb index bd13164..c9b00fd 100644 --- a/research/nlp_tests.ipynb +++ b/research/nlp_tests.ipynb @@ -387,20 +387,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " jumps \n", - " ________|______________ \n", - " | | over \n", - " | | | \n", - " | fox dog \n", - " | ____|_____ ___|____ \n", - " . The quick brown the lazy\n", + " is \n", + " _______________________________|_________ \n", + " | | is \n", + " | | ____________________________|________________________ \n", + " | | | | | | are | | \n", + " | | | | | | _________|_______ | | \n", + " | | | | | | | | easy | do \n", + " | | | | | | | | | | ___|_____ \n", + " | downside | | | | | programs use easy | analysis \n", + " | | | | | | | | | | | _____|_______ \n", + " . The that , , it because statistical to equally to the wrong\n", "\n" ] }, @@ -410,7 +414,7 @@ "[None]" ] }, - "execution_count": 1, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -422,7 +426,7 @@ "\n", "en_nlp = spacy.load('en')\n", "\n", - "doc = en_nlp(\"The quick brown fox jumps over the lazy dog.\")\n", + "doc = en_nlp(\"The downside is that, because statistical programs are easy to use, it is equally easy to do the wrong analysis.\")\n", "\n", "def to_nltk_tree(node):\n", " if node.n_lefts + node.n_rights > 0:\n", diff --git a/research/sentence_structure.ipynb b/research/sentence_structure.ipynb index ed5b1ae..11557ed 100644 --- a/research/sentence_structure.ipynb +++ b/research/sentence_structure.ipynb @@ -52,25 +52,25 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "cIrnmjio\n" + "nsubjauxrootpunct\n" ] } ], "source": [ "import spacy\n", "nlp = spacy.load('en_core_web_sm')\n", - "doc = nlp(\"I am going out with my family.\")\n", + "doc = nlp(\"We are walking.\")\n", "\n", "sent_struct = []\n", "for token in doc:\n", - " sent_struct.append(deps_dict[token.dep_.lower()])\n", + " sent_struct.append(token.dep_.lower())\n", "\n", "sentence_code = ''.join(sent_struct)\n", "print(sentence_code)" diff --git a/src/checker.py b/src/checker.py index 5a999e3..8a69b30 100644 --- a/src/checker.py +++ b/src/checker.py @@ -45,6 +45,8 @@ def decode_coding(code): return decoded_list def VB_VB_VB_correction(payload, raw_text, error_count): # correct errors of type has-been-walking + if 'been' not in raw_text.split(): + return raw_text, error_count if(payload.tag_[:2] != 'VB' and payload.tag_[:2] != 'NN' and payload.tag_[:2] != 'JJ'): return raw_text, error_count for ch in payload.children: @@ -92,14 +94,37 @@ def VB_VB_VB_correction(payload, raw_text, error_count): # correct errors of typ def VB_VB_correction(payload, raw_text, error_count): # correct errors of type is-walking OR has-cooked if(payload.tag_[:2] != 'VB'): return raw_text, error_count + nounBeforeVerb = False + nounAfterVerb = False + verbFound = False + if(payload.text == 'is' or payload.text == 'was' or payload.text == 'are' or payload.text == 'were'): + return raw_text, error_count + for ch in payload.children: + if(ch.tag_[:2] == 'VB'): + verbFound = True + if((not verbFound) and (ch.dep_ == 'nsubj')): + print(ch.lower_) + nounBeforeVerb = True + if(verbFound and (ch.dep_ == 'nsubj')): + nounAfterVerb = True + + ifHave = False + ifBeen = False if(ch.tag_[:2] == 'VB'): # this might need to be removed dummy, error_count = VB_VB_VB_correction(ch, raw_text, error_count) try: if(ch.lower_ == 'has') or (ch.lower_ == 'have') or (ch.lower_ == 'had'): + ifHave = True + if(ch.lower_ == 'been' or payload.text == "been"): + ifBeen = True + + if(ifHave and ifBeen): x = conjugate(verb=lemma(payload.text), tense=PAST+PARTICIPLE, mood=INDICATIVE, person=1, number=PL) - else: + elif(nounBeforeVerb and ((ch.lower_ == 'is') or (ch.lower_ == 'are') or (ch.lower_ == 'was') or (ch.lower_ == 'was') or (ch.lower_ == 'were'))): x = conjugate(verb=lemma(payload.text), tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL) + else: + x = payload.text if(x != payload.text): error_count += 1 diff --git a/test/sample1.docx b/test/sample1.docx deleted file mode 100644 index 46e1751..0000000 Binary files a/test/sample1.docx and /dev/null differ diff --git a/test/sample2.docx b/test/sample2.docx deleted file mode 100644 index 7e27732..0000000 Binary files a/test/sample2.docx and /dev/null differ