# This notebook enables comparing stanza and YAP for Hebrew text analysis

In [35]:
import pandas as pd
import stanza
import requests
import json
from time import sleep
from pandas.io.json import json_normalize

In [31]:
class Processor:
    """Analyse Hebrew text with stanza and with the YAP web API so the results can be compared.

    Attributes:
        heb_nlp: stanza pipeline for Hebrew running tokenize, mwt, pos, lemma and depparse.
        yap_token: token placed in the query string of the langndata request URL.
    """

    # Column order of the per-word table printed by print_stanza_analysis.
    _STANZA_COLUMNS = ["text", "lemma", "upos", "xpos", "head", "deprel", "feats"]

    def __init__(self, yap_token="MY_TOKEN"):
        """Load the Hebrew stanza pipeline and store the YAP API token.

        Args:
            yap_token: token for the langndata API. The default "MY_TOKEN" is a
                placeholder and has to be replaced with a real token before
                print_yap_analysis is used.
        """
        self.heb_nlp = stanza.Pipeline(lang='he', processors='tokenize,mwt,pos,lemma,depparse')
        #replace MY_TOKEN with the token you got from the langndata website
        self.yap_token = yap_token

    def _stanza_frame(self, text):
        """Run the stanza pipeline on `text` and return one DataFrame row per word.

        Rows follow document order (sentence -> token -> word) and are labelled
        starting at 1. An input that yields no words gives an empty frame.
        """
        doc = self.heb_nlp(text)
        rows = [
            (word.text, word.lemma, word.upos, word.xpos, word.head, word.deprel, word.feats)
            for sen in doc.sentences
            for token in sen.tokens
            for word in token.words
        ]
        frame = pd.DataFrame(rows, columns=self._STANZA_COLUMNS)
        # 1-based labels so that a row label can be read against the `head` column.
        # NOTE(review): labels run across the whole document; `head` values are
        # presumably per-sentence word ids - confirm for multi-sentence input.
        frame.index = pd.RangeIndex(1, len(frame) + 1)
        return frame

    def print_stanza_analysis(self, text):
        """Print the stanza word table for `text` (first 50 rows at most).

        Args:
            text: the Hebrew text to analyse; it is passed to the pipeline unchanged.
        """
        print(self._stanza_frame(text).head(50))

    @staticmethod
    def _yap_payload(text):
        """Return the UTF-8 encoded JSON request body ``{"data": <stripped text>}``.

        json.dumps takes care of quotes, backslashes, newlines and control
        characters; ensure_ascii=False keeps the Hebrew characters as raw UTF-8.
        """
        body = json.dumps({"data": text.strip()}, ensure_ascii=False, separators=(',', ':'))
        return body.encode('utf-8')

    def print_yap_analysis(self, text):
        """Send `text` to the langndata Hebrew parser and print its "md_lattice" as a table.

        Args:
            text: the Hebrew text to analyse; surrounding whitespace is stripped.

        Raises:
            requests.HTTPError: if the service answers with an error status.
            KeyError: if the JSON response has no "md_lattice" entry.
        """
        url = f'https://www.langndata.com/api/heb_parser?token={self.yap_token}'
        # Short pause before every request (presumably to stay polite to the API).
        sleep(0.5)
        r = requests.post(
            url,
            data=self._yap_payload(text),
            headers={'Content-type': 'application/json; charset=utf-8'},
            timeout=60,  # do not hang the notebook forever on a dead connection
        )
        r.raise_for_status()
        md_lattice = r.json()["md_lattice"]
        # pandas >= 1.0 exposes json_normalize at top level; older versions only
        # have it under pandas.io.json.
        normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
        res_df = normalize(list(md_lattice.values()))
        print(res_df)

In [30]:
# Demo sentence; the newline before and after it is part of the input string.
text = (
    "\n"
    "הוא הפיל הגדול ביותר בגן החיות."
    "\n"
)
processor = Processor()
# Show the stanza analysis; call processor.print_yap_analysis(text) instead to query YAP.
processor.print_stanza_analysis(text)

2020-05-07 11:13:37 INFO: Loading these models for language: he (Hebrew):
| Processor | Package |
-----------------------
| tokenize  | htb     |
| mwt       | htb     |
| pos       | htb     |
| lemma     | htb     |
| depparse  | htb     |

2020-05-07 11:13:37 INFO: Use device: cpu
2020-05-07 11:13:37 INFO: Loading: tokenize
2020-05-07 11:13:37 INFO: Loading: mwt
2020-05-07 11:13:37 INFO: Loading: pos
2020-05-07 11:13:40 INFO: Loading: lemma
2020-05-07 11:13:40 INFO: Loading: depparse
2020-05-07 11:13:42 INFO: Done loading processors!


     text  lemma   upos   xpos  head           deprel  \
1     הוא    הוא   PRON   PRON     2            nsubj   
2    הפיל   הפיל   VERB   VERB     0             root   
3       ה      ה    DET    DET     4          det:def   
4    גדול   גדול    ADJ    ADJ     2        parataxis   
5   ביותר  ביותר    ADV    ADV     4           advmod   
6       ב      ב    ADP    ADP     7             case   
7      גן     גן   NOUN   NOUN     2              obl   
8       ה      ה    DET    DET     9          det:def   
9    חיות    חיה   NOUN   NOUN     7  compound:smixut   
10      .      .  PUNCT  PUNCT     2            punct   

                                                feats  
1       Gender=Masc|Number=Sing|Person=3|PronType=Prs  
2   Gender=Masc|HebBinyan=HIFIL|Number=Sing|Person...  
3                                        PronType=Art  
4                             Gender=Masc|Number=Sing  
5                                                None  
6                                   

In [32]:
# Demo sentence; the newline before and after it is part of the input string.
text = (
    "\n"
    "הוא הפיל הגדול ביותר בגן החיות."
    "\n"
)
processor = Processor()
# Show the YAP analysis; call processor.print_stanza_analysis(text) instead to use stanza.
processor.print_yap_analysis(text)

2020-05-07 11:17:19 INFO: Loading these models for language: he (Hebrew):
| Processor | Package |
-----------------------
| tokenize  | htb     |
| mwt       | htb     |
| pos       | htb     |
| lemma     | htb     |
| depparse  | htb     |

2020-05-07 11:17:19 INFO: Use device: cpu
2020-05-07 11:17:19 INFO: Loading: tokenize
2020-05-07 11:17:19 INFO: Loading: mwt
2020-05-07 11:17:19 INFO: Loading: pos
2020-05-07 11:17:21 INFO: Loading: lemma
2020-05-07 11:17:21 INFO: Loading: depparse
2020-05-07 11:17:23 INFO: Done loading processors!


https://www.langndata.com/api/heb_parser?token=7af849ffe433366fa59173445cee7ac7
{"data":"הוא הפיל הגדול ביותר בגן החיות."}
    empty gen  lemma num num_2 num_last num_s_p per          pos        pos_2  \
0      -1   M    הוא   0     1        1       S   3          PRP          PRP   
1      -1  -1      ה   1     2        2      -1  -1          DEF          DEF   
2      -1   M    פיל   2     3        2       S  -1           NN           NN   
3      -1  -1      ה   3     4        3      -1  -1          DEF          DEF   
4      -1   M   גדול   4     5        3       S  -1           JJ           JJ   
5      -1  -1  ביותר   5     6        4      -1  -1           RB           RB   
6      -1  -1      ב   6     7        5      -1  -1  PREPOSITION  PREPOSITION   
7      -1  -1      ה   7     8        5      -1  -1          DEF          DEF   
8      -1   M     גן   8     9        5       S  -1           NN           NN   
9      -1  -1      ה   9    10        6      -1  -1          DEF   