In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Column labels for the headerless TSV, listed in file order.
column_names = [
    'source', 'reference', 'output',
    'ref_derivation', 'out_derivation', 'parser_error',
    'src_unigram', 'lp_nmt',
]

# header=None: the first line of the file is data, not column names.
df = pd.read_csv('../data/analysis/data.tsv', sep='\t', header=None)
# Missing fields become the literal string 'NA'; later cells compare
# against that sentinel (e.g. out_derivation == 'NA').
df = df.fillna('NA')
df.columns = column_names
df.head()

Unnamed: 0,source,reference,output,ref_derivation,out_derivation,parser_error,src_unigram,lp_nmt
0,"Aujourd&apos; hui , comme le sait très bien M....","These days , in spite of the generic_year_ne d...","Today , as Mr generic_proper_ne knows very wel...",(root_strict (flr-hd_nwh_c (np_adv_c (sp-hd_n_...,(root_strict (flr-hd_nwh_c (np_adv_c (hdn_bnp-...,,2820.71357,12.470377
1,"En tant que rapporteur , je me déclare satisfa...","As generic_mass_count_noun , I can say that I ...","As generic_mass_count_noun , I am satisfied wi...",(root_strict (flr-hd_nwh_c (hd-cmp_u_c (as_nba...,(root_strict (flr-hd_nwh_c (hd-cmp_u_c (as_nba...,,776.987937,3.555381
2,"Après tout , Nous nous retrouvons , somme tout...",It is after all an generic_adj supply in certa...,"After all , we have seen a lack of flexibility...",(root_strict (hd-aj_scp-pr_c (sb-hd_mc_c (hdn_...,(root_strict (hd-aj_scp-pr_c (flr-hd_nwh_c (w_...,,2166.446535,13.572359
3,J&apos; espère vraiment que nous devrons plus ...,I very much hope it will not take much longer ...,I very much hope that we will have to wait a l...,(root_informal (sb-hd_mc_c (hdn_bnp-qnt_c (i (...,,timed out (60 s) timed out (60 s),2481.521948,23.029966
4,C&apos; est pourquoi j&apos; ai voté contre ce...,I therefore voted against the recommendation s...,That is why I voted against this recommendatio...,(root_strict (hd-aj_scp_c (sb-hd_mc_c (hdn_bnp...,(root_strict (hd-aj_scp-pr_c (sb-hd_mc_c (hdn_...,,4164.627924,15.833892


In [3]:
# An output counts as "grammatical" here when it has a derivation,
# i.e. out_derivation is not the 'NA' sentinel.
df['grammatical'] = df['out_derivation'].ne('NA')

# Absolute counts, a blank line, then the same breakdown as proportions.
print(
    df['grammatical'].value_counts(),
    '',
    df['grammatical'].value_counts(normalize=True),
    sep='\n',
)

True     188992
False     14785
Name: grammatical, dtype: int64

True     0.927445
False    0.072555
Name: grammatical, dtype: float64


In [4]:
def root(s):
    """Return the root label of a derivation string.

    The label is the text between the first character (skipped, an opening
    parenthesis in the data shown) and the first space. Strings without any
    space, including the 'NA' sentinel, yield 'NA'.
    """
    head, space, _ = s.partition(' ')
    if not space:
        # No space at all: nothing to split a label from.
        return 'NA'
    return head[1:]
# Extract the root label from the reference and output derivations.
for derivation_col, root_col in (('ref_derivation', 'ref_root_cond'),
                                 ('out_derivation', 'out_root_cond')):
    df[root_col] = df[derivation_col].apply(root)

# Relative frequency of each root label, reference first, then translation.
ref_root_share = df['ref_root_cond'].value_counts(normalize=True)
out_root_share = df['out_root_cond'].value_counts(normalize=True)
print('Reference: ', ref_root_share, '', 'Translation: ', out_root_share, sep='\n')

Reference: 
root_strict      0.646795
root_informal    0.315055
root_frag        0.023918
root_inffrag     0.014231
Name: ref_root_cond, dtype: float64

Translation: 
root_strict      0.632142
root_informal    0.254872
NA               0.072893
root_frag        0.026485
root_inffrag     0.013608
Name: out_root_cond, dtype: float64


In [5]:
# Among rows with no output derivation, tally parser errors by their
# first 10 characters so variants of the same message group together.
no_derivation = df.out_derivation == 'NA'
error_prefixes = df.loc[no_derivation, 'parser_error'].str[:10]
print(error_prefixes.value_counts())
print()
# Widen displayed text columns for the cells that follow.
pd.set_option('display.max_colwidth', 100)

timed out     7208
NA            5966
no lexicon    1598
resource l       8
Chart is n       4
memory lim       1
Name: parser_error, dtype: int64



In [6]:
# Rows where no output derivation exists although the parser reported no error.
no_parse_no_error = (df.out_derivation == 'NA') & (df.parser_error == 'NA')
print(df[no_parse_no_error]['output'].sample(10))

# Whitespace-token count of each output sentence.
df['output_length'] = df['output'].apply(lambda x: len(x.split()))

# Build the CSV text BEFORE opening the target file: .sample(100) raises
# ValueError when fewer than 100 rows match, and opening first would leave
# a truncated file and an unclosed handle behind.
short_unparsed = df[no_parse_no_error & (df.output_length < 10)][['source', 'reference', 'output']]
gug_csv = short_unparsed.sample(100).to_csv()

# NOTE(review): absolute, machine-specific path kept as-is; consider deriving
# it from the same data directory the input TSV is read from.
with open('/iesl/canvas/jwei/nmt_hpsg/data/analysis/gug.csv', 'wt') as output:
    output.write(gug_csv)

149777    , for example , in areas inhabited by Muslims , girls who go to Christian schools are forced to ...
53353     The first that I would like to raise , and has already been mentioned , are the generic_proper_n...
181291                                   generic_adj sector is one of the most international cultural areas .
88418     Consequently , in the up up to the meeting in generic_proper_ne , we assume that the subject in ...
33120     These include the up up to the programme and the proper functioning of the Commission &apos; s s...
34366                must the European Union define the penalties , be it prison , fines or other penalties ?
25721     Furthermore , generic_proper_ne generic_card_ne million of the generic_mass_count_noun programme...
53312          You have accepted the role of in in generic_proper_ne of the Council for the next six months .
91595     Has it been expected that the European Parliament will have any influence over the composition o...
1167      

In [10]:
# Regression frame: only rows whose parser_error is the 'NA' sentinel,
# taken as an independent copy so the columns added below do not touch df.
# NOTE(review): the outputs saved in this notebook (203777 observations in
# the Logit summary, rows with parser errors in the head below) look like
# they were produced with the unfiltered alternative `reg_df = df`, and the
# execution counts are out of order -- re-run top to bottom to confirm.
without_parser_error = df.parser_error == 'NA'
reg_df = df.loc[without_parser_error].copy()

# Response: True when an output derivation exists.
reg_df['gug'] = reg_df.out_derivation != 'NA'

# Whitespace-token counts of the source and output sentences.
reg_df['src_len'] = reg_df['source'].str.split().str.len()
reg_df['out_len'] = reg_df['output'].str.split().str.len()

# Per-token averages of the two score columns.
reg_df['mean_lp_nmt'] = reg_df['lp_nmt'].div(reg_df['out_len'])
reg_df['mean_uni_lp'] = reg_df['src_unigram'].div(reg_df['src_len'])

# Constant column used as the intercept term of the regression.
reg_df['intercept'] = 1

reg_df.head()

Unnamed: 0,source,reference,output,ref_derivation,out_derivation,parser_error,src_unigram,lp_nmt,grammatical,ref_root_cond,out_root_cond,output_length,gug,src_len,out_len,mean_lp_nmt,mean_uni_lp,intercept
0,"Aujourd&apos; hui , comme le sait très bien M. Flynn , bien qu&apos; une directive sur l&apos; é...","These days , in spite of the generic_year_ne directive - with which Mr generic_proper_ne is very...","Today , as Mr generic_proper_ne knows very well , although a directive on equal treatment of men...","(root_strict (flr-hd_nwh_c (np_adv_c (sp-hd_n_c (these_det (""these"" ))(hdn-aj_redrel-pr_c (hdn-a...","(root_strict (flr-hd_nwh_c (np_adv_c (hdn_bnp-qnt_c (w_comma_plr (today_np (""today,"" )))))(aj-hd...",,2820.71357,12.470377,True,root_strict,root_strict,41,True,46,41,0.304156,61.31986,1
1,"En tant que rapporteur , je me déclare satisfait du résultat final .","As generic_mass_count_noun , I can say that I am satisfied with the final results .","As generic_mass_count_noun , I am satisfied with the end result .","(root_strict (flr-hd_nwh_c (hd-cmp_u_c (as_nbar (""as"" ))(w_comma_plr (n_ms-cnt_ilr (generic_mass...","(root_strict (flr-hd_nwh_c (hd-cmp_u_c (as_nbar (""as"" ))(w_comma_plr (n_ms-cnt_ilr (generic_mass...",,776.987937,3.555381,True,root_strict,root_strict,11,True,13,11,0.323216,59.768303,1
2,"Après tout , Nous nous retrouvons , somme toute , face à un manque d&apos; élasticité de l&apos;...","It is after all an generic_adj supply in certain circumstances , just as it is with doctors in t...","After all , we have seen a lack of flexibility in the supply in certain circumstances , as in th...","(root_strict (hd-aj_scp-pr_c (sb-hd_mc_c (hdn_bnp-qnt_c (it (""it"" )))(hd-cmp_u_c (be_c_is (""is"" ...","(root_strict (hd-aj_scp-pr_c (flr-hd_nwh_c (w_comma_plr (after_all_adv (""after all,"" )))(sb-hd_n...",,2166.446535,13.572359,True,root_strict,root_strict,26,True,34,26,0.522014,63.719016,1
3,J&apos; espère vraiment que nous devrons plus attendre longtemps pour obtenir une basez digne de...,I very much hope it will not take much longer until we really do achieve a basis of trust so tha...,I very much hope that we will have to wait a long time to obtain a dignified generic_mass_count_...,"(root_informal (sb-hd_mc_c (hdn_bnp-qnt_c (i (""i"" )))(vp-vp_crd-fin-t_c (aj-hd_int_c (very_much_...",,timed out (60 s) timed out (60 s),2481.521948,23.029966,False,root_informal,,34,False,37,34,0.677352,67.068161,1
4,"C&apos; est pourquoi j&apos; ai voté contre cette recommandation , puisque je pense que la coopé...",I therefore voted against the recommendation since I think enhanced cooperation is unacceptable ...,"That is why I voted against this recommendation , as I believe that enhanced cooperation is unac...","(root_strict (hd-aj_scp_c (sb-hd_mc_c (hdn_bnp-qnt_c (i (""i"" )))(aj-hd_scp-xp_c (therefore_adv (...","(root_strict (hd-aj_scp-pr_c (sb-hd_mc_c (hdn_bnp-qnt_c (hdn_optcmp_c (that_deix (""that"" ))))(hd...",,4164.627924,15.833892,True,root_strict,root_strict,48,True,55,48,0.329873,75.720508,1


In [8]:
# Z-score each predictor (subtract its mean, divide by its standard
# deviation); the intercept column is left untouched.
scaled_cols = ['mean_lp_nmt', 'mean_uni_lp', 'out_len']
for col in scaled_cols:
    values = reg_df[col]
    reg_df[col] = values.sub(values.mean()).div(values.std())
reg_df[scaled_cols + ['intercept']].head()

Unnamed: 0,mean_lp_nmt,mean_uni_lp,out_len,intercept
0,-0.662868,-0.514249,1.590792,1
1,-0.577865,-0.701333,-1.014223,1
2,0.308679,-0.224964,0.288285,1
3,1.001415,0.178868,0.982955,1
4,-0.548181,1.222149,2.198628,1


In [9]:
import statsmodels.api as sm

# Logistic regression of the 'gug' flag on the standardised predictors
# plus the explicit intercept column.
predictors = reg_df[['mean_lp_nmt', 'mean_uni_lp', 'out_len', 'intercept']]
lr = sm.Logit(reg_df['gug'], predictors).fit()
lr.summary()

Optimization terminated successfully.
         Current function value: 0.212791
         Iterations 8


0,1,2,3
Dep. Variable:,gug,No. Observations:,203777.0
Model:,Logit,Df Residuals:,203773.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 23 Aug 2018",Pseudo R-squ.:,0.1822
Time:,19:52:26,Log-Likelihood:,-43362.0
converged:,True,LL-Null:,-53022.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
mean_lp_nmt,-0.2031,0.008,-26.221,0.000,-0.218,-0.188
mean_uni_lp,0.0342,0.012,2.870,0.004,0.011,0.058
out_len,-1.1200,0.009,-125.437,0.000,-1.137,-1.102
intercept,3.0933,0.012,254.158,0.000,3.069,3.117
