# Analysing the feature importance

blablabla

## Preparation

In [1]:
%cd ..

C:\Users\Stefan.Beuchert\Desktop\backup_from_kubectl


In [2]:
#!pip install spacy
#!pip install wasabi==0.9.1
#!python -m spacy download de_core_news_sm

In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
import matplotlib.pyplot as plt

import nltk
import spacy

from IPython.display import display_html 

In [4]:
# config
number_of_features = 10 # the default value
number_of_samples = 5000 # the default value
source_file = f'data/German_QuAD_with_lime/lime_ns_{number_of_samples}_nf_{number_of_features}/Combined_Data_GermanQuAD_explained_lime.json'

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
german_stop_words = nltk.corpus.stopwords.words('german')

spacy_nlp = spacy.load("de_core_news_sm")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Stefan.Beuchert\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stefan.Beuchert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Helper functions

In [None]:
# enhance data with POS and NER

def enhance_data(data_df):
    data_df['word'] = data_df['word'].apply(str.lower)
    
    # add pos tagger ATTENTION currently the pos is calculated from a list of words, those words should not be in a list and change the outcome. Pos should be calculated at the basis of the full input of the model, not reduced output.
    list_pos_for_list_of_words = nltk.tag.pos_tag(data_df['word'].tolist())
    data_df['pos'] = [i[1] for i in list_pos_for_list_of_words]
    
    # add NER ATTENTION this part is really really bad, the results are not to trust!!!!
    # get ents
    list_of_tokens = data_df['word'].tolist()

    # split list in half to get ner (there is a maximum of tokens we can use)
    list_of_tokens_tmp_a = list_of_tokens[:len(list_of_tokens)//2]
    list_of_tokens_tmp_b = list_of_tokens[len(list_of_tokens)//2:]

    # get ner
    ner_tmp_a_raw = spacy_nlp(' '.join(list_of_tokens_tmp_a))
    ner_tmp_b_raw = spacy_nlp(' '.join(list_of_tokens_tmp_b))
    
    ner_tmp_a = [ent.label_ for ent in ner_tmp_a_raw.ents]
    ner_tmp_b = [ent.label_ for ent in ner_tmp_b_raw.ents]

    all_ner = ner_tmp_a + ner_tmp_b

    all_ner_and_nones = all_ner + [None] * (len(start_tokens_df) - len(all_ner))
    
    data_df['ner'] = all_ner_and_nones
    
    return data_df

def get_frequencie(lime_explanation_df, max_tokens, mode, target, remove_stop_words):
    
    # input should be a df with the columns 'word' and 'weights'
    
    def clean_data(data_df):
        if remove_stop_words:
            data_df = data_df[~data_df['word'].isin(german_stop_words)]
            
        return data_df
        
    # prepare data
    lime_explanation_df_clean = clean_data(lime_explanation_df)
    
    # get frequencies
    frequencies_df = lime_explanation_df_clean.groupby(target)['weigth'].agg(['sum','count'])
    frequencies_df = frequencies_df.reset_index()
    
    # return total of most frequent words, weights not included
    if mode == 'frq':
        return frequencies_df[[target, 'count']].sort_values(by=['count'], ascending=False).head(max_tokens)
        
    # return most important word by calculation the l2 (or euclidean) distance for a vectore (coung, sum) to the origin (0,0)
    elif mode == 'l2-score':
        frequencies_df['l2'] = frequencies_df.apply(lambda row: np.linalg.norm(np.array((row['count'], row['sum']))), axis=1)
        return frequencies_df[[target, 'l2']].sort_values(by=['l2'], ascending=False).head(max_tokens)
        
    # res is a touple with (word_list, freq_list)
    else:
        print("WARNING, wrong mode")
        
def display_frequencies(tokens_dict, max_tokens = 15, mode = 'frq', target = 'word', remove_stop_words = False):
    start_token_word_freq_df = get_frequencie(tokens_dict['start_tokens_df'], max_tokens, mode, target, remove_stop_words)
    start_token_positive_word_freq_df = get_frequencie(tokens_dict['start_tokens_positive_df'], max_tokens, mode, target, remove_stop_words)
    start_token_negative_word_freq_df = get_frequencie(tokens_dict['start_tokens_negative_df'], max_tokens, mode, target, remove_stop_words)

    start_token_styler = start_token_word_freq_df.style.set_table_attributes("style='display:inline'").set_caption('start total')
    start_token_positive_styler = start_token_positive_word_freq_df.style.set_table_attributes("style='display:inline'").set_caption('start positive')
    start_token_negative_styler = start_token_negative_word_freq_df.style.set_table_attributes("style='display:inline'").set_caption('start negative')

    end_token_word_freq_df = get_frequencie(tokens_dict['end_tokens_df'], max_tokens, mode, target, remove_stop_words)
    end_token_positive_word_freq_df = get_frequencie(tokens_dict['end_tokens_positive_df'], max_tokens, mode, target, remove_stop_words)
    end_token_negative_word_freq_df = get_frequencie(tokens_dict['end_tokens_negative_df'], max_tokens, mode, target, remove_stop_words)

    end_token_styler = end_token_word_freq_df.style.set_table_attributes("style='display:inline'").set_caption('end total')
    end_token_positive_styler = end_token_positive_word_freq_df.style.set_table_attributes("style='display:inline'").set_caption('end positive')
    end_token_negative_styler = end_token_negative_word_freq_df.style.set_table_attributes("style='display:inline'").set_caption('end negative')

    display_html(start_token_styler._repr_html_() + start_token_positive_styler._repr_html_() + start_token_negative_styler._repr_html_() + 
                 end_token_styler._repr_html_() + end_token_positive_styler._repr_html_() + end_token_negative_styler._repr_html_(), raw=True)

### Get Data

In [32]:
all_data_df = pd.read_json(source_file)

test_data_df = all_data_df[all_data_df['usage'] == 'test']

len_test_data_df = len(test_data_df)
test_data_df = test_data_df.dropna()
len_test_data_dropna_df = len(test_data_df)

print(f'{len_test_data_df - len_test_data_dropna_df} of the {len_test_data_df} have been droped because the contained *None* values. {len_test_data_dropna_df} rows remaining')

df.head(3)

15 of the 2204 have been droped because the contained *None* values. 2189 rows remaining


Unnamed: 0,question_id,question,answers,context,document_id,is_impossible,usage,prediction,start_token_exlanation_lime,end_token_explanation_lime
0,67118,Warum sind höhere Temperaturen für die Bildung...,"[{'answer_id': 72940, 'document_id': 73863, 'q...",Gletscher\n\n=== Akkumulation und Metamorphose...,73863,False,train,Zum einen bilden sich in wärmeren ( temperiert...,"[[Warum, 0.1728582717], [Temperaturen, 0.11620...","[[Warum, 0.0826549458], [kann, 0.0461802756], ..."
1,41385,Mit welchen Päpsten hat sich Elisabeth II getr...,"[{'answer_id': 43009, 'document_id': 40918, 'q...",Elisabeth_II.\n\n== Persönlichkeit und öffentl...,40918,False,test,"Johannes XXIII., Johannes Paul II. und Benedik...","[[Päpste, 0.15638861680000002], [welchen, 0.14...","[[welchen, 0.2098591445], [Päpsten, 0.10988179..."
3,36706,In welchem Bundesstaat liegt Melbourne?,"[{'answer_id': 37054, 'document_id': 40673, 'q...",Melbourne\n\n=== Stadtregierung ===\nDie Metro...,40673,False,test,Victoria,"[[Bundesstaat, 0.1435744051], [Victoria, 0.085...","[[Bundesstaates, 0.24585934180000002], [Victor..."


In [33]:
# extract tokens

# start tokens
start_tokens_raw = [item for sublist in test_data_df['start_token_exlanation_lime'].tolist() for item in sublist]
start_tokens_raw_df = pd.DataFrame(start_tokens, columns =['word', 'weigth'])

# end tokens
end_tokens_raw = [item for sublist in test_data_df['end_token_explanation_lime'].tolist() for item in sublist]
end_tokens_raw_df = pd.DataFrame(end_tokens, columns =['word', 'weigth'])

In [34]:
start_tokens_df.head(3)

Unnamed: 0,word,weigth,pos,ner
0,warum,0.172858,NN,PER
1,temperaturen,0.116209,NN,LOC
2,vorteilhaft,0.073482,NN,LOC


In [35]:
# save different tokens subsets as df

token_subsets_dict = {}

# main categories
token_subsets_dict['start_tokens_df'] = enhance_data(start_tokens_raw_df)
token_subsets_dict['end_tokens_df'] = enhance_data(end_tokens_raw_df)

# sub categories
token_subsets_dict['start_tokens_positive_df'] = token_subsets_dict['start_tokens_df'][token_subsets_dict['start_tokens_df']['weigth'] > 0]
token_subsets_dict['start_tokens_negative_df'] = token_subsets_dict['start_tokens_df'][token_subsets_dict['start_tokens_df']['weigth'] < 0]

token_subsets_dict['end_tokens_positive_df'] = token_subsets_dict['end_tokens_df'][token_subsets_dict['end_tokens_df']['weigth'] > 0]
token_subsets_dict['end_tokens_negative_df'] = token_subsets_dict['end_tokens_df'][token_subsets_dict['end_tokens_df']['weigth'] < 0]

# check
print(f'number of start token values: {len(token_subsets_dict["start_tokens_df"])} - \n'
      f'number of start token with positive weights: {len(token_subsets_dict["start_tokens_positive_df"])} - \n'
      f'number of start tokens with negative weights: {len(token_subsets_dict["start_tokens_negative_df"])} \n'
      f'is {len(token_subsets_dict["start_tokens_df"]) - len(token_subsets_dict["start_tokens_positive_df"]) - len(token_subsets_dict["start_tokens_negative_df"])} \n'
     )

print(f'number of end token values: {len(token_subsets_dict["end_tokens_df"])} - \n'
      f'number of end token with positive weights: {len(token_subsets_dict["end_tokens_positive_df"])} - \n'
      f'number of end tokens with negative weights: {len(token_subsets_dict["end_tokens_negative_df"])} \n'
      f'is {len(token_subsets_dict["end_tokens_df"]) - len(token_subsets_dict["end_tokens_positive_df"]) - len(token_subsets_dict["end_tokens_negative_df"])} \n'
     )

print('if there are all zeros, everything is good :)')

number of start token values: 136790 - 
number of start token with positive weights: 108734 - 
number of start tokens with negative weights: 28056 
is 0 

number of end token values: 136790 - 
number of end token with positive weights: 111601 - 
number of end tokens with negative weights: 25189 
is 0 

if there are all zeros, everything is good :)


In [36]:
token_subsets_dict['start_tokens_df'].head(3)

Unnamed: 0,word,weigth,pos,ner
0,warum,0.172858,NN,PER
1,temperaturen,0.116209,NN,LOC
2,vorteilhaft,0.073482,NN,LOC


## Analysing

### Total word frequencies 

In [37]:
display_frequencies(token_subsets_dict)

Unnamed: 0,word,count
5660,die,2846
25121,wie,2626
5524,der,2187
24678,was,2167
24872,welche,2045
12018,ist,1879
11572,in,1660
22989,und,1635
24644,wann,1505
24331,von,969

Unnamed: 0,word,count
21894,wie,2481
21496,was,2098
21668,welche,1973
4956,die,1920
10465,ist,1722
21464,wann,1484
20000,und,1252
4838,der,1063
10080,in,964
22173,wurde,826

Unnamed: 0,word,count
1912,der,1124
1966,die,926
3981,in,696
8258,von,455
3958,im,441
7785,und,383
1852,das,337
1903,den,292
602,am,233
1894,dem,233

Unnamed: 0,word,count
26410,wie,2643
5953,die,2443
25962,was,2230
26161,welche,2092
24166,und,2041
5812,der,1817
12637,ist,1739
12159,in,1532
25930,wann,1520
26729,wurde,906

Unnamed: 0,word,count
22546,wie,2534
22160,was,2186
5104,die,2059
22328,welche,2036
10813,ist,1577
20623,und,1564
22131,wann,1513
4980,der,1329
10400,in,1029
22832,wurde,820

Unnamed: 0,word,count
4506,in,503
2165,der,488
8731,und,477
2218,die,384
4480,im,236
9277,von,214
4668,ist,162
660,als,157
2098,das,145
5915,mit,132


### Word frequencies weighted by LIME value

In [38]:
display_frequencies(token_subsets_dict, mode = 'l2-score')

Unnamed: 0,word,l2
5660,die,2846.45173
25121,wie,2636.517098
5524,der,2187.000566
24678,was,2185.054543
24872,welche,2057.900642
12018,ist,1880.704531
11572,in,1660.149467
22989,und,1635.560385
24644,wann,1540.644579
24331,von,969.026535

Unnamed: 0,word,l2
21894,wie,2492.587554
21496,was,2116.876719
21668,welche,1986.615069
4956,die,1921.861616
10465,ist,1724.075655
21464,wann,1520.266238
20000,und,1253.14399
4838,der,1063.884657
10080,in,965.090176
22173,wurde,826.895025

Unnamed: 0,word,l2
1912,der,1124.898433
1966,die,926.618869
3981,in,696.3994
8258,von,455.282286
3958,im,441.26627
7785,und,383.150074
1852,das,337.226136
1903,den,292.271864
602,am,233.289482
1894,dem,233.183287

Unnamed: 0,word,l2
26410,wie,2655.437718
5953,die,2444.229527
25962,was,2256.124146
26161,welche,2112.106511
24166,und,2042.033318
5812,der,1817.478765
12637,ist,1740.599457
25930,wann,1585.387437
12159,in,1532.337152
26729,wurde,906.812892

Unnamed: 0,word,l2
22546,wie,2547.248206
22160,was,2212.769442
5104,die,2060.837589
22328,welche,2056.849522
10813,ist,1578.988315
22131,wann,1578.73329
20623,und,1566.031097
4980,der,1330.18072
10400,in,1030.065823
22832,wurde,821.006559

Unnamed: 0,word,l2
4506,in,503.21487
2165,der,488.210043
8731,und,477.228893
2218,die,384.117282
4480,im,236.083128
9277,von,214.083116
4668,ist,162.065678
660,als,157.062368
2098,das,145.035459
5915,mit,132.044728


### Total word frequencies with removed stopwords

In [39]:
display_frequencies(token_subsets_dict, remove_stop_words = True)

Unnamed: 0,word,count
24497,wann,1505
25271,wurde,915
24808,wer,773
24019,viele,741
24528,warum,601
19899,seit,489
9720,gibt,332
25272,wurden,234
11991,jahr,226
20884,stadt,219

Unnamed: 0,word,count
21323,wann,1484
22010,wurde,826
21604,wer,757
20912,viele,713
21352,warum,585
17297,seit,368
8484,gibt,317
22011,wurden,205
18148,stadt,199
3545,bezeichnet,174

Unnamed: 0,word,count
2549,etwa,160
6646,seit,121
8477,wurde,89
6369,rund,72
4071,jahr,69
2488,erst,54
398,ab,53
1209,beispielsweise,43
1277,bereits,41
8707,zwei,38

Unnamed: 0,word,count
25782,wann,1520
26559,wurde,906
25293,viele,775
26100,wer,770
25811,warum,579
20982,seit,506
10240,gibt,279
26560,wurden,250
15749,menschen,227
21985,stadt,226

Unnamed: 0,word,count
21987,wann,1513
22666,wurde,820
22262,wer,757
21585,viele,741
22013,warum,552
17870,seit,466
8764,gibt,243
22667,wurden,211
18726,stadt,205
18374,sowie,190

Unnamed: 0,word,count
5714,menschen,92
9527,wurde,86
4607,jahr,60
9789,zwei,43
7506,seit,40
9528,wurden,39
435,ab,37
2464,einwohner,37
3806,gibt,36
4609,jahren,35


### Word frequencies weighted by LIME value with removed stopwords

In [40]:
display_frequencies(token_subsets_dict, mode = 'l2-score', remove_stop_words = True)

Unnamed: 0,word,l2
24497,wann,1540.644579
25271,wurde,915.718308
24808,wer,790.066121
24019,viele,747.372667
24528,warum,614.074588
19899,seit,489.596764
9720,gibt,332.286905
25272,wurden,234.167018
11991,jahr,226.955042
20884,stadt,219.501924

Unnamed: 0,word,l2
21323,wann,1520.266238
22010,wurde,826.895025
21604,wer,774.472051
20912,viele,719.735609
21352,warum,598.507619
17297,seit,369.128993
8484,gibt,317.320215
22011,wurden,205.225528
18148,stadt,199.59108
3545,bezeichnet,174.356935

Unnamed: 0,word,l2
2549,etwa,160.190537
6646,seit,121.090557
8477,wurde,89.027175
6369,rund,72.101508
4071,jahr,69.052546
2488,erst,54.046057
398,ab,53.039881
1209,beispielsweise,43.040336
1277,bereits,41.052434
8707,zwei,38.01853

Unnamed: 0,word,l2
25782,wann,1585.387437
26559,wurde,906.812892
26100,wer,793.58147
25293,viele,783.403544
25811,warum,588.169831
20982,seit,507.475791
10240,gibt,279.155405
26560,wurden,250.162087
15749,menschen,227.011773
21985,stadt,226.575162

Unnamed: 0,word,l2
21987,wann,1578.73329
22666,wurde,821.006559
22262,wer,781.063054
21585,viele,749.928669
22013,warum,561.79396
17870,seit,467.688245
8764,gibt,243.215734
22667,wurden,211.237895
18726,stadt,205.693613
18374,sowie,190.222562

Unnamed: 0,word,l2
5714,menschen,92.113446
9527,wurde,86.029538
4607,jahr,60.027481
9789,zwei,43.016859
7506,seit,40.013223
9528,wurden,39.013297
2464,einwohner,37.025353
435,ab,37.012978
3806,gibt,36.011967
4609,jahren,35.019413


## Using POS and NER instead of raw tokens

https://arxiv.org/abs/2011.06993 (state of the art)

In [41]:
display_frequencies(token_subsets_dict, target = 'pos')

Unnamed: 0,pos,count
11,NN,76719
7,JJ,22143
26,VBP,6904
13,NNS,5840
2,CD,5669
23,VBD,4371
6,IN,3559
12,NNP,1956
27,VBZ,1912
18,RB,1392

Unnamed: 0,pos,count
11,NN,61205
7,JJ,17622
26,VBP,5234
2,CD,4585
13,NNS,4483
23,VBD,3959
6,IN,2477
27,VBZ,1428
12,NNP,1423
18,RB,993

Unnamed: 0,pos,count
11,NN,15514
7,JJ,4521
24,VBP,1670
13,NNS,1357
2,CD,1084
6,IN,1082
12,NNP,533
25,VBZ,484
21,VBD,412
16,RB,399

Unnamed: 0,pos,count
11,NN,76618
7,JJ,22002
27,VBP,6721
2,CD,5997
13,NNS,5846
24,VBD,4498
6,IN,3672
12,NNP,1906
28,VBZ,1805
18,RB,1265

Unnamed: 0,pos,count
11,NN,62377
7,JJ,18001
27,VBP,5485
2,CD,4941
13,NNS,4660
24,VBD,4066
6,IN,2737
12,NNP,1480
28,VBZ,1408
18,RB,991

Unnamed: 0,pos,count
11,NN,14241
7,JJ,4001
25,VBP,1236
13,NNS,1186
2,CD,1056
6,IN,935
22,VBD,432
12,NNP,426
26,VBZ,397
18,RB,274


In [42]:
display_frequencies(token_subsets_dict, target = 'ner')

Unnamed: 0,ner,count
0,LOC,5856
3,PER,3128
1,MISC,2641
2,ORG,1347

Unnamed: 0,ner,count
0,LOC,4680
3,PER,2514
1,MISC,2079
2,ORG,1074

Unnamed: 0,ner,count
0,LOC,1176
3,PER,614
1,MISC,562
2,ORG,273

Unnamed: 0,ner,count
0,LOC,5770
3,PER,3232
1,MISC,2624
2,ORG,1363

Unnamed: 0,ner,count
0,LOC,4683
3,PER,2641
1,MISC,2157
2,ORG,1133

Unnamed: 0,ner,count
0,LOC,1087
3,PER,591
1,MISC,467
2,ORG,230
