In [1]:
!pip install transformers -q
!pip install lime -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-contrib-nlp 0.0.1b20210201 requires tokenizers==0.9.4, but you have tokenizers 0.13.2 which is incompatible.[0m
You should consider upgrading via the '/opt/conda/envs/rapids/bin/python3.7 -m pip install --upgrade pip' command.[0m
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-vision 0.3.1 requires Pillow<8.4.0,>=8.3.0, but you have pillow 9.3.0 which is incompatible.
autogluon-mxnet 0.3.1 requires Pillow<8.4.0,>=8.3.0, but you have pillow 9.3.0 which is incompatible.[0m
You should consider upgrading via the '/opt/conda/envs/rapids/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
%cd ..
from src.model_wrapper import Model
from src.explainer_lime import Lime_Explanation

/usr/src/app/data/XAI_Thesis


In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os
import logging

In [4]:
# --- run configuration -------------------------------------------------------

# question-answering checkpoint that is going to be explained
model_name = 'deepset/gelectra-base-germanquad'

# LIME settings (both are the default values)
number_of_samples = 5000
number_of_features = 10

# which part of the combined data this run works on;
# the possible chunks are A, B, C, D, E with 2700 rows each
chunk_tag = 'C'
target_path_for_combined_data = (
    './data/GermanQuAD_raw_combined/'
    f'GermanQuAD_raw_combined_{chunk_tag}.json'
)

In [5]:
# load the selected chunk of the combined raw data (test + train in one dataframe)
GQuAD_chunk_data = pd.read_json(path_or_buf=target_path_for_combined_data)

# preview of the first five rows
GQuAD_chunk_data.head()

Unnamed: 0,question_id,question,answers,context,document_id,is_impossible,usage
5490,59654,Welche Länder grenzen an Liberia?,"[{'answer_id': 62940, 'document_id': 57443, 'q...",Liberia\n\n== Geografie ==\nLiberia liegt im S...,57443,False,train
5491,37333,Wie wurde aus Osmium ein Glühfaden hergestellt?,"[{'answer_id': 37688, 'document_id': 41086, 'q...",Glühlampe\n\n==== Geschichte ====\nDie ersten ...,41086,False,test
5492,65917,Was passiert in der Pressenpartie bei der Papi...,"[{'answer_id': 70854, 'document_id': 57866, 'q...",Papier\n\n==== Pressen und Trocknen ====\nAm E...,57866,False,train
5493,37664,Wem unterstehen die Universitäten in der Schweiz?,"[{'answer_id': 38028, 'document_id': 40697, 'q...",Universität\n\n=== Struktur ===\nBildungsgänge...,40697,False,test
5494,41165,Warum hat die Bevölkerung angefangen zu wachse...,"[{'answer_id': 42011, 'document_id': 40872, 'q...",Mittelalter\n\n=== Hochmittelalter ===\nDas Ho...,40872,False,test


In [6]:
# model wrapper around the configured checkpoint
GELECTRA_MODEL = Model(model_name)

# LIME explainer working on that model; label 0 is the start token and
# label 1 the end token of the answer
LIME_EXPLAINER = Lime_Explanation(
    model=GELECTRA_MODEL,
    class_names=["Start-Token", "End-Token"],
    num_features=number_of_features,  # default value
    num_samples=number_of_samples,  # TODO: still needs a justification for this value
    random_state=0,
)

Downloading:   0%|          | 0.00/740 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/437M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/240k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# helper functions to enhance_data_frame

def get_prediction(row):
    """Return the model's answer string for a single question/context row.

    Args:
        row: object exposing ``question`` and ``context`` attributes, e.g. a
            row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The value produced by ``GELECTRA_MODEL.get_answer_string`` for the
        first two elements of the prediction output.
    """
    model_output = GELECTRA_MODEL.predict(row.question, row.context)

    # presumably elements 0 and 1 are the start/end outputs -- confirm in src.model_wrapper.
    # The predicted tokens were previously fetched as well but never used,
    # so that extra call per row has been dropped.
    return GELECTRA_MODEL.get_answer_string(model_output[0], model_output[1])

def get_lime_explanation(row):
    """Compute the LIME explanations for one question/context row.

    Args:
        row: object exposing ``question``, ``context`` and ``question_id``
            attributes, e.g. a row handed over by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        pd.Series with two entries: the explanation list for label 0
        (start token) and for label 1 (end token). If explaining the row
        fails, both entries are ``None``.
    """
    try:
        explanation = LIME_EXPLAINER.explain(row.question, row.context)

        explanation_start_token = explanation.as_list(label=0)
        explanation_end_token = explanation.as_list(label=1)

        return pd.Series([explanation_start_token, explanation_end_token])
    except Exception:
        # Only ordinary errors are swallowed: KeyboardInterrupt / SystemExit
        # must still be able to stop the long-running job.
        print(row.question_id)
        # A named logger is used on purpose: the module-level logging
        # functions would implicitly configure the root logger when it has
        # no handlers yet.
        logging.getLogger(__name__).exception(
            'LIME explanation failed for question_id %s', row.question_id
        )
        return pd.Series([None, None])
    
def enhance_data_frame(GQuAD_data):
    """Add the model prediction and both LIME explanations to the frame.

    The columns are written into ``GQuAD_data`` itself and the same object
    is returned.

    Args:
        GQuAD_data: DataFrame whose rows are passed to ``get_prediction``
            and ``get_lime_explanation``.

    Returns:
        ``GQuAD_data`` with the additional columns ``prediction``,
        ``start_token_exlanation_lime`` and ``end_token_explanation_lime``.
    """
    # column names are kept exactly as spelled here; they end up in the saved output
    lime_columns = ['start_token_exlanation_lime', 'end_token_explanation_lime']

    predicted_answers = GQuAD_data.apply(get_prediction, axis=1)
    GQuAD_data['prediction'] = predicted_answers

    lime_explanations = GQuAD_data.apply(get_lime_explanation, axis=1)
    GQuAD_data[lime_columns] = lime_explanations

    return GQuAD_data

In [8]:
# helper function: parallelizes the process (DOES NOT BRING A BENEFIT)
# def parallize_dataframe(dataframe, func, n_cores):
#     df_split = np.array_split(dataframe, n_cores)
#     pool = multiprocessing.Pool(n_cores)
    
#     dataframe = pd.concat(pool.map(func, df_split))
#     pool.close()
#     pool.join()
    
#     return dataframe

In [9]:
# create the output directory for this LIME configuration;
# makedirs also creates missing parent directories and does not fail
# when the directory already exists
dir_name = f'data/German_QuAD_with_lime/lime_ns_{number_of_samples}_nf_{number_of_features}'
os.makedirs(dir_name, exist_ok=True)

In [10]:
# create a logger
# NOTE: basicConfig has no effect if the root logger already has handlers.
logging.basicConfig(filename=f'{dir_name}/{chunk_tag}_0_log.log', # file name
                    filemode='a', # file mode a == append instead of overwrite
                    level=logging.DEBUG,
                    # without %(asctime)s in the format, datefmt is never used
                    # and the log lines carry no timestamp
                    format='%(asctime)s %(levelname)s:%(name)s:%(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S'
                   )

In [11]:
# cut the chunk into 45 sub-chunks: easier to handle, and every finished
# sub-chunk can be backed up before the next one starts
# TODO: once I know how long one row takes on the cluster, I can estimate
#       how large the sub_chunks should be
list_of_dfs = np.array_split(GQuAD_chunk_data, indices_or_sections=45)

len(list_of_dfs)

45

In [12]:
# work through the sub-chunks one by one; every result is written to disk
# immediately, and a failing sub-chunk is logged without stopping the run
for i, current_df_chunck in enumerate(list_of_dfs):
    try:
        first_index, last_index = current_df_chunck.index[0], current_df_chunck.index[-1]

        logging.info(f'Starting on new sub_chunk. The first index is: {first_index}')

        # add prediction and LIME explanations to this sub-chunk
        df_sub_chuck_explained = enhance_data_frame(current_df_chunck)

        # the file name carries a 1-based counter and the total number of sub-chunks
        df_sub_chuck_explained.to_json(
            f'{dir_name}/{chunk_tag}_{i + 1}_von_{len(list_of_dfs)}_GermanQuAD_explained_lime.json'
        )

        # remember the index of the last row that has been processed
        logging.info(f'One sub_chunk has been finished and safed. The last index is: {last_index}')

    except Exception as e:
        logging.critical(e, exc_info=True)
        

56264
36426
53206
59519
56206
60100
