In [1]:
# Step up one directory so that `config`, `src` and the relative `./data/...`
# paths used below resolve (presumably this is the project root - confirm).
# NOTE: not idempotent - re-running this cell moves up one more level.
%cd ..
import config
from src.model_wrapper import Model
from src.explainer_lime import Lime_Explanation
from src.explainer_full import get_full_explanation_for_data_point

C:\Users\Stefan.Beuchert\Desktop\backup_from_kubectl


In [2]:
import pandas as pd
#pd.options.mode.chained_assignment = None
import numpy as np
import os
import logging
import spacy

In [3]:
# Run configuration: the raw chunk to process and the LIME settings.

# One of the chunk tags A, B, C, D, E (originally noted as 2700 rows each).
# NOTE(review): the row count recorded further down for chunk A is 735 - confirm.
chunk_tag = 'A'
source_path = f'./data/GermanQuAD_raw_test/GermanQuAD_raw_test_{chunk_tag}.json'

number_of_features = 10 # noted as the default value
number_of_samples = 3 # noted as the default value; NOTE(review): a justification for this setting is still requested in the explainer cell - confirm
bow = False # noted as not the default

In [4]:
# Load the raw GermanQuAD chunk selected in the config cell into one DataFrame.
GQuAD_chunk_data = pd.read_json(source_path)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; the row count stays the cell's final output.
display(GQuAD_chunk_data.head(3))
len(GQuAD_chunk_data)

735

In [5]:
# Create the shared instances that the helper functions below read as globals.

# spaCy pipeline loaded from the "de_core_news_sm" package; it is handed to
# get_full_explanation_for_data_point together with the explainer.
NLP = spacy.load("de_core_news_sm")

# Model wrapper built from the model name stored in the config module.
GELECTRA_MODEL = Model(config.GELECTRA_MODEL_NAME)

# LIME explainer around the model, parameterized by the config cell values.
LIME_EXPLAINER = Lime_Explanation(model = GELECTRA_MODEL,
                                  class_names = ["Start-Token","End-Token"], 
                                  num_samples = number_of_samples, # TODO: add a justification for this value
                                  num_features = number_of_features, # default value
                                  bow = bow,
                                  random_state = 0)

In [7]:
# helper functions to enhance_data_frame

def get_prediction(row):
    """Return the model's answer string for one data row.

    Args:
        row: Object exposing ``question`` and ``context`` attributes
            (a DataFrame row when used via ``DataFrame.apply(..., axis=1)``).

    Returns:
        Whatever ``GELECTRA_MODEL.get_answer_string`` returns for the first
        two elements of the prediction output (presumably the answer text -
        confirm against the model wrapper).
    """
    model_output = GELECTRA_MODEL.predict(row.question, row.context)

    # Only the answer string is returned, so no token-level lookup is done.
    return GELECTRA_MODEL.get_answer_string(model_output[0], model_output[1])

def get_lime_explanation(row):
    """Compute the LIME explanation for one data row, tolerating failures.

    Args:
        row: Object exposing ``question``, ``context`` and ``question_id``
            attributes (a DataFrame row when used via ``DataFrame.apply``).

    Returns:
        The result of ``get_full_explanation_for_data_point`` for the row, or
        an empty dict when that call raises an ``Exception``.
    """
    try:
        return get_full_explanation_for_data_point(LIME_EXPLAINER, NLP, row.question, row.context)
    except Exception:
        # Best effort: one failing row must not abort the whole run. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit through, so a long run can still be stopped; the
        # traceback is logged so the failure can be diagnosed afterwards.
        logging.exception(f'results for {row.question_id} could not be calculated')
        print(f'results for {row.question_id} could not be calculated')
        return {}
    
def enhance_data_frame(GQuAD_data):
    """Attach model predictions and LIME explanations to a data frame.

    The frame is modified in place: a ``prediction`` column is added first,
    then an ``explanation`` column, each computed row by row.

    Args:
        GQuAD_data: DataFrame whose rows provide the attributes read by
            ``get_prediction`` and ``get_lime_explanation``.

    Returns:
        The same DataFrame object, now carrying both new columns.
    """
    # Order matters: the explanation pass runs on rows that already contain
    # the prediction column.
    row_wise_steps = (
        ('prediction', get_prediction),
        ('explanation', get_lime_explanation),
    )
    for column_name, compute_for_row in row_wise_steps:
        GQuAD_data[column_name] = GQuAD_data.apply(compute_for_row, axis=1)

    return GQuAD_data

In [8]:
# Create the target directory, named after the LIME settings of this run.
dir_name = f'data/German_QuAD_test_with_lime/lime_ns_{number_of_samples}_nf_{number_of_features}'
# makedirs also creates a missing parent directory, and exist_ok=True makes
# re-running the cell a no-op without a separate (racy) isdir check.
os.makedirs(dir_name, exist_ok=True)

In [9]:
# Configure the root logger to append this run's progress to a log file.
# NOTE: basicConfig does nothing when the root logger already has handlers,
# e.g. when this cell is re-run in the same kernel with another chunk_tag.
logging.basicConfig(filename=f'{dir_name}/{chunk_tag}_0_log.log', # file name
                    filemode='a', # file mode a == append instead of overwrite
                    level=logging.DEBUG,
                    # datefmt only takes effect when the format contains
                    # %(asctime)s; the default format has no timestamp at all
                    format='%(asctime)s %(levelname)s:%(name)s:%(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S'
                   )

In [10]:
# Split the chunk into sub-chunks for easier handling (results are backed up
# to disk after each sub-chunk).
number_of_sub_chunks = 40
# array_split hands back slices of the original frame; copying each one gives
# enhance_data_frame an independent frame to add columns to, which avoids
# pandas' SettingWithCopyWarning.
list_of_dfs = [sub_chunk.copy() for sub_chunk in np.array_split(GQuAD_chunk_data, number_of_sub_chunks)]
len(list_of_dfs)

40

In [None]:
# Iterate through the sub-chunks: compute the results, save them to a file
# and log the progress.
for i, current_df_chunck in enumerate(list_of_dfs):
    try:
        # array_split yields empty frames when there are fewer rows than
        # sub-chunks; index[0] would raise on those, so skip them explicitly.
        if len(current_df_chunck) == 0:
            logging.warning(f'Sub_chunk {i + 1} of {len(list_of_dfs)} is empty and was skipped.')
            continue

        first_index = current_df_chunck.index[0]
        last_index = current_df_chunck.index[-1]

        logging.info(f'Starting on new sub_chunk. The first index is: {first_index}')

        # add the prediction and explanation columns to the sub-chunk
        df_sub_chunck_explained = enhance_data_frame(current_df_chunck)

        # save it under a file name that carries the sub-chunk counter
        df_sub_chunck_explained.to_json(f'{dir_name}/{chunk_tag}_{i + 1}_von_{len(list_of_dfs)}_GermanQuAD_explained_lime.json')

        # log the index of the last row of the df
        logging.info(f'One sub_chunk has been finished and safed. The last index is: {last_index}')

    except Exception:
        # Carry on with the remaining sub-chunks, but record which one failed
        # together with the full traceback.
        logging.critical(f'Sub_chunk {i + 1} of {len(list_of_dfs)} failed.', exc_info=True)

results for 36651 could not be calculated


In [None]:
# Preview the first sub-chunk instead of rendering the whole frame.
list_of_dfs[0].head()