In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import riiideducation

In [None]:
# Load the saved Keras model from the `ver6-0` input dataset; `do_predict_df`
# calls `model.predict` on the feature rows produced by `get_data`.
model = tf.keras.models.load_model('/kaggle/input/ver6-0/my_model')

In [None]:
# Interaction log that `do_predict_df` passes to `get_data` as `history`; every
# aggregate feature for a test batch is computed from this frame.
history = pd.read_csv('/kaggle/input/ver6-0/train.csv')

In [None]:
# Create the competition environment and its test iterator. The submission loop
# at the end of the notebook unpacks each item of `iter_test` as
# (test_df, sample_prediction_df) and answers it through `env.predict`.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
def _question_table(history_questions):
    """Load questions.csv and attach per-question answer statistics.

    Adds ``accuracy``, ``count_all`` and ``count_correct`` (mean, count and sum
    of ``answered_correctly`` in ``history_questions`` per ``content_id``).
    Questions that never occur in the history get 0 for all three. ``tags``
    becomes a list of tag strings, or stays missing when the CSV cell is empty.
    """
    questions_types = {
        'question_id': 'int16',
        'bundle_id': 'int16',
        'correct_answer': 'int8',
        'part': 'int8',
        'tags': 'string',
    }
    questions = pd.read_csv(
        '/kaggle/input/riiid-test-answer-prediction/questions.csv',
        dtype=questions_types,
    )
    # Explicit object dtype so pandas stores one Python list per cell.
    questions['tags'] = pd.Series(
        [tags.split() if isinstance(tags, str) else tags for tags in questions['tags'].values],
        index=questions.index,
        dtype=object,
    )

    content_stats = (
        history_questions.groupby('content_id')['answered_correctly']
        .agg(['mean', 'count', 'sum'])
        .rename(columns={'mean': 'accuracy', 'count': 'count_all', 'sum': 'count_correct'})
    )
    questions = questions.join(content_stats, on='question_id')
    return questions.fillna({'accuracy': 0.0, 'count_all': 0, 'count_correct': 0})


def _tag_table(questions):
    """Aggregate question statistics per tag, hardest (lowest accuracy) tag first.

    The result is indexed by tag and has the columns ``count_all``,
    ``count_correct``, ``amount_questions_with_tag`` and ``accuracy``. A tag
    listed several times on one question is counted once for that question.
    """
    tagged = questions[questions['tags'].notna()]
    totals = {}
    for tags, n_all, n_correct in zip(tagged['tags'], tagged['count_all'], tagged['count_correct']):
        for tag in set(tags):
            entry = totals.setdefault(tag, [0.0, 0.0, 0])
            entry[0] += n_all
            entry[1] += n_correct
            entry[2] += 1

    # Build the frame in one go (DataFrame.append no longer exists in pandas 2).
    # Sorted names plus a stable sort make the order of equally accurate tags
    # deterministic instead of depending on set iteration order.
    names = sorted(totals)
    tags_df = pd.DataFrame(
        {
            'count_all': [totals[name][0] for name in names],
            'count_correct': [totals[name][1] for name in names],
            'amount_questions_with_tag': [totals[name][2] for name in names],
        },
        index=pd.Index(names, name='tag'),
        dtype=float,
    )
    tags_df['accuracy'] = tags_df['count_correct'] / tags_df['count_all']
    return tags_df.sort_values(by='accuracy', kind='mergesort')


def _answer_stats(frame, keys, wanted):
    """Summarise ``answered_correctly`` per key, only for keys present in ``wanted``.

    Returns a dict mapping each key (a scalar, or a tuple when ``keys`` is a
    list of columns) to ``(mean, non-null count as float, number of rows)``.
    Restricting to the keys of ``wanted`` keeps the grouping small when
    ``frame`` is a large history and ``wanted`` is a small batch.
    """
    relevant = frame
    for key in ([keys] if isinstance(keys, str) else keys):
        relevant = relevant[relevant[key].isin(wanted[key].unique())]
    grouped = relevant.groupby(keys)['answered_correctly']
    stats = grouped.agg(['mean', 'count'])
    stats['rows'] = grouped.size()
    return {
        key: (mean, float(count), int(rows))
        for key, mean, count, rows in zip(stats.index, stats['mean'], stats['count'], stats['rows'])
    }


def _answer_count_sets(history_questions, wanted):
    """Map content_id -> set of per-answer row counts seen in the history."""
    relevant = history_questions[
        history_questions['content_id'].isin(wanted['content_id'].unique())
    ]
    sizes = relevant.groupby(['content_id', 'user_answer']).size()
    count_sets = {}
    for (content_id, _answer), n_rows in sizes.items():
        count_sets.setdefault(content_id, set()).add(n_rows)
    return count_sets


def get_data(data, history=None, get_ans=True):
    """Build one feature dict per question row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to featurise. Rows with ``content_type_id != 0`` are skipped.
        The columns read are ``user_id``, ``content_id``, ``content_type_id``,
        ``task_container_id``, ``timestamp``, ``prior_question_had_explanation``,
        ``prior_question_elapsed_time`` and, when ``get_ans`` is true,
        ``answered_correctly``.
    history : pandas.DataFrame, optional
        Interactions the aggregate features are computed from; must contain
        ``answered_correctly`` and ``user_answer``. Defaults to ``data``
        itself, in which case every row's own answer contributes to its own
        features.
    get_ans : bool, default True
        When true, also collect ``answered_correctly`` of every featurised row.

    Returns
    -------
    (list of dict, list)
        Feature dicts in the row order of ``data`` and the matching labels
        (an empty list when ``get_ans`` is false). Both lists have the same
        length when labels are collected.

    Raises
    ------
    IndexError
        If a ``content_id`` of ``data`` is not present in questions.csv.

    Notes
    -----
    Aggregates over an empty selection yield NaN for means and 0 for counts.
    A question without tags gets NaN / 0 for the two tag features.

    NOTE(review): two features are reproduced exactly as originally coded even
    though they look unintended, because the saved model that consumes this
    output was presumably trained on these values. Fix them together with
    retraining:
      * ``this_user_lectures_count`` and
        ``this_user_lectures_count_task_container_id`` count the user's
        *question* rows in the history; lecture rows are never consulted.
      * ``all_users_answers_sd`` is the standard deviation of ``[A, B, C, D]``
        where the k-th entry is k if some answer choice was given exactly
        k times for this question, else 0. It is not the spread of the counts
        per answer choice.
    """
    if history is None:
        history = data

    # Lectures are excluded from both the rows to featurise and the history.
    history_questions = history[history['content_type_id'] == 0]
    data_questions = data[data['content_type_id'] == 0]
    if data_questions.empty:
        return [], []

    # Question table extended with accuracy, attempt count and correct count.
    questions = _question_table(history_questions)
    question_info = {}
    for question_id, part, tags in zip(questions['question_id'], questions['part'], questions['tags']):
        question_info.setdefault(question_id, (part, tags))

    # Accuracy over all questions sharing a part; it only depends on the part.
    part_totals = questions.groupby('part')[['count_all', 'count_correct']].sum()
    part_accuracy = (part_totals['count_correct'] / part_totals['count_all']).to_dict()

    # Tag statistics; position in the sorted table doubles as difficulty rank.
    tags_df = _tag_table(questions)
    tag_rank = {tag: position for position, tag in enumerate(tags_df.index)}
    tag_accuracy = tags_df['accuracy'].tolist()
    tag_count = tags_df['count_all'].tolist()

    # History aggregates, computed once per call instead of once per row.
    by_container = _answer_stats(history_questions, 'task_container_id', data_questions)
    by_content = _answer_stats(history_questions, 'content_id', data_questions)
    by_user = _answer_stats(history_questions, 'user_id', data_questions)
    by_container_user = _answer_stats(
        history_questions, ['task_container_id', 'user_id'], data_questions
    )
    answer_counts = _answer_count_sets(history_questions, data_questions)

    nan = float('nan')
    no_stats = (nan, 0.0, 0)

    struct = []          # statistical features of the input rows (questions)
    struct_correct = []  # whether the user answered each question correctly
    # ^ both lists have the same length; equal indices refer to the same row

    for _, row in data_questions.iterrows():
        user_id = row['user_id']
        content_id = row['content_id']
        container_id = row['task_container_id']

        explained = row['prior_question_had_explanation']
        prior_question_had_explanation = (
            int(explained) if isinstance(explained, (bool, np.bool_)) else 0
        )

        container_mean, container_count, _ = by_container.get(container_id, no_stats)
        own_container_mean, own_container_count, own_container_rows = by_container_user.get(
            (container_id, user_id), no_stats
        )
        content_mean, content_count, _ = by_content.get(content_id, no_stats)
        user_mean, user_count, user_rows = by_user.get(user_id, no_stats)

        if content_id not in question_info:
            raise IndexError('content_id %r not found in questions.csv' % (content_id,))
        part, question_tags = question_info[content_id]

        # Hardest tag of this question = the one ranked first in tags_df.
        if isinstance(question_tags, list):
            ranks = [tag_rank[tag] for tag in question_tags if tag in tag_rank]
        else:
            ranks = []  # question has no tags
        if ranks:
            hardest = min(ranks)
            most_difficult_tag_accuracy = tag_accuracy[hardest]
            most_difficult_tag_count = tag_count[hardest]
        else:
            most_difficult_tag_accuracy = nan
            most_difficult_tag_count = 0

        # See NOTE(review) in the docstring: kept as originally coded.
        seen_counts = answer_counts.get(content_id, ())
        all_users_answers_sd = np.std([k if k in seen_counts else 0 for k in (1, 2, 3, 4)])

        struct.append({
            'prior_question_had_explanation': prior_question_had_explanation,
            'all_users_task_container_id_accuracy': container_mean,
            'all_users_task_container_id_count': container_count,
            'this_user_task_container_id_accuracy': own_container_mean,
            'this_user_task_container_id_count': own_container_count,
            'all_users_content_id_accuracy': content_mean,
            'all_users_content_id_count': content_count,
            'this_user_accuracy': user_mean,
            'this_user_question_count': user_count,
            # See NOTE(review) in the docstring: these are question-row counts.
            'this_user_lectures_count': user_rows,
            'this_user_lectures_count_task_container_id': own_container_rows,
            'part': part,
            'all_users_part_accuracy': part_accuracy.get(part, nan),
            'most_difficult_tag_accuracy': most_difficult_tag_accuracy,
            'most_difficult_tag_count': most_difficult_tag_count,
            'prior_question_elapsed_time': row['prior_question_elapsed_time'],
            'timestamp': row['timestamp'],
            'all_users_answers_sd': all_users_answers_sd
        })

        if get_ans:
            struct_correct.append(row['answered_correctly'])

    return struct, struct_correct

In [None]:
def do_predict_df(test_df, sample_prediction_df):
    """Fill ``sample_prediction_df`` with model predictions for one test batch.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Batch of interactions; its question rows are featurised by ``get_data``
        against the notebook-level ``history`` frame.
    sample_prediction_df : pandas.DataFrame
        Submission template for the batch. Returned untouched when empty.
        Its index is expected to be named ``group_num`` (it is reset and then
        restored under that name).

    Returns
    -------
    pandas.DataFrame
        The template, indexed by ``group_num``, with ``answered_correctly``
        replaced by the first model output of each row.

    Raises
    ------
    ValueError
        If the number of feature rows differs from the number of template
        rows. Predictions are matched to template rows purely by position, so
        a mismatch would otherwise misalign or silently drop predictions.
    """
    if sample_prediction_df.empty:
        return sample_prediction_df

    feature_rows = get_data(test_df, history, False)[0]
    if len(feature_rows) != len(sample_prediction_df):
        raise ValueError(
            'got %d feature rows for %d rows to predict'
            % (len(feature_rows), len(sample_prediction_df))
        )

    # Missing aggregates (e.g. unseen users) come back as NaN; feed them as 0.
    features = pd.DataFrame(feature_rows).fillna(0)
    predictions = np.asarray(model.predict(features.values))

    submission = sample_prediction_df.reset_index()
    # One vectorised assignment of the first output column, as float64.
    submission['answered_correctly'] = (
        predictions.reshape(len(predictions), -1)[:, 0].astype(float)
    )
    return submission.set_index('group_num')

# Submission loop: for every (test_df, sample_prediction_df) pair yielded by
# `iter_test`, build the prediction frame and hand it to `env.predict`.
# `t` is the running batch index; it is not used inside the loop.
for t, (test_df, sample_prediction_df) in enumerate(iter_test):
    predict_df = do_predict_df(test_df, sample_prediction_df)
    env.predict(predict_df)