In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import math

# Location of the training interactions file and the number of leading rows
# used for this experiment.
TRAIN_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/train.csv'
N_TRAIN_ROWS = 110000

# Explicit dtype for every column read from train.csv.
types = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

data = pd.read_csv(TRAIN_CSV_PATH, dtype=types, nrows=N_TRAIN_ROWS)
# Group each user's rows together. A stable sort is required here: the default
# sort algorithm may reorder rows that share a user_id, and the per-user
# head/tail split performed later depends on the original row order.
data = data.sort_values(by=['user_id'], kind='mergesort')

In [None]:
# Read train.csv again with the same dtype mapping but without a row limit.
# NOTE(review): `history` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before paying for the full read.
history = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', dtype = types)

In [None]:
# Start with two distinct, empty frames; the split cell fills them in.
train, test = pd.DataFrame(), pd.DataFrame()

In [None]:
# Distinct user ids present in `data`; `mylist` is kept as an alias of the
# very same list object.
unique_user_ids = set(data['user_id'].values)
user_ids = list(unique_user_ids)
mylist = user_ids

In [None]:
# Per-user split: the first 90% of each user's rows (ordered by row_id) go to
# `train`, the remainder to `test`.
# NOTE(review): this treats row_id order as the chronological order of a
# user's interactions — confirm against the dataset description.
TRAIN_FRACTION = 0.9

# Collect the per-user slices in lists and concatenate once. Growing a frame
# with pd.concat inside the loop is quadratic, and because the old code
# appended to the existing `train`/`test`, re-running the cell duplicated rows.
train_parts = []
test_parts = []
for _, user_rows in data.groupby('user_id', sort=False):
    user_rows = user_rows.sort_values(by='row_id')
    train_len = math.floor(TRAIN_FRACTION * len(user_rows))
    train_parts.append(user_rows.iloc[:train_len])
    test_parts.append(user_rows.iloc[train_len:])

if train_parts:
    # Both frames are ordered by row_id so their row order is deterministic.
    train = pd.concat(train_parts).sort_values(by=['row_id'])
    test = pd.concat(test_parts).sort_values(by=['row_id'])
else:
    # No users at all: keep the column layout so downstream filters still work.
    train = data.iloc[0:0].copy()
    test = data.iloc[0:0].copy()

In [None]:
def minmax(df):
    """Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Works on a Series (scalar min/max) or a DataFrame (per-column min/max).
    When a column is constant its range is zero; instead of dividing by zero
    and turning every value into NaN, such a column is mapped to 0.

    Parameters:
        df: pandas Series or DataFrame of numeric values.

    Returns:
        An object of the same shape holding the rescaled values.
    """
    lowest = df.min()
    span = df.max() - lowest
    if isinstance(span, (pd.Series, pd.DataFrame)):
        # Per-column ranges: neutralise only the zero ones, keep NaN as NaN.
        span = span.mask(span == 0, 1)
    elif not pd.isna(span) and span == 0:
        span = 1
    return (df - lowest) / span

In [None]:
def normalize_struct(struct):
    """Min-max scale the count-like feature columns and replace NaN with 0.

    The accuracy columns and `prior_question_had_explanation` are left as
    they are; only the columns listed below are rescaled with `minmax`.

    Parameters:
        struct: DataFrame containing (at least) the columns listed in
            `scaled_columns`.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    scaled_columns = (
        'all_users_task_container_id_count',
        'this_user_task_container_id_count',
        'all_users_content_id_count',
        'this_user_question_count',
        'this_user_lectures_count',
        'this_user_lectures_count_task_container_id',
        'part',
        'most_difficult_tag_count',
        'prior_question_elapsed_time',
    )
    # Work on a copy so that calling this function twice on the same input
    # does not normalise already-normalised values in the caller's frame.
    normalized = struct.copy()
    for column in scaled_columns:
        normalized[column] = minmax(normalized[column])
    return normalized.fillna(0)

In [None]:
def _mean_and_count_by(frame, keys):
    """Return ({key: mean}, {key: count}) of `answered_correctly` grouped by `keys`."""
    if frame.empty:
        return {}, {}
    grouped = frame.groupby(keys)['answered_correctly'].agg(['mean', 'count'])
    return grouped['mean'].to_dict(), grouped['count'].to_dict()


def _row_count_by(frame, keys):
    """Return {key: number of rows} for `frame` grouped by `keys`."""
    if frame.empty:
        return {}
    return frame.groupby(keys).size().to_dict()


def get_data(data, history=None, get_ans=True,
             questions_path='/kaggle/input/riiid-test-answer-prediction/questions.csv',
             verbose=False):
    """Build one feature record per question row of `data`.

    All statistics (accuracies and counts) are computed from `history`; the
    rows being described come from `data`. Lecture rows
    (`content_type_id == 1`) never produce a record of their own but are
    counted for the `this_user_lectures_*` features.

    NOTE: when `history` is None, `data` itself is used as the history, so
    each row's own `answered_correctly` contributes to its features.

    Parameters:
        data: DataFrame with the train.csv columns; rows to describe.
        history: DataFrame with the same columns used for the statistics.
            Defaults to `data`.
        get_ans: when true, also collect `answered_correctly` of every row.
        questions_path: path to questions.csv.
        verbose: when true, print the index of every processed row.

    Returns:
        (struct, struct_correct): `struct` is a list of feature dicts in the
        order of the question rows in `data`; `struct_correct` is the list of
        matching `answered_correctly` values (empty when `get_ans` is false).
        Features that cannot be computed (no history for the key, question
        id absent from questions.csv, question without tags) are NaN, or 0
        for counts.
    """
    if history is None:
        history = data

    nan = float('nan')

    # History split into question rows and lecture rows.
    history_no_lectures = history[history.content_type_id == 0]
    history_only_lectures = history[history.content_type_id == 1]

    # Only question rows of the input produce feature records.
    no_lectures = data[data.content_type_id == 0]

    # Extend the questions table with accuracy, number of answers and
    # number of correct answers observed in the history.
    questions_types = {
        'question_id': 'int16',
        'bundle_id': 'int16',
        'correct_answer': 'int8',
        'part': 'int8',
        'tags': 'string'
    }
    questions = pd.read_csv(questions_path, dtype=questions_types)
    questions['tags'] = [raw.split() if isinstance(raw, str) else raw
                         for raw in questions['tags'].values]
    question_stats = (history_no_lectures
                      .groupby('content_id')['answered_correctly']
                      .agg(['mean', 'count', 'sum'])
                      .reset_index())
    questions = questions.merge(question_stats, left_on='question_id',
                                right_on='content_id', how='left')
    questions = questions.fillna({'mean': 0.0, 'count': 0, 'sum': 0})
    questions = questions.rename(columns={'mean': 'accuracy',
                                          'count': 'count_all',
                                          'sum': 'count_correct'})

    # Per-tag totals over every question carrying the tag.
    questions_with_tags = questions[questions.tags.notna()]
    all_tags = {tag for tag_list in questions_with_tags.tags.values for tag in tag_list}
    tag_records = []
    for tag in sorted(all_tags):
        tagged = questions_with_tags[questions_with_tags.tags.apply(lambda l: tag in l)]
        tag_records.append({
            'tag': tag,
            'count_all': tagged['count_all'].sum(),
            'count_correct': tagged['count_correct'].sum(),
            'amount_questions_with_tag': len(tagged),
        })
    # Build the frame once; DataFrame.append is gone in current pandas.
    tags_df = pd.DataFrame(
        tag_records,
        columns=['tag', 'count_all', 'count_correct', 'amount_questions_with_tag'],
    ).set_index('tag')
    tags_df['accuracy'] = tags_df['count_correct'] / tags_df['count_all']
    # Ascending accuracy: the first matching tag is the most difficult one.
    tags_df = tags_df.sort_values(by='accuracy', kind='mergesort')

    tag_rank = {tag: rank for rank, tag in enumerate(tags_df.index)}
    tag_accuracy = tags_df['accuracy'].to_dict()
    tag_count_all = tags_df['count_all'].to_dict()

    # Question-level lookups (first occurrence wins for a repeated id).
    unique_questions = questions.drop_duplicates(subset='question_id')
    part_by_question = dict(zip(unique_questions['question_id'], unique_questions['part']))
    tags_by_question = dict(zip(unique_questions['question_id'], unique_questions['tags']))
    part_totals = questions.groupby('part')[['count_all', 'count_correct']].sum()
    part_accuracy = (part_totals['count_correct'] / part_totals['count_all']).to_dict()

    # History statistics, computed once instead of re-filtering per row.
    container_mean, container_count = _mean_and_count_by(history_no_lectures, 'task_container_id')
    content_mean, content_count = _mean_and_count_by(history_no_lectures, 'content_id')

    # Per-user statistics are only needed for users that occur in `data`.
    relevant_users = no_lectures['user_id'].unique()
    user_questions = history_no_lectures[history_no_lectures['user_id'].isin(relevant_users)]
    user_lectures = history_only_lectures[history_only_lectures['user_id'].isin(relevant_users)]
    user_mean, user_count = _mean_and_count_by(user_questions, 'user_id')
    user_container_mean, user_container_count = _mean_and_count_by(
        user_questions, ['user_id', 'task_container_id'])
    # Lecture counts come from the lecture rows (the previous version counted
    # question rows here).
    user_lecture_count = _row_count_by(user_lectures, 'user_id')
    user_container_lecture_count = _row_count_by(user_lectures, ['user_id', 'task_container_id'])

    struct = []  # feature records describing the input question rows
    struct_correct = []  # whether the user answered the matching row correctly
    # ^ both lists have the same length and share indices (when get_ans is true)

    for index, row in no_lectures.iterrows():
        user_id = row['user_id']
        container_id = row['task_container_id']
        content_id = row['content_id']
        user_container = (user_id, container_id)

        # Missing values count as "no explanation".
        explained = row['prior_question_had_explanation']
        prior_question_had_explanation = 0 if pd.isna(explained) else int(bool(explained))

        part = part_by_question.get(content_id, nan)

        # Most difficult tag = the question's tag that ranks first in tags_df.
        question_tags = tags_by_question.get(content_id)
        if isinstance(question_tags, list) and question_tags:
            hardest_tag = min(question_tags, key=lambda tag: tag_rank[tag])
            most_difficult_tag_accuracy = tag_accuracy[hardest_tag]
            most_difficult_tag_count = tag_count_all[hardest_tag]
        else:
            most_difficult_tag_accuracy = nan
            most_difficult_tag_count = nan

        struct.append({
            'prior_question_had_explanation': prior_question_had_explanation,
            'all_users_task_container_id_accuracy': container_mean.get(container_id, nan),
            'all_users_task_container_id_count': container_count.get(container_id, 0),
            'this_user_task_container_id_accuracy': user_container_mean.get(user_container, nan),
            'this_user_task_container_id_count': user_container_count.get(user_container, 0),
            'all_users_content_id_accuracy': content_mean.get(content_id, nan),
            'all_users_content_id_count': content_count.get(content_id, 0),
            'this_user_accuracy': user_mean.get(user_id, nan),
            'this_user_question_count': user_count.get(user_id, 0),
            'this_user_lectures_count': user_lecture_count.get(user_id, 0),
            'this_user_lectures_count_task_container_id': user_container_lecture_count.get(user_container, 0),
            'part': part,
            'all_users_part_accuracy': part_accuracy.get(part, nan),
            'most_difficult_tag_accuracy': most_difficult_tag_accuracy,
            'most_difficult_tag_count': most_difficult_tag_count,
            'prior_question_elapsed_time': row['prior_question_elapsed_time']
        })
        if verbose:
            print(index)
        if get_ans:
            struct_correct.append(row['answered_correctly'])
    return struct, struct_correct

In [None]:
# Engineer the training features, using the training rows as their own history.
train_struct_raw, train_struct_correct_raw = get_data(train)
train_struct_correct = np.array(train_struct_correct_raw)

# Missing values become 0 *before* scaling, so they take part in min/max.
train_struct = normalize_struct(pd.DataFrame(train_struct_raw).fillna(0))

# Append the label column and persist the finished frame.
train_struct['answered_correctly'] = train_struct_correct
train_struct.to_csv('train_struct_final')

In [None]:
# Engineer the test features; statistics come from the training rows only.
test_struct_raw, test_struct_correct_raw = get_data(test, train)
test_struct_correct = np.array(test_struct_correct_raw)

# Missing values become 0 *before* scaling, so they take part in min/max.
# NOTE(review): normalize_struct scales with the min/max of the frame it is
# given, i.e. the test frame here rather than the training frame — confirm
# this is intended.
test_struct = normalize_struct(pd.DataFrame(test_struct_raw).fillna(0))

# Append the label column and persist the finished frame.
test_struct['answered_correctly'] = test_struct_correct
test_struct.to_csv('test_struct')

In [None]:
# Three sigmoid hidden layers (16, 10 and 10 units) followed by a single
# output unit without an activation.
hidden_layers = [tf.keras.layers.Dense(units, activation='sigmoid') for units in (16, 10, 10)]
model = tf.keras.Sequential(hidden_layers + [tf.keras.layers.Dense(1)])

In [None]:
# Train with RMSprop on a mean-squared-error loss and report accuracy.
model.compile(
    loss='mean_squared_error',
    optimizer=tf.keras.optimizers.RMSprop(),
    metrics=['accuracy'],
)

In [None]:
# Training schedule: several rounds, each fitting for a couple of epochs on a
# fresh random sample of the engineered training rows.
N_ROUNDS = 9
SAMPLE_SIZE = 10000
EPOCHS_PER_ROUND = 2

for _ in range(N_ROUNDS):
    # sample() raises when asked for more rows than exist, so cap the size.
    train_x = train_struct.sample(min(SAMPLE_SIZE, len(train_struct)))
    train_model_values_correct = train_x['answered_correctly'].values
    train_model_values = train_x.drop(columns=['answered_correctly']).values
    model.fit(train_model_values, train_model_values_correct, epochs=EPOCHS_PER_ROUND)

In [None]:
# model.fit(train_struct.values, train_struct_correct, epochs=10)

In [None]:
# Show the engineered test feature frame as the cell output.
test_struct

In [None]:
# Separate the feature matrix from the labels, then score the test rows.
feature_frame = test_struct.drop(columns=['answered_correctly'])
test_model_values = feature_frame.values
test_model_values_correct = test_struct['answered_correctly'].values

# Wrap the raw model output in a frame; column 0 holds the scores.
predictions = pd.DataFrame(model.predict(test_model_values))

In [None]:
# Count how often each label value occurs among the test rows.
test_struct['answered_correctly'].value_counts()

In [None]:
# The network regresses onto 0/1 labels with a mean-squared-error loss, so the
# natural cut-off between the two classes is the midpoint 0.5. Cutting at 0
# marks practically every score as a positive.
DECISION_THRESHOLD = 0.5
binary_pred = [1 if score > DECISION_THRESHOLD else 0 for score in predictions[0].values]

In [None]:
# pred_norm = minmax(predictions[0])

In [None]:
# binary_pred_norm = [1 if i > 0.15 else 0 for i in pred_norm.values]

In [None]:
# Confusion-matrix tallies for the binary predictions against the test labels.
am_correct = 0
tp = tn = fp = fn = 0
for predicted, actual in zip(binary_pred, test_model_values_correct):
    is_hit = predicted == actual
    if predicted == 0:
        # Predicted negative: correct -> true negative, wrong -> false negative.
        if is_hit:
            tn += 1
        else:
            fn += 1
    else:
        # Predicted positive: correct -> true positive, wrong -> false positive.
        if is_hit:
            tp += 1
        else:
            fp += 1
            
               


In [None]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Specificity is the true-negative rate and sensitivity the true-positive
# rate; the two formulas were previously assigned to the wrong names.
spec = _ratio(tn, tn + fp)
sens = _ratio(tp, tp + fn)
# Precision: share of predicted positives that are correct.
prec = _ratio(tp, tp + fp)
# Share of predicted positives that are wrong (equals 1 - prec).
nprec = _ratio(fp, tp + fp)

In [None]:
# Print each metric as a label line followed by its value.
for label, value in (('Spec', spec), ('Sens', sens), ('Prec', prec), ('NPrec', nprec)):
    print(label)
    print(value)

In [None]:
# Write the trained model to 'my_model.h5' (relative to the working directory).
model.save('my_model.h5')
