In [None]:
import numpy as np

import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

# You can only call make_env() once, so don't lose it!
import riiideducation
#env = riiideducation.make_env()

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# INSPIRED BY

- [Kostiantyn Isaienkov's EDA Notebook](https://www.kaggle.com/isaienkov/riiid-answer-correctness-prediction-eda-modeling/)
- [Ilia Larchenko's Simple EDA and Baseline Notebook](https://www.kaggle.com/ilialar/simple-eda-and-baseline)


### Training data is in the competition dataset as usual

It's larger than will fit in memory with default settings, so we'll specify more efficient datatypes and only load a subset of the data for now.

In [None]:
# Reading files
#
# All competition files are read from one directory so that the cell does not
# depend on the notebook's current working directory.
DATA_DIR = '/kaggle/input/riiid-test-answer-prediction'

# train.csv is only partially loaded here (first TRAIN_ROWS rows) to keep the
# memory footprint manageable.
TRAIN_ROWS = 10**7

# questions.csv: compact dtypes for the columns that are kept.
used_data_types_dict = {
    'question_id': 'int16',
    'bundle_id': 'int16',
    'correct_answer': 'int8',
    'part': 'int8',
    'tags': 'str',
}

questions = pd.read_csv(f'{DATA_DIR}/questions.csv',
                        usecols=list(used_data_types_dict),
                        dtype=used_data_types_dict)

# lectures.csv
lect = pd.read_csv(f'{DATA_DIR}/lectures.csv')

# example_test.csv, indexed by row_id
et = pd.read_csv(f'{DATA_DIR}/example_test.csv', index_col='row_id')

# train.csv: explicit dtypes; 'boolean' is the nullable pandas dtype, so
# missing prior_question_had_explanation values stay missing instead of
# forcing the column to object.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train_df = pd.read_csv(f'{DATA_DIR}/train.csv', low_memory=False,
                       nrows=TRAIN_ROWS, dtype=train_dtypes)
print(train_df.shape)

In [None]:
#CHECKLIST
#1: Explore variables
    #TRAIN.CSV 
    #QUESTIONS.CSV
    #LECTURES.CSV

#2: Explore creating new features from variables combinations



# Data Exploration Stuff

### train.csv

* timestamp: done
* user_id: done
* content_id: done
* content_type_id: done
* task_container_id: done
* user_answer: done
* answered_correctly: done
* prior_question_elapsed_time: done
* prior_question_had_explanation: done

In [None]:
# Number of missing values in each train.csv column.
train_df.isna().sum()

In [None]:
# Split the rows by outcome; `correct` / `incorrect` are reused by the
# histogram cells further down.
answered = train_df['answered_correctly']
correct = train_df[answered == 1]
incorrect = train_df[answered == 0]

# Shares are relative to ALL loaded rows (train_df), not only to the rows
# that ended up in one of the two subsets.
n_total = len(train_df)
n_correct = len(correct)
n_incorrect = len(incorrect)

print ("Correct: %i (%.1f%%)"%(n_correct, float(n_correct)/n_total*100.0))
print ("Incorrect: %i (%.1f%%)"%(n_incorrect, float(n_incorrect)/n_total*100.0))
print ("Total: %i"%n_total)

### timestamp

In [None]:
# Summary statistics of the timestamp column.
train_df['timestamp'].describe()

In [None]:
# Overlay the timestamp histograms of correct and incorrect answers on one
# axis, with the x-range fixed before drawing.
ax = plt.gca()
ax.set_xlim(0, 50000000000)
for outcome_label, outcome_rows in (('correct', correct), ("incorrect", incorrect)):
    sns.distplot(a=outcome_rows['timestamp'], label=outcome_label, kde=False, ax=ax)
ax.set_title("TimeStamp: Correct vs Incorrect")
ax.legend()

### prior_question_elapsed_time

In [None]:
# Overlay the prior_question_elapsed_time histograms of correct and incorrect
# answers on one axis, with the x-range fixed before drawing.
ax = plt.gca()
ax.set_xlim(0, 150000)
for outcome_label, outcome_rows in (('correct', correct), ("incorrect", incorrect)):
    sns.distplot(a=outcome_rows['prior_question_elapsed_time'], label=outcome_label,
                 kde=False, ax=ax)
ax.set_title("prior_question_elapsed_time: Correct vs Incorrect")
ax.legend()

### prior_question_had_explanation

In [None]:
# Mean of answered_correctly per prior_question_had_explanation value;
# dropna=False keeps the rows where the flag is missing as their own group.
explanation_cols = ['prior_question_had_explanation', 'answered_correctly']
(
    train_df[explanation_cols]
    .groupby(['prior_question_had_explanation'], as_index=False, dropna=False)
    .mean()
)

Ideas?

* for the first question of a test is the prior_question_had_explanation always false, or is it filled with a null value?

* why do prior_question_elapsed_time and prior_question_had_explanation not have the same number of null values?

### Task Container ID

In [None]:
# Number of distinct task_container_id values in the loaded rows.
train_df['task_container_id'].nunique()


In [None]:
# The 30 most frequent task_container_ids: counts are sorted ascending so the
# last 30 entries are the largest and the biggest bar ends up on top.
top_task_containers = train_df['task_container_id'].value_counts(ascending=True).iloc[-30:]
top_task_containers.plot(kind='barh', figsize=(10,6), title='Top 30: task_container_ids')

In [None]:
# Rows per task_container_id. value_counts() orders by frequency, so the
# result is re-sorted by id; otherwise the line is drawn through x-positions
# in count order and doubles back on itself.
task_container_counts = train_df['task_container_id'].value_counts().sort_index()
task_container_counts.plot(kind='line', figsize=(10,6))
plt.xlabel('task_container_id')
plt.ylabel('counts')

### user_answer

In [None]:
# Share of rows for each user_answer value.
train_df['user_answer'].value_counts(normalize=True)

In [None]:
# Mean of answered_correctly for each user_answer value.
answer_cols = ['user_answer', 'answered_correctly']
train_df[answer_cols].groupby(['user_answer'], as_index=False).agg('mean')

### User ID

In [None]:
# Number of rows per user, most active users first.
train_df.user_id.value_counts()

In [None]:
#essentially makes a dataset with the value_counts of the user_id column
#need to figure out how to effectively use this.
ds = train_df['user_id'].value_counts().reset_index()
ds.columns = ['user_id', 'count']
ds = ds.sort_values('count')

In [None]:
# The 30 users with the most rows (ascending sort, keep the tail).
busiest_users = train_df['user_id'].value_counts(ascending=True).iloc[-30:]
busiest_users.plot(kind='barh', figsize=(10,6), title='Top 30: user_ids')

In [None]:
# The 30 users with the fewest rows (descending sort, keep the tail).
least_active_users = train_df['user_id'].value_counts().iloc[-30:]
least_active_users.plot(kind='barh', figsize=(10,6), title='Bottom 30: user_ids')

In [None]:
# Rows per user, plotted against user_id.
# The counts are computed once and x/y are taken from the SAME series, so
# every point is (user_id, number of rows of that user). Passing the raw
# train_df['user_id'] column as x (indexed by row number) together with the
# value counts as y (indexed by user_id) does not pair the two per user.
rows_per_user = train_df['user_id'].value_counts().sort_index()

plt.figure(figsize=(14,6))
sns.lineplot(x=rows_per_user.index.to_numpy(), y=rows_per_user.to_numpy())
plt.xlabel("user_id")
plt.ylabel("count")
plt.title("number of rows per user_id")

We can see from the above that the row count per user doesn't seem to change between low and high user_ids.

### content_id

In [None]:
# Number of distinct content_id values in the loaded rows.
train_df['content_id'].nunique()

In [None]:
# Two-column table (content_id, count), most frequent content first.
# `ds` is reused by the plotting cells that follow.
ds = (
    train_df['content_id']
    .value_counts()
    .rename_axis('content_id')
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

ds.head()

In [None]:
# Horizontal bars for the 30 most frequent content_ids, largest first.
top_contents = ds.iloc[:30]
bar_order = top_contents.sort_values('count', ascending=False)['content_id']

plt.figure(figsize=(10,6))
plt.title('Top 30: content_ids')
sns.barplot(x=top_contents['count'], y=top_contents['content_id'], orient='h',
            order=bar_order)

In [None]:
# Horizontal bars for the 30 least frequent content_ids, smallest first.
rare_contents = ds.iloc[-30:]
bar_order = rare_contents.sort_values('count')['content_id']

plt.figure(figsize=(10,6))
plt.title('Bottom 30: content_ids')
sns.barplot(x=rare_contents['count'], y=rare_contents['content_id'], orient='h',
            order=bar_order)

In [None]:
# Rows per content_id. x is passed explicitly: with data=ds['count'] alone the
# x-axis is the frame's index (the position in the count ranking), which does
# not match the 'content_id' axis label.
sns.lineplot(x=ds['content_id'], y=ds['count'])
plt.xlabel('content_id')
plt.ylabel('count')

### content_type_id

In [None]:
# Row count per user_answer value (compared with content_type_id below).
train_df['user_answer'].value_counts()

In [None]:
# Row count per content_type_id value.
train_df['content_type_id'].value_counts()

* 0 if the event was a question being posed to the user
* 1 if the event was the user watching a lecture

Note: the number of rows with content_type_id = 1 is the same as the number of rows with user_answer = -1.

# questions.csv

question_id - foreign key for the content_id column in train/test data

In [None]:
# Load questions.csv for exploration, indexed by question_id.
que = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
    .set_index('question_id')
)
que

In [None]:
# Number of missing values in each questions.csv column.
que.isna().sum()

### correct_answer

In [None]:
# Share of questions per correct_answer value (normalize=True, so the
# 'number_of_answers' column holds fractions rather than raw counts).
ds = (
    que['correct_answer']
    .value_counts(normalize=True)
    .rename_axis('correct_answer')
    .reset_index(name='number_of_answers')
    .sort_values(['number_of_answers'], ascending=False)
)

ds

### bundle_id

In [None]:
# Number of questions in each bundle, largest bundles first.
que['bundle_id'].value_counts()

In [None]:
# Distribution of bundle sizes: first count the questions per bundle, then
# take the share of bundles having each size.
questions_per_bundle = que['bundle_id'].value_counts()
size_shares = questions_per_bundle.value_counts(normalize=True)

# Built as a DataFrame so it is displayed as a table rather than a Series.
ds = pd.DataFrame({
    '#_in_bundle': size_shares.index,
    'percentage_of_questions': size_shares.values,
})
ds

In [None]:
# Show all questions that belong to bundle 7795.
que[que['bundle_id'].eq(7795)]

Note: Maybe bundles with more than one question are connected? I.e. if you get the first question wrong, you might still be able to solve the remaining questions?


### part

In [None]:
# Share of questions per part, largest share first.
ds = (
    que['part']
    .value_counts(normalize=True)
    .rename_axis('parts')
    .reset_index(name='percentage_of_questions')
    .sort_values(['percentage_of_questions'], ascending=False)
)
ds

### tags

some feature eng here for sure

In [None]:
#seeing how many tags most of the questions have 
ds = que['tags'].str.split().str.len().value_counts(normalize=True)
ds = pd.DataFrame({'#_of_tags':ds.index, 'percentage_of_questions':ds.values})
ds['#_of_tags'] = ds['#_of_tags'].astype(int)

ds

In [None]:
#seeing which tags occur most frequently
ds = que['tags'].str.split(' ').explode('tags').reset_index()
ds = ds['tags'].value_counts().reset_index()
ds.columns = ['tag_number', 'count']

ds

In [None]:
# Horizontal bars for the 30 most frequent tags, largest first.
top_tags = ds.iloc[:30]
bar_order = top_tags.sort_values('count', ascending=False)['tag_number']

plt.figure(figsize=(10,6))
plt.title('Top 30: tags')
sns.barplot(x=top_tags['count'], y=top_tags['tag_number'], orient='h',
            order=bar_order)

In [None]:
# Horizontal bars for the 30 least frequent tags, ordered by descending count.
rare_tags = ds.iloc[-30:]
bar_order = rare_tags.sort_values('count', ascending=False)['tag_number']

plt.figure(figsize=(10,6))
plt.title('Bottom 30: tags')
sns.barplot(x=rare_tags['count'], y=rare_tags['tag_number'], orient='h',
            order=bar_order)

In [None]:
# Count per tag, plotted against the tag id.
# The tag ids come out of str.split() as strings; they are converted to
# numbers so the x-axis is numeric and ordered by tag id instead of being a
# categorical axis with one label per tag.
# NOTE(review): assumes every tag is an integer code - confirm against the data.
tag_ids = pd.to_numeric(ds['tag_number'])

plt.figure(figsize=(14,6))
sns.lineplot(x=tag_ids, y=ds['count'])

# lectures.csv

In [None]:
# Load lectures.csv for exploration.
lectures_path = '../input/riiid-test-answer-prediction/lectures.csv'
lect = pd.read_csv(lectures_path)
lect

In [None]:
# Number of missing values in each lectures.csv column.
lect.isna().sum()

### Part

In [None]:
# Number of lectures per part.
ds = lect['part'].value_counts()
ds

In [None]:
# Share of lectures per part, as a two-column table.
part_shares = lect['part'].value_counts(normalize=True)
ds = pd.DataFrame({
    'part_numb': part_shares.index,
    'percentage_of_questions': part_shares.values,
})
ds

### type_of

In [None]:
# Number of lectures per type_of value.
ds = lect['type_of'].value_counts()
ds

In [None]:
# Bar chart of the number of lectures per type_of value.
ds = lect['type_of'].value_counts()
ds.plot.bar()

In [None]:
# Share of 'solving question' lectures per part.
is_solving_question = lect['type_of'] == 'solving question'
ds = lect[is_solving_question]
ds['part'].value_counts(normalize=True).plot.bar()

In [None]:
# Share of 'concept' lectures per part.
is_concept = lect['type_of'] == 'concept'
ds = lect[is_concept]
ds['part'].value_counts(normalize=True).plot.bar()

### Tag

In [None]:
# The first value_counts() gives the number of lectures per tag; the second
# counts how many tags share each of those lecture counts. The bars therefore
# show "number of tags" for each "lectures per tag" value, and the labels are
# written to describe exactly that.
lectures_per_tag = lect['tag'].value_counts()
lectures_per_tag.value_counts().to_frame().plot(kind='bar')
plt.xlabel('lectures per tag')
plt.ylabel('number of tags')
plt.title('Number of tags by how many lectures carry them')

# example_test.csv

In [None]:
# Load example_test.csv for exploration, indexed by row_id.
example_test_path = '../input/riiid-test-answer-prediction/example_test.csv'
et = pd.read_csv(example_test_path, index_col='row_id')
et

# Some Feature Engineering

In [None]:
# Keep only rows with a real outcome; answered_correctly == -1 is dropped
# (presumably the lecture rows - see the content_type_id notes above).
has_outcome = train_df['answered_correctly'].ne(-1)
train_questions_only_df = train_df.loc[has_outcome]
train_questions_only_df.head(1)

In [None]:
# Attach the question's `part` to every answered row.
# `questions` is loaded with a default positional index, so the lookup is
# keyed explicitly by question_id; joining on the positional index is only
# right while row position and question_id happen to coincide.
part_by_question = questions.set_index('question_id')['part']

train_questions_only_df = (
    train_questions_only_df
    # Drop a 'part' column left by a previous run so that re-running the cell
    # does not produce part_x / part_y.
    .drop(columns='part', errors='ignore')
    .merge(part_by_question, left_on='content_id', right_index=True, how='left')
)
train_questions_only_df.head(1)

In [None]:
#removes rows that are lectures and .groupby the user_id
grouped_by_user_df = train_questions_only_df.groupby('user_id')

#getting the mean accuracy, question count of each user and other math stuff
user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}).copy()
user_answers_df.columns = [
    'mean_user_accuracy', 
    'questions_answered', 
    'std_user_accuracy', 
    'median_user_accuracy', 
    'skew_user_accuracy'
]

user_answers_df

In [None]:
#grouping by content_id
grouped_by_content_df = train_questions_only_df.groupby('content_id')

#getting mean count and other stuff for each content_id
content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}).copy()
content_answers_df.columns = [
    'mean_accuracy', 
    'question_asked', 
    'std_accuracy', 
    'median_accuracy', 
    'skew_accuracy'
]

content_answers_df

In [None]:
# Group the answered rows by the question's part.
grouped_by_part_df = train_questions_only_df.groupby('part')

# Per-part statistics of answered_correctly: mean accuracy, number of
# answered questions, standard deviation and skewness.
part_stat_names = {
    'mean': 'part_mean_accuracy',
    'count': 'part_questions_answered',
    'std': 'part_std_user_accuracy',
    'skew': 'part_skew_user_accuracy',
}
part_answers_df = grouped_by_part_df['answered_correctly'].agg(list(part_stat_names))
part_answers_df.columns = list(part_stat_names.values())

part_answers_df

In [None]:
# Columns kept for modelling, in the order they appear in the model frame.
#   *_user_accuracy / questions_answered -> per-user stats (user_answers_df)
#   *_accuracy / question_asked          -> per-content stats (content_answers_df)
#   part_*                               -> per-part stats (part_answers_df)
#   prior_question_* and part            -> taken as-is from train.csv / questions.csv
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    'skew_accuracy',
    'part',
    'part_mean_accuracy', 
    'part_questions_answered', 
    'part_std_user_accuracy',  
    'part_skew_user_accuracy'
]
# Name of the label column.
target = 'answered_correctly'

### Feature Selection

In [None]:
# Build the modelling frame: answered rows only, enriched with the part of
# the question and the per-user / per-content / per-part statistics.
# NOTE: this cell overwrites train_df, so it must be run exactly once after
# the loading cell.
train_df = train_df[train_df[target] != -1]

# `questions` has a default positional index, so the part lookup is keyed
# explicitly by question_id instead of relying on row position.
part_by_question = questions.set_index('question_id')['part']
train_df = train_df.merge(part_by_question, left_on='content_id',
                          right_index=True, how='left')

train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')
train_df = train_df.merge(part_answers_df, how='left', left_on='part', right_index=True)

# A missing explanation flag is treated as "no explanation".
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
)

# Keep the model columns, then replace infinities and every remaining missing
# value with the neutral 0.5 in one pass (a separate fillna before the column
# selection would give the same result and is therefore not needed).
train_df = train_df[features + [target]]
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0.5)

train_df

# Maybe creating a loop to see if user has seen question before

This is an idea I had: a loop that checks whether the user has seen the question before. It takes far too long to run, so I might have to try a different method than row-by-row pandas to speed up the process.



In [None]:
from scipy.sparse import csc_matrix

In [None]:
# Initial (empty) state for seen_question_before():
#   seen_before_df_keys - known user ids (none yet)
#   seen_before_df      - one int8 counter row per known user, 13523 columns
#   clean_row           - all-zero template row appended for each new user
seen_before_df_keys = np.zeros((0, 0), dtype=np.int8)
seen_before_df = np.zeros((0, 13523), dtype=np.int8)
clean_row = np.zeros((1, 13523), dtype=np.int8)

In [None]:
def seen_question_before(dataset, seen_before_df_keys, seen_before_df, clean_row):
    """Add a ``seen_q_before`` column: how often the user already got the question.

    For every row the new column holds the number of earlier occurrences of the
    same (user_id, content_id) pair - earlier calls (stored in
    ``seen_before_df``) plus earlier rows of ``dataset`` itself. The state is
    then updated so the next call continues counting.

    The work is vectorised: new users are registered in one step and the
    counts come from a grouped cumulative count instead of a per-row scan of
    the key array.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs ``user_id`` and ``content_id`` columns. If a ``content_type_id``
        column is present, only rows where it equals 0 (questions) are counted;
        other rows get 0 and are not recorded, because the columns of
        ``seen_before_df`` are question ids. The frame is modified in place.
    seen_before_df_keys : numpy.ndarray
        User ids already known; position ``i`` belongs to row ``i`` of
        ``seen_before_df``.
    seen_before_df : numpy.ndarray
        2-D counter table, one row per known user, one column per content id.
    clean_row : numpy.ndarray
        Template of shape ``(1, n_columns)`` appended once per new user.

    Returns
    -------
    tuple
        ``(dataset, seen_before_df_keys, seen_before_df)`` with the updated
        state. The arrays are new objects when users were added; otherwise
        ``seen_before_df`` is updated in place.

    Notes
    -----
    Counters saturate at the maximum of the table's integer dtype instead of
    wrapping around to negative values.
    """
    seen_column = np.zeros(len(dataset), dtype=np.int64)

    # Only question rows can be looked up in the per-question counter table.
    if 'content_type_id' in dataset.columns:
        is_question = dataset['content_type_id'].to_numpy() == 0
    else:
        is_question = np.ones(len(dataset), dtype=bool)

    if not is_question.any():
        dataset['seen_q_before'] = seen_column
        return (dataset, seen_before_df_keys, seen_before_df)

    user_ids = dataset['user_id'].to_numpy()[is_question]
    content_ids = dataset['content_id'].to_numpy()[is_question].astype(np.intp)

    # Register unseen users, in order of first appearance, with a single
    # append instead of copying the whole table once per new user.
    known_users = np.asarray(seen_before_df_keys).ravel()
    batch_users = pd.unique(user_ids)
    new_users = batch_users[~np.isin(batch_users, known_users)]
    if len(new_users):
        seen_before_df_keys = np.append(seen_before_df_keys, new_users)
        seen_before_df = np.append(
            seen_before_df, np.repeat(clean_row, len(new_users), axis=0), axis=0)

    # user id -> row of the counter table (first match wins, like np.where()[0][0]).
    row_of_user = {}
    for position, user in enumerate(np.asarray(seen_before_df_keys).ravel().tolist()):
        row_of_user.setdefault(user, position)
    rows = np.fromiter((row_of_user[user] for user in user_ids.tolist()),
                       dtype=np.intp, count=len(user_ids))

    # Earlier occurrences = count stored before this call + number of earlier
    # rows in this batch with the same (user, question) pair.
    pairs = pd.DataFrame({'row': rows, 'col': content_ids})
    earlier_in_batch = pairs.groupby(['row', 'col']).cumcount().to_numpy()
    before_batch = seen_before_df[rows, content_ids].astype(np.int64)
    seen = before_batch + earlier_in_batch
    updated = seen + 1

    # Saturate rather than overflow the (int8) counter dtype.
    if np.issubdtype(seen_before_df.dtype, np.integer):
        cap = np.iinfo(seen_before_df.dtype).max
        seen = np.minimum(seen, cap)
        updated = np.minimum(updated, cap)

    # With repeated index pairs NumPy keeps the last assigned value, which is
    # the largest (= final) count of each pair because `updated` grows with
    # row order inside a pair.
    seen_before_df[rows, content_ids] = updated

    seen_column[is_question] = seen
    dataset['seen_q_before'] = seen_column

    return (dataset, seen_before_df_keys, seen_before_df)
        
 

In [None]:
# Run the counter over the example test rows and keep the updated state.
seen_state = seen_question_before(et, seen_before_df_keys, seen_before_df, clean_row)
et, seen_before_df_keys, seen_before_df = seen_state