# load packages 

In [None]:
pip install seaborn==0.11.0

In [None]:
#import packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import riiideducation

In [None]:
pip list

# load data and prepare environment

In [None]:
train = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',nrows=10000)

In [None]:
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

In [None]:
questions = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv")

# Data information copied from the Riiid! Case competition site at https://www.kaggle.com/c/riiid-test-answer-prediction/data

### train Data

<b>row_id: </b> (int64) ID code for the row.

<b>timestamp: </b>(int64) the time in milliseconds between this user interaction and the first event completion from that user.

<b>user_id: </b> (int32) ID code for the user.

<b> content_id:  </b> (int16) ID code for the user interaction

<b>content_type_id:</b>(int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

<b>task_container_id:</b> (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

<b>user_answer: </b>(int8) the user's answer to the question, if any. Read -1 as null, for lectures.

<b>answered_correctly: </b> (int8) if the user responded correctly. Read -1 as null, for lectures.

<b>prior_question_elapsed_time:  </b>(float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

<b>prior_question_had_explanation: </b> (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.


### Questions : 

<b> questions.csv:</b> metadata for the questions posed to users.

<b> question_id: </b>foreign key for the train/test content_id column, when the content type is question (0).

<b> bundle_id:</b> code for which questions are served together.

<b> correct_answer:</b> the answer to the question. Can be compared with the train user_answer column to check if the user was right.

<b> part:</b> the relevant section of the TOEIC test.

<b> tags:</b> one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

### Lectures: 

<b>lecture_id: </b>foreign key for the train/test content_id column, when the content type is lecture (1).

<b>part:</b> top level category code for the lecture.

<b>tag:</b> one tag code for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.

<b>type_of:</b> brief description of the core purpose of the lecture

### Example test rows:

<b>prior_group_responses: </b> (string) provides all of the user_answer entries for previous group in a string representation of a list in the first row of the group. All other rows in each group are null. If you are using Python, you will likely want to call eval on the non-null rows. Some rows may be null, or empty lists.

<b>prior_group_answers_correct: </b> (string) provides all the answered_correctly field for previous group, with the same format and caveats as prior_group_responses. Some rows may be null, or empty lists.

# Preliminary data exploration:

For this preliminary data exploration we'll be looking at the distributions of the columns.

### train Data

In [None]:
train.info()

In [None]:
train.nunique()

In [None]:
print(train.shape)
print(len(train['row_id'].unique()))

<b>timestamp: </b>(int64) the time in milliseconds between this user interaction and the first event completion from that user.

In [None]:
# We'll bin the timestamps into 20 equal-width buckets covering (0, max].
# 20 buckets need 21 edges; the last edge is the observed maximum itself so
# that the largest timestamps are not left without a bucket.
print(train['timestamp'].min())
print(train['timestamp'].max())
n_buckets = 20
timestamp_max = train['timestamp'].max()
step = timestamp_max / n_buckets
bins_list = [step * i for i in range(n_buckets)] + [timestamp_max]

In [None]:
# Label every interaction with its timestamp bucket. A timestamp of exactly
# 0 ms falls outside the first bucket and is therefore NA, which the string
# cast turns into the text "nan".
train['binned_timestamp'] = pd.cut(train['timestamp'], bins_list).astype(str)
# Order the interactions chronologically (relative to each user's first event).
train = train.sort_values(by='timestamp')

<b>content_type_id:</b>(int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

<b> content_id:  </b> (int16) ID code for the user interaction

In [None]:
# Non-null counts of every column per content type
# (0 = question, 1 = lecture, per the column description above).
train.groupby('content_type_id').count() #very few lectures

<b>task_container_id:</b> (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id. This ID will make more sense from the questions point of view.

<b>user_answer: </b>(int8) the user's answer to the question, if any. Read -1 as null, for lectures.


In [None]:
# Non-null counts of every column per chosen answer option
# (-1 marks lecture rows, per the column description above).
train.groupby("user_answer").count()

<b> comment </b> : from looking at this plot we can see that the correct answer is usually evenly distributed among the numerical options. However, students tend to choose option 2 the least.

<b>prior_question_elapsed_time:  </b>(float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

In [None]:
# Number of rows versus number of distinct elapsed-time values
# (a missing value counts as one distinct value).
print(train["prior_question_elapsed_time"].shape)
print(train['prior_question_elapsed_time'].nunique(dropna=False))

In [None]:
# We'll bin the prior-question elapsed times into 20 equal-width buckets
# covering (0, max]. 20 buckets need 21 edges; the last edge is the observed
# maximum itself so that the slowest answers are not left without a bucket.
print(train['prior_question_elapsed_time'].min())
print(train['prior_question_elapsed_time'].max())
n_buckets = 20
elapsed_time_max = train['prior_question_elapsed_time'].max()
step = elapsed_time_max / n_buckets
bins_list = [step * i for i in range(n_buckets)] + [elapsed_time_max]

In [None]:
# Label every interaction with its elapsed-time bucket. An elapsed time of
# exactly 0 ms (or a missing one) gets no bucket, i.e. NA, which the string
# cast turns into the text "nan".
train['binned_prior_question_elapsed_time'] = pd.cut(
    train['prior_question_elapsed_time'], bins_list
).astype(str)
# Order the rows by how long the previous question bundle took.
train = train.sort_values(by='prior_question_elapsed_time')

<b>prior_question_had_explanation: </b> (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [None]:
# Numeric copy of the explanation flag; it is renamed to a "fea_" column
# later in the notebook and fed to the classifiers.
train["float_prior_question_had_explanation"] = train["prior_question_had_explanation"].astype(float)

<b> comment </b> : It seems that when the prior question has an explanation students tend not to get the question right

-----

### Questions : 

<b> questions.csv:</b> metadata for the questions posed to users.

<b> question_id: </b>foreign key for the train/test content_id column, when the content type is question (0).

<b> bundle_id:</b> code for which questions are served together.


In [None]:
# Table size, then the number of distinct bundles and distinct questions.
print(questions.shape)
print(questions['bundle_id'].nunique(dropna=False))
print(questions['question_id'].nunique(dropna=False))

In [None]:
df_bundle = questions.groupby('bundle_id').count().reset_index()[['bundle_id', "part"]].rename(columns={"part":"count_in_bundle"})

In [None]:
print(df_bundle.shape)

<b> comment </b> : the majority of the questions are not bundled

In [None]:
# Attach the bundle size to every question; the left join keeps all questions.
df_questions = pd.merge(questions, df_bundle, on="bundle_id", how="left")

<b> correct_answer:</b> When we look at the number of asked questions and see how many fall into the category of "count_in_bundle" we see an increase in category 3, 4 and 5 (as we would have expected). 

<b> comment:</b> Fewer of the correct answers are labeled as option 2.

<b> part:</b> the relevant section of the TOEIC test.

<b> tags:</b> one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [None]:
# There are many tags and many tag combinations, so one-hot encoding them
# would create a very large number of features. Instead we count the tags
# per question, under the hypothesis that the more tags a question has, the
# more sources the student can extrapolate the answer from.
print(df_questions['tags'].shape)
print(df_questions['tags'].nunique(dropna=False))
# Tags are stored as a single space-separated string.
tag_lists = df_questions['tags'].str.split(" ")
df_questions['number_tags'] = tag_lists.str.len()


### Lectures: 

<b>lecture_id: </b>foreign key for the train/test content_id column, when the content type is lecture (1).

<b>part:</b> top level category code for the lecture.

<b>tag:</b> one tag code for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.

<b>type_of:</b> brief description of the core purpose of the lecture

In [None]:
# Number of distinct lecture tags and lecture types.
print(lectures['tag'].nunique(dropna=False))
print(lectures['type_of'].nunique(dropna=False))

### Joining the three datasets
For now, I'll abstain from making features related to previous lectures because I don't know how to identify whether the student saw the lecture before or after answering the question.

In [None]:
train = train.rename(columns = {"content_id": "question_id"})

In [None]:
df_joined = train[train["content_type_id"]==0].merge(df_questions, on = ['question_id'], how = 'left')

# Feature engineering

feature 1: fea timestamp

feature 2: fea prior question elapsed time

feature 3: fea prior question had explanation

feature 4: fea part (question)

feature 5: fea_num_questions_bundle

feature 6: number of tags in question

In [None]:
# Give every model input a "fea" prefix; the later cells select the feature
# columns by looking for that marker in the column name.
df_joined = df_joined.rename(
    {
        "timestamp": "fea_time_until_first_event_completion",
        'prior_question_elapsed_time': 'fea_prior_question_elapsed_time',
        'float_prior_question_had_explanation': 'fea_prior_question_had_explanation',
        'part': 'fea_question_part',
        'count_in_bundle': "fea_num_questions_bundle",
        'number_tags': 'fea_number_of_tags_in_question',
    },
    axis="columns",
)

In [None]:
df_master_table = df_joined[['row_id', 'answered_correctly']+[c for c in df_joined.columns if "fea" in c]].dropna()

In [None]:
df_x = np.array(df_master_table[[c for c in df_joined.columns if "fea" in c]])
df_y = np.array(df_master_table[['answered_correctly']])

# Model Training Pipeline 

With the intention of keeping our notebook simple, we'll explore only two classification algorithms:
    - Forest of Randomized Trees 
    - Gradient Boosting Classifier

In [None]:
# Candidate classifiers, keyed by the label shown in the comparison report.
# Both are seeded (random_state=5) so repeated runs give the same scores.
Dictionary_of_algorithms = dict(
    GBM=GradientBoostingClassifier(random_state=5),
    random_forest=RandomForestClassifier(random_state=5),
)

Before doing the grid search, we'll explore which algorithm seems to fit the data best.

In [None]:
# Compare the candidates on mean 3-fold cross-validated accuracy.
# The score column is named for what it holds (accuracy, higher is better);
# rows are collected first and the frame is built once, which avoids
# concatenating onto an empty DataFrame inside the loop.
report_rows = []
for name, algo in Dictionary_of_algorithms.items():
    fold_scores = cross_val_score(algo, df_x, df_y, cv=3, scoring='accuracy')
    report_rows.append({"algorithm_name": name, "mean_accuracy": fold_scores.mean()})
report = pd.DataFrame(report_rows, columns=["algorithm_name", "mean_accuracy"])
    

In [None]:
# Display the cross-validation comparison of the candidate algorithms.
report # the GBM algorithm appears to give the higher accuracy

# Gridsearch: 

because GBM offers a higher accuracy, we'll proceed to do gridsearch on the GBM model.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42)

In [None]:
parameters = {'random_state':[5], 'learning_rate':[0.05, 0.1, 0.15], "min_samples_split":[2,10,20]}
clf = GridSearchCV(GradientBoostingClassifier(), parameters)

In [None]:
gmb_gridsearched = clf.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination the grid search selected.
# (get_params() would only echo the search object's own constructor
# arguments, not the tuned values.)
gmb_gridsearched.best_params_

In [None]:
# Side-by-side table of the held-out labels and the tuned model's predictions.
df_base = pd.DataFrame(
    {
        'Actual_y': np.ravel(y_test),
        'predicted_y': gmb_gridsearched.predict(X_test),
    }
)

In [None]:
example_test = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/example_test.csv")

In [None]:
example_test = example_test.rename(columns = {"content_id": "question_id"})
example_test = example_test[example_test["content_type_id"]==0].merge(df_questions, on = ['question_id'], how = 'left')

In [None]:
example_test["float_prior_question_had_explanation"] = example_test["prior_question_had_explanation"].astype(float)

In [None]:
example_test = example_test.rename(columns = {
    "timestamp": "fea_time_until_first_event_completion",
    'prior_question_elapsed_time' : 'fea_prior_question_elapsed_time',
    'float_prior_question_had_explanation': 'fea_prior_question_had_explanation',
    'part':'fea_question_part',
    'count_in_bundle':"fea_num_questions_bundle",
    'number_tags':'fea_number_of_tags_in_question'
})

In [None]:
df_output = example_test['row_id']

In [None]:
example_test.dtypes

In [None]:
df_output['predictions'] = gmb_gridsearched.predict(np.array(example_test[[c for c in example_test.columns if "fea" in c]].dropna()))

In [None]:
df_output.to_csv("submission.csv")