In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# installing datatable
!pip install datatable

# Feature Engineering

In [None]:
# Load the question metadata with explicit column dtypes, then replace every
# missing value with the string '-1' so no NaN remains in the frame.
questions_dtypes = {
    'question_id': 'int64',
    'bundle_id': 'int64',
    'correct_answer': 'object',
    'part': 'int64',
}
questions = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    dtype=questions_dtypes,
).fillna('-1')

import nltk, re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

def tokenize_and_stem(text):
    """Split ``text`` into word tokens and keep only those containing a digit.

    The text is first split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Despite the name, no
    stemming is applied: tokens are returned unchanged, in their original order.

    Returns:
        list of str: the tokens that contain at least one character in 0-9.
    """
    return [
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if re.search('[0-9]', word)
    ]

from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                 min_df=0.0,use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(questions.tags) #fit the vectorizer to synopses
print(tfidf_matrix.shape)
tags = tfidf_vectorizer.get_feature_names()
dist = 1 - cosine_similarity(tfidf_matrix)

NUM_CLUSTERS = 7


km = KMeans(n_clusters=NUM_CLUSTERS)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

# Show the highest-weighted terms of each cluster centroid.
n_words = 5

# Column indices of each centroid sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for cluster_id, ranked_terms in enumerate(order_centroids[:NUM_CLUSTERS]):
    top_terms = ''.join(' %s ' % tags[term_idx] for term_idx in ranked_terms[:n_words])
    print("Cluster %d words:" % cluster_id + top_terms)
    
# Attach the k-means cluster id of each question and persist the enriched table.
questions = questions.assign(kmean_cluster=clusters)

questions.to_csv('./questions_fe.csv', index=False)

In [None]:
# Load the lecture metadata table.
# NOTE(review): `lectures` is not referenced in the cells visible here.
lectures_csv = '../input/riiid-test-answer-prediction/lectures.csv'
lectures = pd.read_csv(lectures_csv)

In [None]:
import datatable as dt
# Read the full training log with datatable's fread.
train_csv = "../input/riiid-test-answer-prediction/train.csv"
df = dt.fread(train_csv)

In [None]:
# Align the join column name with the questions table.
df.names = {"content_id": "question_id"}

# Rebind `questions` to a datatable Frame holding only the columns needed for
# the join, keyed on question_id (dt.join requires a keyed right-hand frame).
question_cols = ['question_id', 'bundle_id', 'part', 'kmean_cluster']
questions = dt.Frame(questions[question_cols])
questions.key = "question_id"

# Keep only rows with content_type_id == 0 (presumably question interactions
# rather than lectures — confirm against the dataset description), then attach
# bundle_id / part / kmean_cluster to each remaining row.
df = df[dt.f.content_type_id == 0, :][:, :, dt.join(questions)]
df

In [None]:
from datatable import (dt, f, by, ifelse, update, sort,
                      count, min, max, mean, sum, rowsum,sd, unique)
# Number of distinct user ids in the filtered training frame; used as the
# denominator of the *_perc_students features.
user_ids = df[:, dt.f.user_id]
num_users = unique(user_ids).nrows

In [None]:
# Per-bundle statistics: mean / std of answered_correctly and the number of
# answered rows.
bundle_agg = df[:, {
    'mean': mean(dt.f.answered_correctly),
    'std': sd(dt.f.answered_correctly),
    'count': count(dt.f.row_id)},
                by('bundle_id')].to_pandas()

# Distinct users per bundle. count(user_id) only counts rows (it would just be
# `count` again), so first collapse to one row per (bundle_id, user_id) pair and
# then count the pairs in each bundle.
# NOTE: the intermediate pair frame can be large; it is a temporary and is
# released as soon as the expression finishes.
bundle_users = df[:, count(), by('bundle_id', 'user_id')][
    :, {'nunique': count()}, by('bundle_id')].to_pandas()

bundle_stats = bundle_agg.merge(bundle_users, on='bundle_id')
# Fraction of all users that answered at least one question of the bundle.
bundle_stats['nunique'] = bundle_stats['nunique'] / num_users
bundle_stats = bundle_stats[['bundle_id', 'mean', 'std', 'nunique', 'count']]
bundle_stats.columns = ['bundle_id','bundle_mean_answered_correctly','bundle_std_answered_correctly','bundle_perc_students','bundle_times']
bundle_stats.to_csv('./bundle_stats.csv', index = False)

In [None]:
# Run an explicit garbage-collection pass before the next aggregations.
import gc 
gc.collect()

In [None]:
# Per-part statistics: mean / std of answered_correctly and the number of
# answered rows.
part_agg = df[:, {
    'mean': mean(dt.f.answered_correctly),
    'std': sd(dt.f.answered_correctly),
    'count': count(dt.f.row_id)},
              by('part')].to_pandas()

# Distinct users per part. count(user_id) only counts rows (it would just be
# `count` again), so first collapse to one row per (part, user_id) pair and
# then count the pairs in each part.
part_users = df[:, count(), by('part', 'user_id')][
    :, {'nunique': count()}, by('part')].to_pandas()

part_stats = part_agg.merge(part_users, on='part')
# Fraction of all users that answered at least one question of the part.
part_stats['nunique'] = part_stats['nunique'] / num_users
part_stats = part_stats[['part', 'mean', 'std', 'nunique', 'count']]
part_stats.columns = ['part','part_mean_answered_correctly','part_std_answered_correctly','part_perc_students','part_times']
part_stats.to_csv('./part_stats.csv', index = False)

In [None]:
# Per-cluster statistics: mean / std of answered_correctly and the number of
# answered rows.
cluster_agg = df[:, {
    'mean': mean(dt.f.answered_correctly),
    'std': sd(dt.f.answered_correctly),
    'count': count(dt.f.row_id)},
                 by('kmean_cluster')].to_pandas()

# Distinct users per cluster. count(user_id) only counts rows (it would just be
# `count` again), so first collapse to one row per (kmean_cluster, user_id)
# pair and then count the pairs in each cluster.
cluster_users = df[:, count(), by('kmean_cluster', 'user_id')][
    :, {'nunique': count()}, by('kmean_cluster')].to_pandas()

cluster_stats = cluster_agg.merge(cluster_users, on='kmean_cluster')
# Fraction of all users that answered at least one question of the cluster.
cluster_stats['nunique'] = cluster_stats['nunique'] / num_users
cluster_stats = cluster_stats[['kmean_cluster', 'mean', 'std', 'nunique', 'count']]
cluster_stats.columns = ['kmean_cluster','cluster_mean_answered_correctly','cluster_std_answered_correctly','cluster_perc_students','cluster_times']
cluster_stats.to_csv('./cluster_stats.csv', index = False)

In [None]:
# Columns written to the per-question feature file. The join keys themselves
# (bundle_id, part, kmean_cluster) are deliberately left out.
question_stats_columns  = ['question_id',
 'bundle_mean_answered_correctly',
 'bundle_std_answered_correctly',
 'bundle_perc_students',
 'bundle_times',
 'part_mean_answered_correctly',
 'part_std_answered_correctly',
 'part_perc_students',
 'part_times',
 'cluster_mean_answered_correctly',
 'cluster_std_answered_correctly',
 'cluster_perc_students',
 'cluster_times']

# `questions` is a keyed datatable Frame at this point. datatable documents that
# to_pandas() turns key columns into the pandas index; if that happened,
# question_id would be dropped by the merges below, so restore it as a column.
questions_pd = questions.to_pandas()
if 'question_id' not in questions_pd.columns:
    questions_pd = questions_pd.reset_index()

# Attach the bundle-, part- and cluster-level statistics to every question.
question_stats = (
    questions_pd
    .merge(bundle_stats, on='bundle_id')
    .merge(part_stats, on='part')
    .merge(cluster_stats, on='kmean_cluster')
)
question_stats = question_stats[question_stats_columns]
question_stats.to_csv('./questions_stats_fe.csv', index = False)



In [None]:
# Drop the three intermediate stats frames; they have already been written to
# CSV and merged into question_stats.
del bundle_stats, cluster_stats, part_stats