In [None]:
%%time

pip install pyspark -q

In [None]:
from pyspark.sql import SparkSession, DataFrame, Window, functions as F, types as T
#from pyspark.ml.feature import Bucketizer
# Notebook-wide Spark session: reuses a running session when there is one,
# otherwise starts a new one. '_' is only a placeholder application name.
spark = (
    SparkSession.builder
    .appName('_')
    .getOrCreate()
)

In [None]:
import numpy as np

In [None]:
%%time

def train_to_tmp(csv_path = '../input/riiid-test-answer-prediction/train.csv',
                 out_path = 'tmp'):
    """Convert the raw training CSV into a typed Parquet dataset.

    Reads ``csv_path`` with an explicit schema through the notebook-level
    ``spark`` session, casts ``prior_question_had_explanation`` to
    ``ByteType`` and writes the result to ``out_path`` as Parquet,
    overwriting whatever was there before.

    Args:
        csv_path: Location of the training CSV; its first line is treated as
            a header. Defaults to the path the notebook originally used.
        out_path: Destination directory for the Parquet output.

    Returns:
        None. The function is called for its side effect of writing Parquet.
    """
    # (column name, Spark type) pairs, listed in the column order of the CSV.
    columns = [
        ('row_id',                          T.LongType()),
        ('timestamp',                       T.LongType()),
        ('user_id',                         T.IntegerType()),
        ('content_id',                      T.ShortType()),
        ('content_type_id',                 T.ByteType()),
        ('task_container_id',               T.ShortType()),
        ('user_answer',                     T.ByteType()),
        ('answered_correctly',              T.ByteType()),
        ('prior_question_elapsed_time',     T.FloatType()),
        ('prior_question_had_explanation',  T.BooleanType()),
    ]

    # NOTE(review): every field is declared with nullable=False. Confirm how
    # the CSV reader treats that flag for columns that may hold missing
    # values; no null filling is applied anywhere in this function.
    train_schema = T.StructType(
        [T.StructField(name, dtype, False) for name, dtype in columns]
    )

    train = (
        spark.read.csv(
            csv_path,
            schema = train_schema,
            # The schema is applied as given; the header line is skipped
            # rather than checked against the field names.
            enforceSchema = True,
            header = True,
        )
        # Store the boolean flag as a byte column instead.
        .withColumn(
            'prior_question_had_explanation',
            F.col('prior_question_had_explanation').astype(T.ByteType()),
        )
    )

    train.write.parquet(out_path, mode = 'overwrite')


# Write the Parquet copy of the training data, then reopen it from disk as
# the working DataFrame for the following cells.
train_to_tmp()
tmp = spark.read.format('parquet').load('./tmp')

In [None]:
%%time

tmp\
.filter(F.col('content_type_id')==0)\
.select(['user_id','timestamp','task_container_id']).distinct()\
.withColumn('new_order', 
            F.row_number().over(
                Window.partitionBy('user_id').orderBy('timestamp')
            )-1)\
.write.parquet('utc', mode = 'overwrite')
utc = spark.read.parquet('./utc')


tmp\
.join(utc,on=['user_id','timestamp','task_container_id'], how = 'inner')\
.write.parquet('tmp_new', mode = 'overwrite')

!rm -rf  ./tmp
!rm -rf  ./utc

df = spark.read.parquet('./tmp_new')

In [None]:
%%time

# Average 'prior_question_elapsed_time' per (user_id, new_order), then move
# every value one position back (new_order - 1). After the left join below,
# a row at position k therefore carries, as 'question_elapsed_time', the
# value that was recorded on the same user's position k + 1.
uo_tm = (
    df
    .groupBy('user_id', 'new_order')
    .agg(F.mean('prior_question_elapsed_time').alias('question_elapsed_time'))
    .withColumn('new_order', F.col('new_order') - 1)
)

# Persist the enriched table and continue from the on-disk copy.
(
    df
    .join(uo_tm, on = ['user_id', 'new_order'], how = 'left')
    .write.parquet('df_new', mode = 'overwrite')
)
df = spark.read.parquet('./df_new')

In [None]:
%%time

# Mean 'question_elapsed_time' per content_id over the rows with
# content_type_id == 0, collected to the driver as a pandas DataFrame.
dfp = (
    df
    .where(F.col('content_type_id') == 0)
    .groupBy('content_id')
    .agg(F.mean('question_elapsed_time').alias('mean_question_elapsed_time'))
    .toPandas()
)

In [None]:
# Flatten the per-content means into a float32 vector ordered by content_id.
# NOTE(review): position i corresponds to content_id i only if the ids in
# `dfp` are contiguous and start at 0 - confirm before indexing by id.
qm = (
    dfp
    .sort_values('content_id')
    ['mean_question_elapsed_time']
    .to_numpy(dtype = np.float32)
)

In [None]:
# Persist the vector in NumPy's binary format; np.save appends the '.npy'
# extension because the given file name does not already end with it.
np.save(file = 'question_mean_time', arr = qm)

In [None]:
!rm -rf  ./tmp_new
!rm -rf  ./df_new