In [None]:
%%time

pip install pyspark -q

In [None]:
from pyspark.sql import SparkSession, DataFrame, Window, functions as F, types as T
from pyspark.ml.feature import Bucketizer
import pandas as pd

In [None]:
# Session handle shared by every cell below; getOrCreate() attaches to a
# session that already exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName('_')
    .getOrCreate()
)

In [None]:
def train_to_tmp(csv_path = '../input/riiid-test-answer-prediction/train.csv',
                 out_path = 'tmp'):
    """Read the training CSV with an explicit schema and rewrite it as parquet.

    Relies on the module-level ``spark`` session.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read. It is read with ``header = True``.
    out_path : str
        Parquet output directory. Anything already there is replaced
        (``mode = 'overwrite'``).

    Returns
    -------
    None
        The only effect is the parquet dataset written to ``out_path``.
    """

    # (column name, Spark type) in the order the fields are declared in the schema.
    columns = [
        ('row_id',                          T.LongType()),
        ('timestamp',                       T.LongType()),
        ('user_id',                         T.IntegerType()),
        ('content_id',                      T.ShortType()),
        ('content_type_id',                 T.ByteType()),
        ('task_container_id',               T.ShortType()),
        ('user_answer',                     T.ByteType()),
        ('answered_correctly',              T.ByteType()),
        ('prior_question_elapsed_time',     T.FloatType()),
        ('prior_question_had_explanation',  T.BooleanType()),
    ]

    # Every field is declared non-nullable.
    # NOTE(review): no null handling is applied anywhere in this function;
    # confirm whether the prior_question_* columns can be empty in the CSV.
    train_schema = T.StructType([
        T.StructField(name, dtype, False) for name, dtype in columns
    ])

    # enforceSchema = True applies the schema as given instead of validating
    # it against the names in the CSV header.
    raw = spark.read.csv(csv_path,
                         schema = train_schema,
                         enforceSchema = True,
                         header = True)

    # Store the boolean flag as a byte, like the other small integer columns.
    train = raw.withColumn(
        'prior_question_had_explanation',
        F.col('prior_question_had_explanation').astype(T.ByteType()),
    )

    train.write.parquet(out_path, mode = 'overwrite')

In [None]:
%%time

train_to_tmp()
tmp = spark.read.parquet('./tmp')

In [None]:
%%time

tmp\
.select(['user_id','timestamp','task_container_id']).distinct()\
.withColumn('new_order', 
            F.row_number().over(
                Window.partitionBy('user_id').orderBy('timestamp')
            )-1)\
.write.parquet('utc', mode = 'overwrite')
utc = spark.read.parquet('./utc')

In [None]:
%%time

tmp\
.join(utc,on=['user_id','timestamp','task_container_id'], how = 'inner')\
.write.parquet('tmp_new', mode = 'overwrite')

In [None]:
!rm -rf  ./tmp
!rm -rf  ./utc

In [None]:
%%time

df = spark.read.parquet('./tmp_new')

user_map = df\
.select('user_id')\
.distinct()\
.withColumn('r', F.floor(F.rand(0)*10))\
.write.parquet('user_map', mode = 'overwrite')

user_map = spark.read.parquet('./user_map')

In [None]:
%%time

for i in range(10):

    imap = user_map\
    .filter(F.col('r')==i)\
    .select('user_id')

    idf = df.join(imap, on = 'user_id', how = 'inner')

    idf.repartition(10).write.parquet('train_'+str(i), mode = 'overwrite')
    
    _ = pd.read_parquet('./train_'+str(i))
    !rm -rf  ./train_*
    _.to_parquet('df_'+str(i))

In [None]:
!rm -rf  ./tmp_new
!rm -rf  ./user_map