In [1]:
import os
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import numpy as np

# PYSPARK_SUBMIT_ARGS is read when PySpark launches its JVM gateway, so it has
# to be set before the first SparkContext call in this kernel.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell'

# Extra memory. The settings are placed on the SparkConf itself rather than
# pushed through SparkContext.setSystemProperty(): that call starts the JVM
# gateway first, so a driver heap size set that way arrives too late.
# NOTE(review): whether spark.driver.memory from SparkConf is honoured depends
# on the PySpark version forwarding the conf to the launcher; if the driver
# heap is still too small, add `--driver-memory 15g` to PYSPARK_SUBMIT_ARGS.
conf = (
    SparkConf()
    .setAppName("Medhere")
    .setMaster("local[*]")
    .set('spark.executor.memory', '15g')
    .set('spark.driver.memory', '15g')
)

# getOrCreate is a classmethod: calling it on the class (instead of on a freshly
# constructed SparkContext) reuses the active context, so this cell can be
# re-run without "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# Bare expression: shows the SparkContext created in the previous cell as this cell's output.
sc

## Variables

In [3]:
# Input CSV files, given as paths relative to the working directory.
meal_time_data = 'meal_time.csv'   # source for the meal-time label table
cgms_data = 'cgms.csv'             # source for the CGMS table

In [4]:
from pyspark.sql import SQLContext

# Assumes the Spark environment is already set up and `sc` is the active SparkContext.
# NOTE(review): presumably constructing the SQLContext is also what makes
# `.toDF()` available on the RDDs used in later cells -- confirm for the
# installed PySpark version.
sqlContext = SQLContext(sparkContext=sc)

In [5]:
# Build the meal-time label table from the raw CSV text.
# The text RDD is created once and reused for both the header lookup and the
# data rows, and the column names are handed straight to toDF() instead of
# renaming the default _1/_2/_3 columns one at a time afterwards.
meal_time_lines = sc.textFile(meal_time_data)
header = meal_time_lines.first()          # first line of the file = header row
new_column = ['user', 'time', 'label']

meal_time_df = (
    meal_time_lines
    .filter(lambda x: x != header)        # drop every line identical to the header
    .map(lambda x: x.split(','))          # plain comma split (no quoted-field handling)
    .map(lambda x: (x[0], x[2], 1))       # keep fields 0 and 2; every row gets label 1
    .toDF(new_column)
)

In [6]:
# Print the first three rows of the meal-time table to check the user/time/label columns.
meal_time_df.show(3)

+-------------+--------+-----+
|         user|    time|label|
+-------------+--------+-----+
|T-04_IPEN_Run|03:31:00|    1|
|T-04_IPEN_Run|05:20:00|    1|
|T-04_IPEN_Run|10:24:00|    1|
+-------------+--------+-----+
only showing top 3 rows



In [7]:
# Build the CGMS table from the raw CSV text.
cgms_lines = sc.textFile(cgms_data)
header = cgms_lines.first()               # first line of the file = header row

# Fields 0 and 1 are kept as the strings produced by split(); fields 2..15 are
# converted to float, in order. Columns keep Spark's default names _1 .. _16.
cgms_rows = (
    cgms_lines
    .filter(lambda line: line != header)
    .map(lambda line: line.split(','))
    .map(lambda f: (f[0], f[1]) + tuple(float(f[i]) for i in range(2, 16)))
)
cgms_df = cgms_rows.toDF()

In [8]:
# Print the first three CGMS rows; columns still carry the default names _1 .. _16.
cgms_df.show(3)

+-------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+-----------------+------------------+----+------------------+-----------------+
|           _1|      _2|               _3|                  _4|                 _5|                  _6|                 _7|                 _8|                  _9|               _10|               _11|              _12|               _13| _14|               _15|              _16|
+-------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+-----------------+------------------+----+------------------+-----------------+
|T-04_IPEN_Run|00:00:00|83.16726152243767|5.260906662625613E-4|0.21077864128950047|3.791610828123643...| 0.2110929868794306|0.23817284490383425| -0.374

In [9]:
# Attach the meal label to CGMS rows: match _1 against user and _2 against time.
# A left join keeps every CGMS row; rows with no matching meal-time entry get
# null in user/time/label.
# NOTE(review): if `label` is meant to be a 0/1 target, the nulls may need to be
# filled with 0 downstream -- confirm with the consumer of this table.
cond = [
    cgms_df["_1"] == meal_time_df["user"],
    cgms_df["_2"] == meal_time_df["time"],
]
cgms_meal_df = cgms_df.join(meal_time_df, on=cond, how='left')

In [10]:
# Print the first three rows of the joined table (CGMS columns followed by user/time/label).
cgms_meal_df.show(3)

+---------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+----+----+-----+
|             _1|      _2|               _3|                  _4|                 _5|                  _6|                 _7|                 _8|                  _9|               _10|               _11|              _12|               _13|               _14|               _15|               _16|user|time|label|
+---------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+----+----+-----+
|T-01_3h_PEN_Run|05:45:00|80.77944320892584|0.003673

In [11]:
# Show three joined rows that did match a meal-time entry (label equals 1).
cgms_meal_df.where(cgms_meal_df["label"] == 1).show(3)

+-------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+-------------+--------+-----+
|           _1|      _2|               _3|                  _4|                 _5|                  _6|                 _7|                 _8|                  _9|               _10|               _11|               _12|               _13|              _14|              _15|               _16|         user|    time|label|
+-------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+-------------+--------+-----+
|T-02_IPEN_Run|02:41:0

In [12]:
# Keep the sixteen CGMS columns (_1 .. _16) plus `label`; the `user` and `time`
# columns brought in by the join are left out.
kept_columns = ["_{}".format(i) for i in range(1, 17)] + ["label"]
cgms_meal_df = cgms_meal_df.select(kept_columns)

In [13]:
# Print the first five rows of the trimmed table (_1 .. _16 and label).
cgms_meal_df.show(5)

+---------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----+
|             _1|      _2|               _3|                  _4|                 _5|                  _6|                 _7|                 _8|                  _9|               _10|               _11|              _12|               _13|               _14|               _15|               _16|label|
+---------------+--------+-----------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----+
|T-01_3h_PEN_Run|05:45:00|80.77944320892584|0.003673457809955...| 0.20668288582125

In [None]:
# Persist the labelled table as parquet. The default save mode raises when the
# target path already exists, which breaks a second "Run All"; overwrite makes
# the cell re-runnable. The output is fully derived from the CSV inputs above.
cgms_meal_df.write.save("cgms_meal_df.parquet", format="parquet", mode="overwrite")