In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import asc, desc

In [2]:
# Run parameters that produce output_1.txt.
files = 'WISDM/*/*/'
subject_id, activity_code, n = 1613, 'A', 20

# Run parameters that produce output_2.txt (uncomment to switch).
# files = 'WISDM/*/*/'
# subject_id, activity_code, n = 1631, 'B', 10

In [3]:
# Reuse the active SparkSession if there is one, otherwise start a new one.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext  # SparkContext of the session; required to create RDDs

In [4]:
# Read every file matched by the glob; each RDD element is a
# (file path, entire file content) pair.
data = sc.wholeTextFiles(files)

# QUESTION 1

In [63]:
# The key of every (path, content) pair is the file path, so the number of
# distinct keys is the number of files that were read.
num_files = data.keys().distinct().count()
print(num_files)

204


# QUESTION 2

In [60]:
# NOTE: earlier draft of the cleaning pipeline, kept for reference only.
# The active definition of cleaned_rdd is in the next cell.
# cleaned_rdd = data.map(lambda x: (x[0],x[1].split('\n')))\
#     .flatMapValues(lambda x: x)\
#     .map(lambda x: (x[0].split('/'), x[1]))\
#     .map(lambda x: (x[0][-2], x[0][-3], x[1][:-1].split(',')))\
#     #.filter(lambda x: x[2][0]!='')\
#     #.map(lambda x: (int(x[2][0]), x[0], x[1], x[2][2], float(x[2][3]), float(x[2][4]), float(x[2][5])))

In [83]:
# Number of comma-separated values a complete record carries:
# subject_id, activity_code, timestamp, x, y, z.
_EXPECTED_FIELDS = 6


def _split_record(path, line):
    """Turn one raw line of one input file into (sensor, device, fields).

    The sensor and device names are taken from the two directory components
    that precede the file name in ``path``. ``fields`` is the list of
    comma-separated values on the line, each stripped of surrounding
    whitespace.

    Only a trailing ';' terminator (and surrounding whitespace such as a
    stray '\\r') is removed from the line. Unlike slicing off the last
    character unconditionally, this cannot eat a digit of the z value when a
    line has no terminator.
    """
    parts = path.split('/')
    fields = [value.strip() for value in line.strip().rstrip(';').split(',')]
    return parts[-2], parts[-3], fields


def _is_complete(record):
    """Return True when the record has exactly the expected fields, none empty.

    Checking the field count as well as emptiness means a truncated line is
    filtered out here instead of raising IndexError in the conversion step.
    """
    fields = record[2]
    return len(fields) == _EXPECTED_FIELDS and all(fields)


def _to_typed_row(record):
    """Convert a complete (sensor, device, fields) record into a typed tuple.

    Returns (subject_id, sensor, device, activity_code, timestamp, x, y, z).
    """
    sensor, device, fields = record
    return (int(fields[0]), sensor, device, fields[1], int(fields[2]),
            float(fields[3]), float(fields[4]), float(fields[5]))


# One element per line of every file -> parsed record -> complete records only
# -> typed tuples matching the DataFrame schema.
cleaned_rdd = data.flatMapValues(lambda text: text.split('\n'))\
    .map(lambda path_line: _split_record(path_line[0], path_line[1]))\
    .filter(_is_complete)\
    .map(_to_typed_row)

In [84]:
# count() is an action: it runs the whole cleaning pipeline over every file.
num_records_no_null = cleaned_rdd.count()

In [81]:
# Number of records left after dropping rows that contain an empty field.
print(num_records_no_null)

15630426


In [85]:
# cleaned_rdd.take(1)

[(1613,
  'gyro',
  'phone',
  'A',
  178468071944614,
  -0.020240024,
  -0.004261058,
  -0.023435818)]

# QUESTION 3
            

In [86]:
# Schema for the cleaned tuples, one column per tuple position in order.
# Every column is declared non-nullable (third argument False).
schema = StructType()\
    .add('subject_id', IntegerType(), False)\
    .add('sensor', StringType(), False)\
    .add('device', StringType(), False)\
    .add('activity_code', StringType(), False)\
    .add('timestamp', LongType(), False)\
    .add('x', FloatType(), False)\
    .add('y', FloatType(), False)\
    .add('z', FloatType(), False)

In [87]:
# Convert the RDD of typed tuples into a DataFrame using the explicit schema.
data_df = ss.createDataFrame(cleaned_rdd, schema)

In [89]:
# Print the column names, types and nullability of the DataFrame.
data_df.printSchema()

root
 |-- subject_id: integer (nullable = false)
 |-- sensor: string (nullable = false)
 |-- device: string (nullable = false)
 |-- activity_code: string (nullable = false)
 |-- timestamp: long (nullable = false)
 |-- x: float (nullable = false)
 |-- y: float (nullable = false)
 |-- z: float (nullable = false)



# QUESTION 4

In [98]:
# Distinct subject ids, smallest first.
unique_id_df = data_df.select('subject_id')\
    .distinct()\
    .orderBy(asc('subject_id'))

In [99]:
# Pass the row count so show() prints every id rather than its default 20 rows.
unique_id_df.show(unique_id_df.count())

+----------+
|subject_id|
+----------+
|      1600|
|      1601|
|      1602|
|      1603|
|      1604|
|      1605|
|      1606|
|      1607|
|      1608|
|      1609|
|      1610|
|      1611|
|      1612|
|      1613|
|      1614|
|      1615|
|      1616|
|      1617|
|      1618|
|      1619|
|      1620|
|      1621|
|      1622|
|      1623|
|      1624|
|      1625|
|      1626|
|      1627|
|      1628|
|      1629|
|      1630|
|      1631|
|      1632|
|      1633|
|      1634|
|      1635|
|      1636|
|      1637|
|      1638|
|      1639|
|      1640|
|      1641|
|      1642|
|      1643|
|      1644|
|      1645|
|      1646|
|      1647|
|      1648|
|      1649|
|      1650|
+----------+



# QUESTION 5

In [100]:
# Distinct sensor names in ascending order; the row count is passed to show()
# so that every row is printed.
unique_sensor_df = data_df.select('sensor')\
    .distinct()\
    .orderBy(asc('sensor'))
unique_sensor_df.show(unique_sensor_df.count())

+------+
|sensor|
+------+
| accel|
|  gyro|
+------+



# QUESTION 6

In [102]:
# Distinct activity codes in ascending order; the row count is passed to
# show() so that every row is printed.
unique_act_code_df = data_df.select('activity_code')\
    .distinct()\
    .orderBy(asc('activity_code'))
unique_act_code_df.show(unique_act_code_df.count())

+-------------+
|activity_code|
+-------------+
|            A|
|            B|
|            C|
|            D|
|            E|
|            F|
|            G|
|            H|
|            I|
|            J|
|            K|
|            L|
|            M|
|            O|
|            P|
|            Q|
|            R|
|            S|
+-------------+



# QUESTION 7

In [112]:
# Rows for the configured subject and activity, ordered by timestamp
# (ascending) and, within a timestamp, by sensor name (descending).
# The predicate is built from Column expressions instead of an interpolated
# SQL string: subject_id is compared as a number rather than as a quoted
# string, and activity_code needs no manual quoting or escaping.
result7 = data_df.filter(
    (data_df['subject_id'] == subject_id)
    & (data_df['activity_code'] == activity_code)
).orderBy(asc('timestamp'), desc('sensor'))

In [113]:
# Display the first n rows (n is set in the parameter cell).
result7.show(n)

+----------+------+------+-------------+---------------+------------+------------+------------+
|subject_id|sensor|device|activity_code|      timestamp|           x|           y|           z|
+----------+------+------+-------------+---------------+------------+------------+------------+
|      1613|  gyro| phone|            A|178468071944614|-0.020240024|-0.004261058|-0.023435818|
|      1613| accel| phone|            A|178468071944614|  -3.7834373|    13.03164|   2.2427685|
|      1613|  gyro| phone|            A|178468104194617|  -2.5750105|  0.18109496|   1.3864417|
|      1613| accel| phone|            A|178468104194617|     -2.9317|    8.473679|    2.373851|
|      1613|  gyro| phone|            A|178468142811857|  -1.5739282|   0.6668556|    1.320928|
|      1613| accel| phone|            A|178468142811857|  -2.0260932|      5.4091|   1.3359646|
|      1613|  gyro| phone|            A|178468183987271|  -1.5041534|   1.7973675|    0.824781|
|      1613| accel| phone|            A|

# QUESTION 8

In [115]:
# data_df.show(5)

+----------+------+------+-------------+---------------+------------+------------+------------+
|subject_id|sensor|device|activity_code|      timestamp|           x|           y|           z|
+----------+------+------+-------------+---------------+------------+------------+------------+
|      1613|  gyro| phone|            A|178468071944614|-0.020240024|-0.004261058|-0.023435818|
|      1613|  gyro| phone|            A|178468104194617|  -2.5750105|  0.18109496|   1.3864417|
|      1613|  gyro| phone|            A|178468142811857|  -1.5739282|   0.6668556|    1.320928|
|      1613|  gyro| phone|            A|178468183987271|  -1.5041534|   1.7973675|    0.824781|
|      1613|  gyro| phone|            A|178468225406856| -0.50786483|   1.6002935|  0.45833004|
+----------+------+------+-------------+---------------+------------+------------+------------+
only showing top 5 rows



In [131]:
# Question 8 result, built in one re-runnable step directly from data_df.
# Previously a second cell reassigned result8 from itself and dropped x/y/z,
# so running that cell twice failed because the columns were already gone.
#
# Steps: keep the configured subject and activity, order by timestamp
# (ascending) then sensor (descending), add one boolean flag per axis, and
# drop the raw readings. Note that a reading of exactly 0 is flagged True
# (the comparison is >= 0).
result8 = (
    data_df
    .filter((data_df['subject_id'] == subject_id)
            & (data_df['activity_code'] == activity_code))
    .orderBy(asc('timestamp'), desc('sensor'))
    .withColumn('x_positive', data_df['x'] >= 0)
    .withColumn('y_positive', data_df['y'] >= 0)
    .withColumn('z_positive', data_df['z'] >= 0)
    .drop('x', 'y', 'z')
)

In [134]:
# Display the first n rows (n is set in the parameter cell).
result8.show(n)

+----------+------+------+-------------+---------------+----------+----------+----------+
|subject_id|sensor|device|activity_code|      timestamp|x_positive|y_positive|z_positive|
+----------+------+------+-------------+---------------+----------+----------+----------+
|      1613|  gyro| phone|            A|178468071944614|     false|     false|     false|
|      1613| accel| phone|            A|178468071944614|     false|      true|      true|
|      1613|  gyro| phone|            A|178468104194617|     false|      true|      true|
|      1613| accel| phone|            A|178468104194617|     false|      true|      true|
|      1613|  gyro| phone|            A|178468142811857|     false|      true|      true|
|      1613| accel| phone|            A|178468142811857|     false|      true|      true|
|      1613|  gyro| phone|            A|178468183987271|     false|      true|      true|
|      1613| accel| phone|            A|178468183987271|     false|      true|      true|
|      161

In [135]:
# Stop the SparkSession once all results have been produced.
ss.stop()