# Input HMP
This notebook pulls the HMP accelerometer sensor data set, a data set intended for classification tasks.

The first thing to do is to start a Spark session.

In [1]:
# Initialise findspark before the pyspark imports in the next cell; init()
# locates the Spark installation and adds pyspark to sys.path.
import findspark
findspark.init()

In [2]:
import fnmatch
import os
from pathlib import Path
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
import random
import re
import shutil
import sys

In [3]:
import csv

In [4]:
import os 

In [5]:
# Runtime configuration. Every value can be overridden through an environment
# variable of the same name; all values are read as strings.

# output path and file name (default: data.csv)
data_csv = os.getenv('data_csv', 'data.csv')

# URL of the Spark master (default: local mode)
master = os.getenv('master', "local[*]")

# temporary data storage for local execution
data_dir = os.getenv('data_dir', '../data/')

# fraction (0..1) of the input data files to keep, to speed up processing
# (default: 1.0); still a string at this point
sample = os.getenv('sample', '1.0')

In [6]:
# Override parameters received from a potential call using the %run magic,
# e.g. ``%run <this notebook> sample=0.1``.
#
# Every command-line argument of the form ``name=value`` assigns the string
# ``value`` to the notebook-level variable ``name``. The argument is split at
# the FIRST '=' only, so values may themselves contain '=' or quote
# characters. Nothing is passed to exec(): building Python source from
# sys.argv breaks on such values and would execute arbitrary code.
parameters = {}
for argument in sys.argv:
    name, separator, value = argument.partition('=')
    # Ignore arguments without '=' as well as option-style arguments such as
    # ``--ip=127.0.0.1`` whose left-hand side is not a valid variable name.
    if not separator or not name.isidentifier():
        continue
    parameters[name] = value

globals().update(parameters)

# cast parameters to appropriate type
sample = float(sample)

Let's create a local Spark context (`sc`) and session (`spark`).

In [7]:
# Get the active SparkContext, or create one that points at the configured
# master URL.
sc = SparkContext.getOrCreate(conf=SparkConf().setMaster(master))

# Get (or create) the SparkSession used for all DataFrame work below.
spark = SparkSession.builder.getOrCreate()

Let's pull the data in raw format from the source (GitHub).

In [8]:
# On non-Windows systems, delete any previous checkout through the shell so
# that the clone further down starts from a clean state.
if os.name != 'nt':
    !rm -Rf HMP_Dataset

In [9]:
if os.name == 'nt':
    !rmdir -p HMP_Dataset

The system cannot find the file specified.
The directory is not empty.


In [10]:
# Clone the raw HMP data set from GitHub into ./HMP_Dataset. git refuses to
# clone when that directory already exists and is not empty.
!git clone https://github.com/wchill/HMP_Dataset

fatal: destination path 'HMP_Dataset' already exists and is not an empty directory.


In [11]:
# Schema of one data file: three nullable integer columns, one per
# accelerometer axis.
schema = StructType(
    [StructField(axis, IntegerType(), True) for axis in ("x", "y", "z")]
)

This step takes a while: it walks through all folders and files and creates a temporary Spark DataFrame for each file, which is appended to an overall DataFrame `df`. In addition, a column called "source" (the file name) and a column called "class" (the folder name) are added, so the result can be used directly in Spark afterwards, for example in a supervised machine learning scenario.

In [12]:
d = 'HMP_Dataset/'

# List all folders containing data, i.e. directories whose name does not
# start with '.'. Sorted so the processing order does not depend on the file
# system. Boolean `and` / `not` are used instead of the bitwise `&` / `~`,
# which only gave the right answer through integer arithmetic on bools.
file_list_filtered = sorted(
    s for s in os.listdir(d)
    if os.path.isdir(os.path.join(d, s)) and not fnmatch.fnmatch(s, '.*')
)

# Overall Spark DataFrame; stays None until the first data file has been read.
df = None

for category in file_list_filtered:
    data_files = sorted(os.listdir(d + category))

    # create a temporary Spark DataFrame for each data file
    for data_file in data_files:
        # When sampling is enabled, each file is kept with probability `sample`.
        if sample < 1.0 and random.random() > sample:
            print('Skipping: ' + data_file)
            continue
        print(data_file)

        # The path is assembled with forward slashes, exactly as before,
        # because it is handed to Spark rather than to the local OS API.
        temp_df = (
            spark.read
            .option("header", "false")
            .option("delimiter", " ")
            .csv(d + category + '/' + data_file, schema=schema)
            # "source" stores the current data file name
            .withColumn("source", lit(data_file))
            # "class" stores the current data folder, i.e. the label
            .withColumn("class", lit(category))
        )

        # append to the overall DataFrame
        df = temp_df if df is None else df.union(temp_df)

# Fail here with a clear message instead of with an AttributeError on None in
# a later cell.
if df is None:
    raise RuntimeError(
        'No data files were read from ' + d +
        '; check that the clone succeeded and that sample is greater than 0'
    )

Accelerometer-2011-04-11-13-28-18-brush_teeth-f1.txt
Accelerometer-2011-04-11-13-29-54-brush_teeth-f1.txt
Accelerometer-2011-05-30-08-35-11-brush_teeth-f1.txt
Accelerometer-2011-05-30-09-36-50-brush_teeth-f1.txt
Accelerometer-2011-05-30-10-34-16-brush_teeth-m1.txt
Accelerometer-2011-05-30-21-10-57-brush_teeth-f1.txt
Accelerometer-2011-05-30-21-55-04-brush_teeth-m2.txt
Accelerometer-2011-05-31-15-16-47-brush_teeth-f1.txt
Accelerometer-2011-06-02-10-42-22-brush_teeth-f1.txt
Accelerometer-2011-06-02-10-45-50-brush_teeth-f1.txt
Accelerometer-2011-06-06-10-45-27-brush_teeth-f1.txt
Accelerometer-2011-06-06-10-48-05-brush_teeth-f1.txt
Accelerometer-2011-03-24-10-24-39-climb_stairs-f1.txt
Accelerometer-2011-03-24-10-25-44-climb_stairs-f1.txt
Accelerometer-2011-03-29-09-55-46-climb_stairs-f1.txt
Accelerometer-2011-04-05-18-21-22-climb_stairs-f1.txt
Accelerometer-2011-04-05-18-32-29-climb_stairs-f1.txt
Accelerometer-2011-04-11-11-44-35-climb_stairs-f1.txt
Accelerometer-2011-04-11-11-57-50-climb_

Let's write the data frame to disk in CSV format; this will also take quite some time:

In [13]:
# Remove the output of a previous run before writing again. The target is
# normally a directory, but a regular file left at the same path is removed
# as well; shutil.rmtree alone would raise on a regular file.
output_path = Path(data_dir + data_csv)
if output_path.is_dir():
    shutil.rmtree(output_path)
elif output_path.exists():
    output_path.unlink()

In [14]:
# Write the combined DataFrame as CSV, including a header row.
df.write.csv(data_dir + data_csv, header=True)

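Now we should have our contents in CSV format at the configured output path. Note that Spark writes a directory of part files rather than a single CSV file, which is why the previous output is removed with `shutil.rmtree` above.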