In [1]:
import os
import sys
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline


#temporary fix for running pyspark in jupyter notebook
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
# initialize spark session
spark = SparkSession.builder.master("local[2]").appName("Victim-Records").getOrCreate()

sc = spark.sparkContext

sqlContext = SQLContext(sc)


24/12/16 11:49:06 WARN Utils: Your hostname, Seans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.158 instead (on interface en0)
24/12/16 11:49:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 11:49:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/16 11:49:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# define the path to the data

VictimRecords = ['20160924_VictimRecords.txt','20170112_VictimRecords.txt','20180925_VictimRecords.txt', '20201024_VictimRecords.txt']
# define the schema
schema = StructType([
    StructField('CASE_ID', StringType(), True),
    StructField('PARTY_NUMBER', IntegerType(), True),
    StructField('VICTIM_ROLE', StringType(), True),
    StructField('VICTIM_SEX', StringType(), True),
    StructField('VICTIM_AGE', IntegerType(), True),
    StructField('VICTIM_DEGREE_OF_INJURY', StringType(), True),
    StructField('VICTIM_SEATING_POSITION', StringType(), True),
    StructField('VICTIM_SAFETY_EQUIP1', StringType(), True),
    StructField('VICTIM_SAFETY_EQUIP2', StringType(), True),
    StructField('VICTIM_EJECTED', StringType(), True),
])


# load the data, skip header
victim_df = spark.read.csv(path = VictimRecords, schema = schema).cache()
header = victim_df.first()
victim_df = victim_df.filter(col("CASE_ID") != header["CASE_ID"])
victim_df.take(5)

#### Debug: Reduce dataset for testing ####
#victim_df = victim_df.limit(10)




24/12/16 11:49:50 WARN MemoryStore: Not enough space to cache rdd_3_5 in memory! (computed 59.3 MiB so far)
24/12/16 11:49:50 WARN BlockManager: Persisting block rdd_3_5 to disk instead.
24/12/16 11:49:53 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:49:53 WARN BlockManager: Persisting block rdd_3_6 to disk instead.
24/12/16 11:49:55 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 25.4 MiB so far)
24/12/16 11:49:55 WARN BlockManager: Persisting block rdd_3_7 to disk instead.
24/12/16 11:50:00 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 25.4 MiB so far)
24/12/16 11:50:00 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 25.4 MiB so far)
24/12/16 11:50:02 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 14.7 MiB so far)
24/12/16 11:50:02 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 14.9 MiB so far)
24/12/1

[Row(CASE_ID=' 097293', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=20, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID=' 965874', PARTY_NUMBER=2, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=19, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='M', VICTIM_SAFETY_EQUIP2='G', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000003', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=21, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000005', PARTY_NUMBER=1, VICTIM_ROLE='1', VICTIM_SEX='M', VICTIM_AGE=44, VICTIM_DEGREE_OF_INJURY='2', VICTIM_SEATING_POSITION='1', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000008', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='F', VICTIM_AGE=59, VICTIM_DEGREE_OF_INJURY='0', VICTIM_

In [4]:
victim_df.columns


['CASE_ID',
 'PARTY_NUMBER',
 'VICTIM_ROLE',
 'VICTIM_SEX',
 'VICTIM_AGE',
 'VICTIM_DEGREE_OF_INJURY',
 'VICTIM_SEATING_POSITION',
 'VICTIM_SAFETY_EQUIP1',
 'VICTIM_SAFETY_EQUIP2',
 'VICTIM_EJECTED']

In [5]:
victim_df.printSchema()

root
 |-- CASE_ID: string (nullable = true)
 |-- PARTY_NUMBER: integer (nullable = true)
 |-- VICTIM_ROLE: string (nullable = true)
 |-- VICTIM_SEX: string (nullable = true)
 |-- VICTIM_AGE: integer (nullable = true)
 |-- VICTIM_DEGREE_OF_INJURY: string (nullable = true)
 |-- VICTIM_SEATING_POSITION: string (nullable = true)
 |-- VICTIM_SAFETY_EQUIP1: string (nullable = true)
 |-- VICTIM_SAFETY_EQUIP2: string (nullable = true)
 |-- VICTIM_EJECTED: string (nullable = true)



In [6]:
victim_df.describe().show()

24/12/16 11:50:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/12/16 11:51:45 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 25.4 MiB so far)
24/12/16 11:51:47 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 7.5 MiB so far)
24/12/16 11:52:12 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 22.3 MiB so far)
24/12/16 11:52:13 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 3.9 MiB so far)
24/12/16 11:52:33 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

+-------+--------------------+------------------+------------------+------------------+------------------+-----------------------+-----------------------+--------------------+--------------------+------------------+
|summary|             CASE_ID|      PARTY_NUMBER|       VICTIM_ROLE|        VICTIM_SEX|        VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|    VICTIM_EJECTED|
+-------+--------------------+------------------+------------------+------------------+------------------+-----------------------+-----------------------+--------------------+--------------------+------------------+
|  count|            30396065|          30396065|          30396065|          30396065|          30396065|               30396065|               30396065|            30396065|            28568078|          30396065|
|   mean|3.570956518533931...|1.6305697464457982|1.8703510818017473|3.5555555555555554| 63.75028152492765|     1.9722526757425833|      

In [7]:
victim_df.filter(victim_df.VICTIM_AGE == 0).count()

24/12/16 11:52:54 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:52:54 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:52:54 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 3.9 MiB so far)
24/12/16 11:52:54 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 22.3 MiB so far)
24/12/16 11:52:54 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

217309

In [8]:

# filter out unknown ages (998)
victim_df = victim_df.filter(victim_df.VICTIM_AGE != 998)

# change all 999 age instances to 0
victim_df = victim_df.withColumn('VICTIM_AGE', F.when(victim_df.VICTIM_AGE == 999, 0).otherwise(victim_df.VICTIM_AGE))


victim_df.describe().show()








24/12/16 11:54:19 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 25.4 MiB so far)
24/12/16 11:54:21 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 7.5 MiB so far)
24/12/16 11:54:43 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 22.3 MiB so far)
24/12/16 11:54:44 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 3.9 MiB so far)
24/12/16 11:55:02 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)

+-------+--------------------+------------------+------------------+------------------+------------------+-----------------------+-----------------------+--------------------+--------------------+-------------------+
|summary|             CASE_ID|      PARTY_NUMBER|       VICTIM_ROLE|        VICTIM_SEX|        VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|     VICTIM_EJECTED|
+-------+--------------------+------------------+------------------+------------------+------------------+-----------------------+-----------------------+--------------------+--------------------+-------------------+
|  count|            29360479|          29360479|          29360479|          29360479|          29360479|               29360479|               29360479|            29360479|            27573502|           29360479|
|   mean|3.624734272311125...|1.6304027260590674|1.8611631661828245|3.5619047619047617|30.780437471745607|      2.009398080677406|  

                                                                                

In [9]:
victim_df.filter(victim_df.VICTIM_AGE == 0).count()

24/12/16 11:55:21 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:21 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:21 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 14.9 MiB so far)
24/12/16 11:55:21 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:22 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

217824

In [10]:
# remove all "not stated - values" and null
filterCondition = (col(victim_df.columns[0]) != "-")
filterCondition = filterCondition | (col(victim_df.columns[0]) != None)


for c in victim_df.columns[1:]:
    filterCondition = filterCondition | (col(c) != "-")
    filterCondition = filterCondition | (col(c) != None)

filtered_df = victim_df.filter(filterCondition)
filtered_df.head(5)

[Row(CASE_ID=' 097293', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=20, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID=' 965874', PARTY_NUMBER=2, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=19, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='M', VICTIM_SAFETY_EQUIP2='G', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000003', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=21, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000005', PARTY_NUMBER=1, VICTIM_ROLE='1', VICTIM_SEX='M', VICTIM_AGE=44, VICTIM_DEGREE_OF_INJURY='2', VICTIM_SEATING_POSITION='1', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000008', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='F', VICTIM_AGE=59, VICTIM_DEGREE_OF_INJURY='0', VICTIM_

In [11]:

encoded_and_index = ["VICTIM_SEX","VICTIM_DEGREE_OF_INJURY","VICTIM_SEATING_POSITION","VICTIM_SAFETY_EQUIP2","VICTIM_EJECTED"]

indexers = [StringIndexer(inputCol=c, outputCol=c+"_index") for c in encoded_and_index]
encoders = [OneHotEncoder(inputCol=c+"_index", outputCol=c+"_vec") for c in encoded_and_index]

filtered_df2 = filtered_df

for col_name in encoded_and_index:
    
    indexer = StringIndexer(inputCol=col_name, outputCol=col_name + "_index")
    print(indexer)
    indexedDf = indexer.fit(victim_df).transform(victim_df)
   
    encoder = OneHotEncoder(inputCol=col_name + "_index", outputCol=col_name + "_vec")
    encoded_df = encoder.fit(indexedDf).transform(indexedDf)

encoded_df.show(truncate=False)

StringIndexer_98d23f3e6c97


24/12/16 11:55:25 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:25 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:25 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 14.9 MiB so far)
24/12/16 11:55:25 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:26 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

StringIndexer_bd76f85846bf


24/12/16 11:55:29 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:29 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:30 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 14.9 MiB so far)
24/12/16 11:55:30 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:30 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

StringIndexer_654df9678172


24/12/16 11:55:32 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:32 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:33 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 14.9 MiB so far)
24/12/16 11:55:33 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:33 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

StringIndexer_3b7610c86801


24/12/16 11:55:36 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:36 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:37 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 22.3 MiB so far)
24/12/16 11:55:37 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 3.9 MiB so far)
24/12/16 11:55:37 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

StringIndexer_e3c10dab6dd1


24/12/16 11:55:40 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:40 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:40 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 22.3 MiB so far)
24/12/16 11:55:40 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 3.9 MiB so far)
24/12/16 11:55:41 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)
                                                                                

+-------+------------+-----------+----------+----------+-----------------------+-----------------------+--------------------+--------------------+--------------+--------------------+------------------+
|CASE_ID|PARTY_NUMBER|VICTIM_ROLE|VICTIM_SEX|VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|VICTIM_EJECTED|VICTIM_EJECTED_index|VICTIM_EJECTED_vec|
+-------+------------+-----------+----------+----------+-----------------------+-----------------------+--------------------+--------------------+--------------+--------------------+------------------+
| 097293|1           |2          |M         |20        |0                      |3                      |G                   |-                   |0             |0.0                 |(15,[0],[1.0])    |
| 965874|2           |2          |M         |19        |0                      |3                      |M                   |G                   |0             |0.0                 |(15,[0],[1

In [12]:
# Save the cleaned Victim Records DataFrame as CSV
filtered_df2.write.csv("clean_victim_records.csv", header=True, mode="overwrite")
print("Victim Records saved successfully")

24/12/16 11:55:53 WARN MemoryStore: Not enough space to cache rdd_3_7 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:53 WARN MemoryStore: Not enough space to cache rdd_3_6 in memory! (computed 14.7 MiB so far)
24/12/16 11:55:56 WARN MemoryStore: Not enough space to cache rdd_3_8 in memory! (computed 22.3 MiB so far)
24/12/16 11:55:56 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 3.9 MiB so far)
24/12/16 11:55:59 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 25.4 MiB so far)

Victim Records saved successfully


                                                                                