In [30]:
import os
import sys
import pandas as pd

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col




# Temporary workaround for running PySpark inside a Jupyter kernel: point both
# PySpark interpreter variables at the interpreter running this notebook.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [31]:
# Start (or reuse) a Spark session on the local master with two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Victim-Records")
    .getOrCreate()
)

# Handles derived from the session.
# NOTE(review): sqlContext is not used in the visible cells; confirm whether
# later cells still need this legacy entry point.
sc = spark.sparkContext
sqlContext = SQLContext(sc)


In [32]:
# Path to the victim records file (relative to the notebook's working directory).
VictimRecords = '20160924_VictimRecords.txt'

# Explicit schema: PARTY_NUMBER and VICTIM_AGE are read as integers, every
# other column is kept as a string. All fields are nullable.
schema = StructType([
    StructField('CASE_ID', StringType(), True),
    StructField('PARTY_NUMBER', IntegerType(), True),
    StructField('VICTIM_ROLE', StringType(), True),
    StructField('VICTIM_SEX', StringType(), True),
    StructField('VICTIM_AGE', IntegerType(), True),
    StructField('VICTIM_DEGREE_OF_INJURY', StringType(), True),
    StructField('VICTIM_SEATING_POSITION', StringType(), True),
    StructField('VICTIM_SAFETY_EQUIP1', StringType(), True),
    StructField('VICTIM_SAFETY_EQUIP2', StringType(), True),
    StructField('VICTIM_EJECTED', StringType(), True),
])

# Load the data and let the reader discard the header line (header=True).
# This replaces the previous "read the header as data, call first(), then
# filter on CASE_ID" approach, which ran an extra job, cached the frame with
# the header still in it, dropped rows with a null CASE_ID (NULL != x is not
# true), and relied on how a header row is parsed under an integer-typed schema.
victim_df = spark.read.csv(path=VictimRecords, schema=schema, header=True).cache()

# Peek at the first few records.
victim_df.take(5)


24/12/04 16:00:29 WARN CacheManager: Asked to cache already cached data.


[Row(CASE_ID=' 097293', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=20, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID=' 965874', PARTY_NUMBER=2, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=19, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='M', VICTIM_SAFETY_EQUIP2='G', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000003', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='M', VICTIM_AGE=21, VICTIM_DEGREE_OF_INJURY='0', VICTIM_SEATING_POSITION='3', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000005', PARTY_NUMBER=1, VICTIM_ROLE='1', VICTIM_SEX='M', VICTIM_AGE=44, VICTIM_DEGREE_OF_INJURY='2', VICTIM_SEATING_POSITION='1', VICTIM_SAFETY_EQUIP1='G', VICTIM_SAFETY_EQUIP2='-', VICTIM_EJECTED='0'),
 Row(CASE_ID='0000008', PARTY_NUMBER=1, VICTIM_ROLE='2', VICTIM_SEX='F', VICTIM_AGE=59, VICTIM_DEGREE_OF_INJURY='0', VICTIM_

In [33]:
# Show the column names of the loaded DataFrame.
victim_df.columns


['CASE_ID',
 'PARTY_NUMBER',
 'VICTIM_ROLE',
 'VICTIM_SEX',
 'VICTIM_AGE',
 'VICTIM_DEGREE_OF_INJURY',
 'VICTIM_SEATING_POSITION',
 'VICTIM_SAFETY_EQUIP1',
 'VICTIM_SAFETY_EQUIP2',
 'VICTIM_EJECTED']

In [34]:
# Print each column's name, type and nullability to confirm the schema in use.
victim_df.printSchema()

root
 |-- CASE_ID: string (nullable = true)
 |-- PARTY_NUMBER: integer (nullable = true)
 |-- VICTIM_ROLE: string (nullable = true)
 |-- VICTIM_SEX: string (nullable = true)
 |-- VICTIM_AGE: integer (nullable = true)
 |-- VICTIM_DEGREE_OF_INJURY: string (nullable = true)
 |-- VICTIM_SEATING_POSITION: string (nullable = true)
 |-- VICTIM_SAFETY_EQUIP1: string (nullable = true)
 |-- VICTIM_SAFETY_EQUIP2: string (nullable = true)
 |-- VICTIM_EJECTED: string (nullable = true)



In [35]:
# Summary statistics for every column.
# Note: a mean is also reported for string columns such as CASE_ID; it is not
# a meaningful quantity for an identifier.
victim_df.describe().show()



+-------+--------------------+------------------+------------------+----------+------------------+-----------------------+-----------------------+--------------------+--------------------+-------------------+
|summary|             CASE_ID|      PARTY_NUMBER|       VICTIM_ROLE|VICTIM_SEX|        VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|     VICTIM_EJECTED|
+-------+--------------------+------------------+------------------+----------+------------------+-----------------------+-----------------------+--------------------+--------------------+-------------------+
|  count|             8037881|           8037881|           8037881|   8037881|           8037881|                8037881|                8037881|             8037881|             7428552|            8037881|
|   mean|4.501311827838024...|1.6286763887148865|1.9098758740021158|      NULL| 63.66258992886309|     1.8622888296057132|       3.17401942629864|                NU

                                                                                

In [36]:
# Baseline: number of rows whose VICTIM_AGE is 0 before the age clean-up.
victim_df.filter(victim_df.VICTIM_AGE == 0).count()

60387

In [37]:

# Clean up the sentinel codes stored in VICTIM_AGE:
#   998 -> unknown age, rows are removed
#   999 -> recoded to 0
age_col = F.col('VICTIM_AGE')

victim_df = (
    victim_df
    .filter(age_col != 998)
    .withColumn('VICTIM_AGE', F.when(age_col == 999, 0).otherwise(age_col))
)

# Re-run the summary statistics on the cleaned data.
victim_df.describe().show()










+-------+--------------------+-----------------+------------------+----------+------------------+-----------------------+-----------------------+--------------------+--------------------+------------------+
|summary|             CASE_ID|     PARTY_NUMBER|       VICTIM_ROLE|VICTIM_SEX|        VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|    VICTIM_EJECTED|
+-------+--------------------+-----------------+------------------+----------+------------------+-----------------------+-----------------------+--------------------+--------------------+------------------+
|  count|             7761237|          7761237|           7761237|   7761237|           7761237|                7761237|                7761237|             7761237|             7165578|           7761237|
|   mean|4.570746238923401E17| 1.62854284697143|1.9009195312551337|      NULL|30.339591356377856|     1.8968020175134452|      3.127575494729112|                NULL|      

                                                                                

In [38]:
# Re-count the age-0 rows after the clean-up; the increase over the earlier
# count corresponds to the rows recoded from 999 to 0.
victim_df.filter(victim_df.VICTIM_AGE == 0).count()

60536