In [1]:
import os
import sys
import pandas as pd

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col




# Temporary workaround for running PySpark from a Jupyter notebook: make both
# PySpark interpreter variables point at the Python executable of this kernel.
os.environ.update(
    dict.fromkeys(('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'), sys.executable)
)

In [2]:
# initialize spark session (local mode, 2 worker threads)
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Party-Records")
    .config("spark.driver.memory", "4g")
    # Spark property names are dot-separated. The previous key
    # "spark_executor_memory" is not a Spark property, so the 4g value
    # was never applied.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of SparkSession;
# kept here so any later cell that references `sqlContext` keeps working.
sqlContext = SQLContext(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/04 16:12:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/04 16:12:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/04 16:12:41 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Location of the party-records text file (relative to the notebook's working directory).
PartyRecords = '20160924_PartyRecords.txt'

# Column layout of the file as (column name, Spark type) pairs, in file order.
_PARTY_COLUMNS = [
    ('CASE_ID', StringType),
    ('PARTY_NUMBER', IntegerType),
    ('PARTY_TYPE', IntegerType),
    ('AT_FAULT', StringType),
    ('PARTY_SEX', StringType),
    ('PARTY_AGE', IntegerType),
    ('PARTY_SOBRIETY', StringType),
    ('PARTY_DRUG_PHYSICAL', StringType),
    ('DIR_OF_TRAVEL', StringType),
    ('PARTY_SAFETY_EQUIP_1', StringType),
    ('PARTY_SAFETY_EQUIP_2', StringType),
    ('FINAN_RESPONS', StringType),
    ('SP_INFO_1', StringType),
    # MIGHT NEED TO ADJUST THIS - HAS BOTH STRING AND INTEGER TYPES
    ('SP_INFO_2', StringType),
    ('SP_INFO_3', StringType),
    ('OAF_VIOLATION_CODE', StringType),
    ('OAF_VIOL_CAT', IntegerType),
    ('OAF_VIOL_SECTION', IntegerType),
    ('OAF_VIOLATION_SUFFIX', StringType),
    ('OAF_1', StringType),
    ('OAF_2', StringType),
    ('PARTY_NUMBER_KILLED', IntegerType),
    ('PARTY_NUMBER_INJURED', IntegerType),
    ('MOVE_PRE_ACC', StringType),
    ('VEHICLE_YEAR', IntegerType),
    ('VEHICLE_MAKE', StringType),
    ('STWD_VEHICLE_TYPE', StringType),
    ('CHP_VEH_TYPE_TOWING', IntegerType),
    ('CHP_VEH_TYPE_TOWED', IntegerType),
    ('RACE', StringType),
    ('INATTENTION', StringType),
    ('SPECIAL_INFO_F', StringType),
    ('SPECIAL_INFO_G', StringType),
    ('local_report_number', IntegerType),
]

# Explicit schema for the reader; every column is declared nullable.
schema = StructType([
    StructField(_column_name, _column_type(), True)
    for _column_name, _column_type in _PARTY_COLUMNS
])

# Load the data with the explicit schema and let the CSV reader skip the header
# line itself (header=True). The previous approach read the header as a data
# row, fetched it with first(), and filtered on CASE_ID; that cost an extra
# Spark job, cached the frame that still contained the header row, and also
# dropped any record whose CASE_ID was null (a null comparison is filtered out).
party_df = spark.read.csv(path=PartyRecords, schema=schema, header=True).cache()

# Peek at the first few parsed records.
party_df.take(5)

24/12/04 16:12:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[Row(CASE_ID=' 097293', PARTY_NUMBER=1, PARTY_TYPE=1, AT_FAULT='Y', PARTY_SEX='-', PARTY_AGE=20, PARTY_SOBRIETY='A', PARTY_DRUG_PHYSICAL='-', DIR_OF_TRAVEL='E', PARTY_SAFETY_EQUIP_1='G', PARTY_SAFETY_EQUIP_2='-', FINAN_RESPONS='N', SP_INFO_1='-', SP_INFO_2='D', SP_INFO_3='-', OAF_VIOLATION_CODE='-', OAF_VIOL_CAT=None, OAF_VIOL_SECTION=None, OAF_VIOLATION_SUFFIX=None, OAF_1='N', OAF_2='-', PARTY_NUMBER_KILLED=0, PARTY_NUMBER_INJURED=0, MOVE_PRE_ACC='M', VEHICLE_YEAR=1996, VEHICLE_MAKE='FORD', STWD_VEHICLE_TYPE='A', CHP_VEH_TYPE_TOWING=8, CHP_VEH_TYPE_TOWED=0, RACE='W', INATTENTION=None, SPECIAL_INFO_F='-', SPECIAL_INFO_G='-', local_report_number=None),
 Row(CASE_ID=' 373108', PARTY_NUMBER=1, PARTY_TYPE=1, AT_FAULT='Y', PARTY_SEX='M', PARTY_AGE=18, PARTY_SOBRIETY='A', PARTY_DRUG_PHYSICAL='-', DIR_OF_TRAVEL='S', PARTY_SAFETY_EQUIP_1='N', PARTY_SAFETY_EQUIP_2='G', FINAN_RESPONS='Y', SP_INFO_1='-', SP_INFO_2='-', SP_INFO_3='-', OAF_VIOLATION_CODE='-', OAF_VIOL_CAT=None, OAF_VIOL_SECTION=Non

In [7]:
# Build the summary-statistics frame for the party records, then print it.
party_summary = party_df.describe()
party_summary.show()

[Stage 11:>                                                         (0 + 1) / 1]

+-------+--------------------+------------------+------------------+--------+---------+------------------+--------------+-------------------+-------------+--------------------+--------------------+-------------+---------+-------------------+---------+------------------+------------------+------------------+--------------------+--------+--------+--------------------+--------------------+------------+-----------------+------------+-----------------+-------------------+------------------+--------+-----------+--------------+--------------+-------------------+
|summary|             CASE_ID|      PARTY_NUMBER|        PARTY_TYPE|AT_FAULT|PARTY_SEX|         PARTY_AGE|PARTY_SOBRIETY|PARTY_DRUG_PHYSICAL|DIR_OF_TRAVEL|PARTY_SAFETY_EQUIP_1|PARTY_SAFETY_EQUIP_2|FINAN_RESPONS|SP_INFO_1|          SP_INFO_2|SP_INFO_3|OAF_VIOLATION_CODE|      OAF_VIOL_CAT|  OAF_VIOL_SECTION|OAF_VIOLATION_SUFFIX|   OAF_1|   OAF_2| PARTY_NUMBER_KILLED|PARTY_NUMBER_INJURED|MOVE_PRE_ACC|     VEHICLE_YEAR|VEHICLE_MAKE|STWD_

                                                                                