In [1]:
# findspark must be initialised with the local Spark install before any
# pyspark module is imported.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')

import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

# Single session shared by every cell below; getOrCreate() returns the
# already-running session when one exists.
spark = SparkSession.builder.appName('Step 4 Data Transformation').getOrCreate()

In [2]:
# Read the prepped data set: the first row supplies the column names and
# Spark infers each column's type from the file contents.
df = spark.read.csv(
    path='Department 37 prepped.csv',
    inferSchema=True,
    header=True,
)

# 4.2

In [3]:
# "_c0" was introduced when the previous step wrote its result to CSV and
# carries no information, so it is discarded before encoding.
index_column = "_c0"
df = df.drop(index_column)

# Preview the first five rows.
df.show(n=5)

+------+-------+-------------------+--------------------+----------+--------+------+--------------------+----------+-------------------+--------------------+----------+----------+--------------------+
|OffSex|OffRace|INCIDENT_DATE_LESS_|     OFF_INJURE_DESC|OFF_HOSPIT| CitRace|CitSex|    SUBJ_INJURE_DESC|CIT_ARREST|         CIT_INFL_A|          CITChargeT|SERVICE_TY|UOF_REASON|          ForceType1|
+------+-------+-------------------+--------------------+----------+--------+------+--------------------+----------+-------------------+--------------------+----------+----------+--------------------+
|  Male|  Black|                  2|No injuries noted...|        No|   Black|Female|Non-Visible Injur...|       Yes|  Mentally unstable|               APOWW|    Arrest|    Arrest|Hand/Arm/Elbow St...|
|  Male|  White|                 17|       Sprain/Strain|       Yes|Hispanic|  Male|No injuries noted...|       Yes|  Mentally unstable|               APOWW|    Arrest|    Arrest|         Joint Lo

In [4]:
# Columns to index with StringIndexer: every column except
#   * INCIDENT_DATE_LESS_ - already numerical, and
#   * CitSex              - StringIndexer raises errors on this field, so it is
#                           encoded by hand in a later cell.
#     NOTE(review): the cause of the error is not visible here (possibly null
#     values) - confirm.
# `x` is kept as a module-level name because a later cell uses it to filter
# out the original string columns.
x = df.columns
for already_handled in ("INCIDENT_DATE_LESS_", "CitSex"):
    x.remove(already_handled)

# One StringIndexer per categorical column; each writes its indices to a new
# "<column> encoded" column and leaves the source column in place.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + " encoded")
    for column in x
]

# Fit and apply all indexers in order through a single Pipeline instead of
# re-assigning the frame inside a manual fit/transform loop.
encodeddf = Pipeline(stages=indexers).fit(df).transform(df)

In [5]:
# Drop the original string columns that were just indexed (the names held in
# `x`). What survives is INCIDENT_DATE_LESS_, the new "... encoded" columns,
# and the raw CitSex column, which was excluded from `x` and is still a string
# at this point.
indexed_sources = set(x)
y = [column for column in encodeddf.columns if column not in indexed_sources]
encodeddf = encodeddf.select(y)

In [6]:
# Manual encoding of CitSex: "Male" -> 1.0, every other value -> 2.0.
# NOTE(review): nulls and any unexpected spellings also land in the 2.0
# bucket - confirm the column only ever holds the two expected values.
# The condition is built from encodeddf's own column rather than from the
# ancestor frame `df`, so it does not rely on the two frames sharing lineage.
is_male = encodeddf["CitSex"] == "Male"
encodeddf = encodeddf.withColumn("CitSex encoded", when(is_male, 1.0).otherwise(2.0))

In [7]:
# The raw CitSex strings are no longer needed once "CitSex encoded" exists.
raw_column = "CitSex"
encodeddf = encodeddf.drop(raw_column)

# Preview the first five encoded rows.
encodeddf.show(n=5)

+-------------------+--------------+---------------+-----------------------+------------------+---------------+------------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------+
|INCIDENT_DATE_LESS_|OffSex encoded|OffRace encoded|OFF_INJURE_DESC encoded|OFF_HOSPIT encoded|CitRace encoded|SUBJ_INJURE_DESC encoded|CIT_ARREST encoded|CIT_INFL_A encoded|CITChargeT encoded|SERVICE_TY encoded|UOF_REASON encoded|ForceType1 encoded|CitSex encoded|
+-------------------+--------------+---------------+-----------------------+------------------+---------------+------------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------+
|                  2|           0.0|            2.0|                    0.0|               0.0|            0.0|                     4.0|               0.0|               0.0|               0.0|         

Now we can use VectorAssembler to combine the feature columns into a single "features" vector column, giving a DataFrame of (features, label) pairs. "CIT_ARREST encoded" is the label.

In [8]:
# Every column except the label becomes an input to the feature vector.
# Note that `x` is re-bound here: from this cell on it holds the feature
# column names, not the categorical column names used during indexing.
label_column = "CIT_ARREST encoded"
x = encodeddf.columns
x.remove(label_column)

# Assembler that packs the columns listed in `x` into one "features" vector.
af = VectorAssembler(outputCol="features", inputCols=x)

In [9]:
# Apply the assembler: the result is encodeddf plus the "features" vector column.
output = af.transform(dataset=encodeddf)

In [10]:
# Keep only the assembled feature vector and the label column.
final = output.select(["features", "CIT_ARREST encoded"])

In [11]:
# Preview the first five (features, label) rows.
final.show(n=5)

+--------------------+------------------+
|            features|CIT_ARREST encoded|
+--------------------+------------------+
|(13,[0,2,6,11,12]...|               0.0|
|(13,[0,3,4,5,11,1...|               0.0|
|(13,[0,2,5,7,11,1...|               0.0|
|[24.0,0.0,2.0,0.0...|               0.0|
|(13,[0,7,8,12],[7...|               0.0|
+--------------------+------------------+
only showing top 5 rows



In [12]:
# Collect the result to the driver as a pandas DataFrame and write it to CSV.
# NOTE(review): to_csv is called with its defaults, so the pandas index is
# written as an extra unnamed column - presumably the same kind of column this
# notebook drops as "_c0" after loading the previous step's file. Confirm
# whether the next step expects it before passing index=False.
final_pandas = final.toPandas()
final_pandas.to_csv("Department 37 vector.csv")