# Obj:                   To classify San Francisco Crime Description into one of 33 pre-defined categories.
# Data Source (Kaggle) : https://www.kaggle.com/c/sf-crime/data

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

In [2]:
# Building the Spark Session
spark = SparkSession \
        .builder \
        .appName("SFO_Crime_Classif") \
        .config('spark.some.config.option','some-value') \
        .getOrCreate()

# here, 'spark' is an object of SparkSession, which has the 'SparkContext' object and can be accessed directly
sc = spark.sparkContext
print(sc.version)

2.0.2


In [3]:
# Loading the csv files:

train_data = spark.read.format('com.databricks.spark.csv') \
        .options(header='True', inferschema='True') \
        .load('/home/ramscrux7757/SPARK/SFO_Crime_Class/train.csv')

test_data = spark.read.format('com.databricks.spark.csv') \
       .options(header='True', inferschema='True') \
       .load('/home/ramscrux7757/SPARK/SFO_Crime_Class/test.csv')

In [4]:
print(type(train_data))
print(type(test_data))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>


In [5]:
print(train_data.count(), len(train_data.columns))
print(test_data.count(), len(test_data.columns))

(878049, 9)
(884262, 7)


# These are huge in size and will be using a sampled datasets 

In [6]:
print(train_data.columns)
print(test_data.columns)

['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']


In [7]:
train_data.limit(5).toPandas()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [8]:
# since the task is to catogorize the cirme/incident based on the description, 
# we are dropping all other irrelavant data except Category and Description 

train_cln = train_data.select(['Category','Descript'])
#cols_to_drop = ['Dates','DayOfWeek','PdDistrict','Resolution','Address','X','Y']
#train_cln = train_data.select([col for col in train_data.columns if col not in cols_to_drop])
train_cln.show(5)

+--------------+--------------------+
|      Category|            Descript|
+--------------+--------------------+
|      WARRANTS|      WARRANT ARREST|
|OTHER OFFENSES|TRAFFIC VIOLATION...|
|OTHER OFFENSES|TRAFFIC VIOLATION...|
| LARCENY/THEFT|GRAND THEFT FROM ...|
| LARCENY/THEFT|GRAND THEFT FROM ...|
+--------------+--------------------+
only showing top 5 rows



In [9]:
# Check the data types are correctly identified or not
train_cln.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Descript: string (nullable = true)



In [10]:
# Identifying the distinct categories in 'Category'
# Registering the table as a View (for sql kind of exploration)
train_cln.createOrReplaceTempView('View')
df = spark.sql('select Distinct(Category) from View')
df.count()

39

# There are 39 distinct categories

In [11]:
from pyspark.sql.functions import col

train_cln.groupBy('Category') \
         .count() \
         .orderBy(col('count').desc()) \
         .show()

+--------------------+------+
|            Category| count|
+--------------------+------+
|       LARCENY/THEFT|174900|
|      OTHER OFFENSES|126182|
|        NON-CRIMINAL| 92304|
|             ASSAULT| 76876|
|       DRUG/NARCOTIC| 53971|
|       VEHICLE THEFT| 53781|
|           VANDALISM| 44725|
|            WARRANTS| 42214|
|            BURGLARY| 36755|
|      SUSPICIOUS OCC| 31414|
|      MISSING PERSON| 25989|
|             ROBBERY| 23000|
|               FRAUD| 16679|
|FORGERY/COUNTERFE...| 10609|
|     SECONDARY CODES|  9985|
|         WEAPON LAWS|  8555|
|        PROSTITUTION|  7484|
|            TRESPASS|  7326|
|     STOLEN PROPERTY|  4540|
|SEX OFFENSES FORC...|  4388|
+--------------------+------+
only showing top 20 rows



In [12]:
# top crime descriptions (grouping based on 'Descript')
train_cln.groupBy('Descript') \
         .count() \
         .orderBy(col('count').desc()) \
         .show()

+--------------------+-----+
|            Descript|count|
+--------------------+-----+
|GRAND THEFT FROM ...|60022|
|       LOST PROPERTY|31729|
|             BATTERY|27441|
|   STOLEN AUTOMOBILE|26897|
|DRIVERS LICENSE, ...|26839|
|      WARRANT ARREST|23754|
|SUSPICIOUS OCCURR...|21891|
|AIDED CASE, MENTA...|21497|
|PETTY THEFT FROM ...|19771|
|MALICIOUS MISCHIE...|17789|
|   TRAFFIC VIOLATION|16471|
|PETTY THEFT OF PR...|16196|
|MALICIOUS MISCHIE...|15957|
|THREATS AGAINST LIFE|14716|
|      FOUND PROPERTY|12146|
|ENROUTE TO OUTSID...|11470|
|GRAND THEFT OF PR...|11010|
|POSSESSION OF NAR...|10050|
|PETTY THEFT FROM ...|10029|
|PETTY THEFT SHOPL...| 9571|
+--------------------+-----+
only showing top 20 rows



In [13]:
# Subsampling the data (for the sake of running on this machine)
print(train_cln.count())
train_samp = train_cln.sample(False,0.01, 101)
print(train_samp.count())

878049
8639


In [14]:
# Dataset used for the modeling purposes
train_samp.limit(5).toPandas()

Unnamed: 0,Category,Descript
0,LARCENY/THEFT,PETTY THEFT FROM A BUILDING
1,WARRANTS,ENROUTE TO PAROLE OFFICER
2,LARCENY/THEFT,GRAND THEFT SHOPLIFTING
3,FRAUD,"CREDIT CARD, THEFT BY USE OF"
4,OTHER OFFENSES,TAMPERING WITH A VEHICLE


In [None]:
# Model pipeline includes the following steps
# 1. regexTokenizer (Tokenization with Regular Expressions)
# 2. stopwordsRemover
# 3. countVectors (Count vectors ('document-term vectors'))

In [15]:
from pyspark.ml.feature import RegexTokenizer,StopWordsRemover,CountVectorizer
from pyspark.ml.feature import HashingTF, IDF

# regular expression tokenizer 
regexTokenizer = RegexTokenizer(inputCol = 'Descript', outputCol='words', pattern='\\W')
# splits the sentence into words

# stop words
add_stopwords = ['http','https','amp','rt','t','c','the']

stopwordsRemover = StopWordsRemover(inputCol='words', outputCol='filtered').setStopWords(add_stopwords)

#doc is converted into vector of tokens and counts - this is one way of vectorizing
countVectors = CountVectorizer(inputCol='filtered',outputCol = 'features', vocabSize=10000, minDF=5)

# TF-IDF way of vectorizing (in spark, it is implemented as HashingTF) (statistically popularized vectorization)
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

In [27]:
# Sting Indexing (the most frequent one gets '0')
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer,VectorAssembler
label_stringIdx = StringIndexer(inputCol = 'Category', outputCol='label')
# This is the output

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
#pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

# Fit the pipeline to training documents
pipelineFit = pipeline.fit(train_samp)
train_prepr = pipelineFit.transform(train_samp)
train_prepr.show(5)

+--------------+--------------------+--------------------+--------------------+--------------------+-----+
|      Category|            Descript|               words|            filtered|            features|label|
+--------------+--------------------+--------------------+--------------------+--------------------+-----+
| LARCENY/THEFT|PETTY THEFT FROM ...|[petty, theft, fr...|[petty, theft, fr...|(322,[0,2,7,9,41]...|  0.0|
|      WARRANTS|ENROUTE TO PAROLE...|[enroute, to, par...|[enroute, to, par...|(322,[26,42,102,1...|  7.0|
| LARCENY/THEFT|GRAND THEFT SHOPL...|[grand, theft, sh...|[grand, theft, sh...|(322,[0,3,72],[1....|  0.0|
|         FRAUD|CREDIT CARD, THEF...|[credit, card, th...|[credit, card, th...|(322,[0,1,52,62,6...| 12.0|
|OTHER OFFENSES|TAMPERING WITH A ...|[tampering, with,...|[tampering, with,...|(322,[9,18,25,228...|  1.0|
+--------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [17]:
# Sting Indexing (the most frequent one gets '0')

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

# Fit the pipeline to training documents
pipelineFit = pipeline.fit(train_samp)
train_prepr = pipelineFit.transform(train_samp)
train_prepr.show(5)

+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|      Category|            Descript|               words|            filtered|         rawFeatures|            features|label|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
| LARCENY/THEFT|PETTY THEFT FROM ...|[petty, theft, fr...|[petty, theft, fr...|(10000,[274,3170,...|(10000,[274,3170,...|  0.0|
|      WARRANTS|ENROUTE TO PAROLE...|[enroute, to, par...|[enroute, to, par...|(10000,[1245,3258...|(10000,[1245,3258...|  7.0|
| LARCENY/THEFT|GRAND THEFT SHOPL...|[grand, theft, sh...|[grand, theft, sh...|(10000,[274,713,9...|(10000,[274,713,9...|  0.0|
|         FRAUD|CREDIT CARD, THEF...|[credit, card, th...|[credit, card, th...|(10000,[274,2144,...|(10000,[274,2144,...| 12.0|
|OTHER OFFENSES|TAMPERING WITH A ...|[tampering, with,...|[tampering, with,...|(10000,[1158,2895...|(100

# Though, I showed both the vecotrizations, in the following I use TF-IDF way of vectorized features

In [28]:
# Partition the data into train and tests
(train, test) = train_prepr.randomSplit([0.7,0.3], seed=101)
print('Training Dataset Count:' + str(train.count()))
print('Test Dataset Count:'+ str(test.count()))

Training Dataset Count:6051
Test Dataset Count:2588


# Building a Naive Bayes model

In [29]:
# Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(train)

In [30]:
predictions = model.transform(test)

#predictions.head(3)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show()

+--------------------+-------------+--------------------+-----+----------+
|            Descript|     Category|         probability|label|prediction|
+--------------------+-------------+--------------------+-----+----------+
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.99999999582460...|  0.0|       0.0|
|PETTY THEFT FROM ...|LAR

In [31]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9670678743353475

# Building a Random Forest Model

In [32]:
# Random Forest
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(train)

In [33]:
predictions = rfModel.transform(test)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show()

+--------------------+-------------+--------------------+-----+----------+
|            Descript|     Category|         probability|label|prediction|
+--------------------+-------------+--------------------+-----+----------+
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LARCENY/THEFT|[0.72457857067092...|  0.0|       0.0|
|PETTY THEFT FROM ...|LAR

In [34]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6838730954522798

In [35]:
#Create ParamGrid for Cross Validation

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [50, 100, 200]) # number of trees
             .addGrid(rf.maxDepth, [3, 4, 5]) # maximum depth
#            .addGrid(rf.maxBins, [24, 32, 40]) #Number of bins
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

# Run cross validations
cvModel = cv.fit(train)

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(test)

In [36]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7412958347068506

In [2]:
spark.stop()