In [1]:
# Initialise findspark before the following cell imports pyspark.
# NOTE(review): presumably this is what makes the local Spark installation
# importable in this kernel — confirm against the environment setup.
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Build the session for this notebook (getOrCreate reuses one if it already
# exists). The bare name on the last line makes the notebook display it.
spark = (
    SparkSession.builder
    .appName('Study London Crime Rate')
    .getOrCreate()
)
spark

In [5]:
# Directory that holds the input CSV, relative to the notebook.
path = 'Datasets/'

# Read the CSV with a header row and let Spark infer the column types,
# then preview the first 5 rows as a pandas frame.
crime_data = spark.read.csv(
    path + 'london_crime_by_lsoa.csv',
    header=True,
    inferSchema=True,
)
crime_data.limit(5).toPandas()

Unnamed: 0,lsoa_code,borough,major_category,minor_category,value,year,month
0,E01001116,Croydon,Burglary,Burglary in Other Buildings,0,2016,11
1,E01001646,Greenwich,Violence Against the Person,Other violence,0,2016,11
2,E01000677,Bromley,Violence Against the Person,Other violence,0,2015,5
3,E01003774,Redbridge,Burglary,Burglary in Other Buildings,0,2016,3
4,E01004563,Wandsworth,Robbery,Personal Property,0,2008,6


In [6]:
# Show column names and types; the types were inferred at load time (inferSchema=True).
crime_data.printSchema()

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [8]:
# Number of columns in the loaded frame.
len(crime_data.columns)

7

In [10]:
# Remove the lsoa_code column; the rest of the notebook works at borough level.
crime_data = crime_data.drop('lsoa_code')

# Preview the first 5 rows of the remaining columns.
crime_data.show(5)

+----------+--------------------+--------------------+-----+----+-----+
|   borough|      major_category|      minor_category|value|year|month|
+----------+--------------------+--------------------+-----+----+-----+
|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
+----------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [11]:
# Row count before rows containing nulls are dropped in the next cell.
crime_data.count()

13490604

In [12]:
# Discard every row that contains a null in any column.
crime_data = crime_data.na.drop()

In [13]:
# Row count after the null-dropping step; compare with the count taken before it.
crime_data.count()

13490604

In [16]:
# One row per distinct borough name.
borough = crime_data.select(crime_data['borough']).distinct()

In [18]:
# Display the distinct boroughs (show() prints at most the first 20 rows by default).
borough.show()

+--------------------+
|             borough|
+--------------------+
|             Croydon|
|          Wandsworth|
|              Bexley|
|             Lambeth|
|Barking and Dagenham|
|              Camden|
|           Greenwich|
|              Newham|
|       Tower Hamlets|
|            Hounslow|
|              Barnet|
|              Harrow|
|Kensington and Ch...|
|           Islington|
|               Brent|
|            Haringey|
|             Bromley|
|              Merton|
|         Westminster|
|             Hackney|
+--------------------+
only showing top 20 rows



In [19]:
# Confirm that select(...).distinct() yields a Spark DataFrame, not a Python list.
type(borough)

pyspark.sql.dataframe.DataFrame

In [20]:
# Number of distinct boroughs in the data.
borough.count()

33

In [22]:
# Rows whose borough is Hackney, followed by a 5-row preview.
Hackney_data = crime_data.where(crime_data.borough == 'Hackney')
Hackney_data.show(5)

+-------+--------------------+--------------------+-----+----+-----+
|borough|      major_category|      minor_category|value|year|month|
+-------+--------------------+--------------------+-----+----+-----+
|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
+-------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [27]:
# Keep only the 2011 and 2016 rows. `year` is an integer column (see the
# schema printed earlier), so the membership list uses integer literals
# instead of strings and does not rely on Spark's implicit type coercion.
year_data = crime_data.filter(crime_data['year'].isin([2011, 2016]))
year_data.show(5)

+----------+--------------------+--------------------+-----+----+-----+
|   borough|      major_category|      minor_category|value|year|month|
+----------+--------------------+--------------------+-----+----+-----+
|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|    Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
|Wandsworth|Violence Against ...|    Offensive Weapon|    0|2011|   10|
+----------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [28]:
# Group the rows by borough; the following cell counts the rows in each group.
borough_crime = crime_data.groupBy('borough')

In [30]:
# Number of rows (records) per borough — this counts rows, it does not sum `value`.
borough_crime.count().show()

+--------------------+------+
|             borough| count|
+--------------------+------+
|             Croydon|602100|
|          Wandsworth|498636|
|              Bexley|385668|
|             Lambeth|519048|
|Barking and Dagenham|311040|
|              Camden|378432|
|           Greenwich|421200|
|              Newham|471420|
|       Tower Hamlets|412128|
|            Hounslow|395928|
|              Barnet|572832|
|              Harrow|365688|
|Kensington and Ch...|296784|
|           Islington|359208|
|               Brent|490644|
|            Haringey|413856|
|             Bromley|523908|
|              Merton|339876|
|         Westminster|366660|
|             Hackney|417744|
+--------------------+------+
only showing top 20 rows



In [32]:
# Total `value` per borough. The dict-style agg names its result 'sum(value)',
# which is renamed to 'convictions' for the cells that follow.
# NOTE(review): whether `value` really counts convictions is not evident from
# this notebook — confirm against the dataset documentation.
borough_crime_convictions = (
    crime_data
    .groupBy('borough')
    .agg({'value': 'sum'})
    .withColumnRenamed('sum(value)', 'convictions')
)

In [34]:
# Display the per-borough totals (first 20 rows by default).
borough_crime_convictions.show()

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|     260294|
|          Wandsworth|     204741|
|              Bexley|     114136|
|             Lambeth|     292178|
|Barking and Dagenham|     149447|
|              Camden|     275147|
|           Greenwich|     181568|
|              Newham|     262024|
|       Tower Hamlets|     228613|
|            Hounslow|     186772|
|              Barnet|     212191|
|              Harrow|     116848|
|Kensington and Ch...|     171981|
|           Islington|     230286|
|               Brent|     227551|
|            Haringey|     213272|
|             Bromley|     184349|
|              Merton|     115654|
|         Westminster|     455028|
|             Hackney|     217119|
+--------------------+-----------+
only showing top 20 rows



In [37]:
# Collapse the per-borough totals into one grand total (single row, single column).
borough_crime_convictions_count = borough_crime_convictions.agg(
    {'convictions': 'sum'}
)

borough_crime_convictions_count.show()

+----------------+
|sum(convictions)|
+----------------+
|         6447758|
+----------------+



In [38]:
# Bring the grand total back to the driver as a plain Python number,
# reading it from the single result row by column name.
total_convictions = borough_crime_convictions_count.collect()[0]['sum(convictions)']

In [39]:
# Display the grand total used as the denominator for the percentage column.
total_convictions

6447758

In [40]:
import pyspark.sql.functions as func

In [41]:
# Each borough's share of the grand total, as a percentage rounded to 2 decimals.
borough_crime_percentage = borough_crime_convictions.withColumn(
    'percentageConvictions',
    func.round(
        borough_crime_convictions['convictions'] / total_convictions * 100,
        2,
    ),
)

In [45]:
# Check the column types after adding percentageConvictions.
borough_crime_percentage.printSchema()

root
 |-- borough: string (nullable = true)
 |-- convictions: long (nullable = true)
 |-- percentageConvictions: double (nullable = true)



In [46]:
# Top 5 boroughs by share of the total, highest first. The sort column is
# referenced by name (it is the column at position 2).
borough_crime_percentage.orderBy(
    borough_crime_percentage['percentageConvictions'].desc()
).show(5)

+-----------+-----------+---------------------+
|    borough|convictions|percentageConvictions|
+-----------+-----------+---------------------+
|Westminster|     455028|                 7.06|
|    Lambeth|     292178|                 4.53|
|  Southwark|     278809|                 4.32|
|     Camden|     275147|                 4.27|
|     Newham|     262024|                 4.06|
+-----------+-----------+---------------------+
only showing top 5 rows



In [49]:
# Total `value` per calendar month, restricted to 2016.
monthly_convictions = (
    crime_data
    .filter(crime_data['year'] == 2016)
    .groupBy('month')
    .agg({'value': 'sum'})
    .withColumnRenamed('sum(value)', 'convictions')
)

In [50]:
# Preview 5 of the monthly totals; the rows are not sorted at this point.
monthly_convictions.show(5)

+-----+-----------+
|month|convictions|
+-----+-----------+
|   12|      62455|
|    1|      58847|
|    6|      62262|
|    3|      59167|
|    5|      63990|
+-----+-----------+
only showing top 5 rows



In [53]:
# Sum the monthly totals into a single 2016 total (one row, column 'sum(convictions)').
monthly_convictions_total = monthly_convictions.agg({"convictions":"sum"})

In [55]:
# Display the 2016 total.
monthly_convictions_total.show()

+----------------+
|sum(convictions)|
+----------------+
|          736121|
+----------------+



In [58]:
# Bring the 2016 total back to the driver as a plain Python number,
# reading it from the single result row by column name.
monthly_total2016 = monthly_convictions_total.collect()[0]['sum(convictions)']

In [59]:
# Display the 2016 total used as the denominator for the monthly percentages.
monthly_total2016

736121

In [60]:
# Each month's share of the 2016 total, as a percentage rounded to 2 decimals.
monthly_convictions_percentage = monthly_convictions.withColumn(
    'percentageConvictions',
    func.round(
        monthly_convictions['convictions'] / monthly_total2016 * 100,
        2,
    ),
)

In [61]:
# Months ranked by share of the 2016 total, highest first. The sort column is
# referenced by name (it is the column at position 2).
monthly_convictions_percentage.orderBy(
    monthly_convictions_percentage['percentageConvictions'].desc()
).show()

+-----+-----------+---------------------+
|month|convictions|percentageConvictions|
+-----+-----------+---------------------+
|    7|      65519|                  8.9|
|    5|      63990|                 8.69|
|   10|      63405|                 8.61|
|    8|      62666|                 8.51|
|   12|      62455|                 8.48|
|    6|      62262|                 8.46|
|    9|      61412|                 8.34|
|   11|      61064|                  8.3|
|    3|      59167|                 8.04|
|    1|      58847|                 7.99|
|    4|      58637|                 7.97|
|    2|      56697|                  7.7|
+-----+-----------+---------------------+



In [62]:
# Total `value` per major crime category across the whole dataset.
crimes_category = (
    crime_data
    .groupBy('major_category')
    .agg({'value': 'sum'})
    .withColumnRenamed('sum(value)', 'convictions')
)

In [65]:
# Categories ranked by total, largest first.
crimes_category.orderBy(
    crimes_category['convictions'].desc()
).show()

+--------------------+-----------+
|      major_category|convictions|
+--------------------+-----------+
|  Theft and Handling|    2661861|
|Violence Against ...|    1558081|
|            Burglary|     754293|
|     Criminal Damage|     630938|
|               Drugs|     470765|
|             Robbery|     258873|
|Other Notifiable ...|     106349|
|    Fraud or Forgery|       5325|
|     Sexual Offences|       1273|
+--------------------+-----------+



In [70]:
# Single-column frame holding only `year`, summarised in the next cell.
year = crime_data.select('year')

In [71]:
# Summary statistics for `year`; min and max give the time span covered by the data.
year.describe().show()

+-------+------------------+
|summary|              year|
+-------+------------------+
|  count|          13490604|
|   mean|            2012.0|
| stddev|2.5819889931674522|
|    min|              2008|
|    max|              2016|
+-------+------------------+

