In [1]:
#### Setting up Spark for colab space, code provided by Professor Othman

import os
#Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Set JAVA_HOME path variable in Linux
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version


#Install Spark
#download Spark file
!wget -q http://apache.osuosl.org/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
#extract the file
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
#Set SPARK-HOME path variable in Linux
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"


#install findspark package
!pip install -q findspark

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)


In [2]:
# findspark.init() makes the pyspark package importable in this kernel
# (it looks for the Spark install, e.g. through the SPARK_HOME variable set
# in the setup cell).
import findspark
findspark.init()

In [3]:
# create entry points to spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

# Stop the context left over from a previous run of this cell, if there is
# one, before a new context is created below.
try:
    sc.stop()
except NameError:
    # First run on a fresh kernel: `sc` does not exist yet, nothing to stop.
    pass
except Exception as exc:
    # Stopping is best effort, so report the problem and carry on instead of
    # hiding it. KeyboardInterrupt is deliberately not caught here.
    print("could not stop the previous SparkContext: {!r}".format(exc))

# Local master using all available cores.
conf = SparkConf().setAppName("lecture10").setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SparkSession(sparkContext=sc)
sqlContext = SQLContext(sc)

In [4]:
# Echo the SparkSession as the cell output to confirm it was created.
spark

In [5]:
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import substring, lit, desc, col
import pyspark.sql.functions as F

In [6]:
def _read_csv_skipping_bad_lines(url):
    """Read one CSV into pandas, skipping malformed rows instead of failing.

    Newer pandas spells this option ``on_bad_lines``; older releases only know
    ``error_bad_lines=False``. The new spelling is tried first and the old one
    is used when pandas rejects the keyword, so the reader works on both.
    """
    try:
        return pd.read_csv(url, on_bad_lines="warn")
    except TypeError:
        # Older pandas: unknown keyword -> fall back to the legacy option.
        return pd.read_csv(url, error_bad_lines=False)


def read_nibrs_csv(firstyear,lastyear,state_name,state_abbr, csv, my_schema):
    """Download one NIBRS table for one state over a range of years.

    For every year in ``firstyear..lastyear`` (inclusive) the file
    ``<url_base><state_name>/<state_abbr>-<year>/<csv>.csv`` is read with
    pandas. If that fails, the upper-cased file name is tried and its column
    names are lower-cased. Years for which neither file can be read are
    skipped. Each yearly frame gets a ``state`` and a ``year`` column, the
    frames are concatenated, and the result is converted to a Spark DataFrame
    with the module-level ``spark`` session.

    Parameters:
        firstyear, lastyear: first and last year to fetch (both inclusive).
        state_name: state directory name used in the URL (e.g. "alabama").
        state_abbr: state abbreviation used in the URL and stored in the
            ``state`` column (e.g. "AL").
        csv: table/file name without the ".csv" extension.
        my_schema: schema passed to ``spark.createDataFrame``.
            NOTE(review): the pandas frame is handed over as-is, so the schema
            presumably has to line up with the concatenated columns - confirm.

    Returns:
        A Spark DataFrame holding all years that could be read.

    Raises:
        ValueError: if no file could be read for any requested year.
    """
    url_base = "https://raw.githubusercontent.com/roched1atwit/CS3800_4050/master/data/"

    print(("reading in data for " + state_name + " - " + csv + "..."))

    df_list = []
    for year in range(firstyear, lastyear + 1):
        url = url_base + state_name + '/' + state_abbr + '-' + str(year)

        # `except Exception` (not a bare except) so that Ctrl-C / kernel
        # interrupt still stops this long-running download loop.
        try:
            temp_df = _read_csv_skipping_bad_lines(url + '/' + csv + '.csv')
        except Exception:
            # there was a format change in 2016 where NIBRS_data became capitalized
            try:
                temp_df = _read_csv_skipping_bad_lines(url + '/' + csv.upper() + '.csv')
                temp_df.columns = temp_df.columns.str.lower()
            except Exception:
                # No usable file for this state/year under either name.
                continue

        temp_df['state'] = state_abbr
        temp_df['year'] = year
        df_list.append(temp_df)

    if not df_list:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            "no '{}' data could be read for {} ({}) between {} and {}".format(
                csv, state_name, state_abbr, firstyear, lastyear))

    full_df = pd.concat(df_list)

    # The schemas in this notebook declare ethnicity_id as DoubleType, so cast
    # it to float - but only for tables that actually have the column, so the
    # function also works for tables without it.
    if 'ethnicity_id' in full_df.columns:
        full_df = full_df.astype({'ethnicity_id': 'float64'})

    return spark.createDataFrame(full_df, schema=my_schema)
            


In [7]:
# Spark schema for the circumstances table. Every field is nullable; `state`
# and `year` are the two columns read_nibrs_csv adds to each file it reads.
circumstances_schema = StructType([
    StructField("circumstances_id", IntegerType(), True),
    StructField("circumstances_type", StringType(), True),
    StructField("circumstances_code", IntegerType(), True),
    StructField("circumstances_name", StringType(), True),
    StructField("state", StringType(), True),
    StructField("year", IntegerType(), True),
])

In [8]:
# Spark schema for the arrestee table, written as (name, type) pairs; every
# field is nullable. `state` and `year` are the columns read_nibrs_csv adds
# to each file it reads.
arrestee_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("arrestee_id", IntegerType()),
        ("incident_id", IntegerType()),
        ("arrestee_seq_num", IntegerType()),
        ("arrest_num", StringType()),
        ("arrest_date", StringType()),
        ("arrest_type_id", IntegerType()),
        ("multiple_indicator", StringType()),
        ("offense_type_id", IntegerType()),
        ("age_id", IntegerType()),
        ("age_num", DoubleType()),
        ("sex_code", StringType()),
        ("race_id", IntegerType()),
        ("ethnicity_id", DoubleType()),
        ("resident_code", StringType()),
        ("under_18_disposition_code", StringType()),
        ("clearance_ind", DoubleType()),
        ("ff_line_number", DoubleType()),
        ("age_range_low_num", DoubleType()),
        ("age_range_high_num", DoubleType()),
        ("state", StringType()),
        ("year", IntegerType()),
        ("data_year", DoubleType()),
    ]
])

In [9]:
# NOTE: Uncomment additional readings of states when running locally

#arrestee = read_nibrs_csv(1991,2018,'alabama','AL','nibrs_arrestee', arrestee_schema)
#arrestee = arrestee.union(read_nibrs_csv(1991,2018,'arizona', 'AZ', 'nibrs_arrestee', arrestee_schema))
#arrestee = arrestee.union(read_nibrs_csv(1991,2018,'arkansas', 'AR', 'nibrs_arrestee', arrestee_schema))
#arrestee = arrestee.union(read_nibrs_csv(1991,2018,'colorado', 'CO', 'nibrs_arrestee', arrestee_schema))
#arrestee = arrestee.union(read_nibrs_csv(1991,2018,'connecticut', 'CT', 'nibrs_arrestee', arrestee_schema))

In [10]:
#arrestee.show()

In [11]:
# Hardcoded settings telling the program which states, years, and table are
# needed from the NIBRS data.
#
# The state list can be made longer when running locally; on Google Colab it
# is better to do one state at a time.

# state_list and state_abbr are parallel lists: entry i of one belongs to
# entry i of the other.
state_list = [
    "alabama",
    "arizona",
    "arkansas",
    "colorado",
    "connecticut",
    "massachusetts",
    "texas",
]
state_abbr = [
    "AL",
    "AZ",
    "AR",
    "CO",
    "CT",
    "MA",
    "TX",
]

# Table (file name without extension) and inclusive year range to download.
csv = "nibrs_arrestee"
firstyear, lastyear = 1991, 2018

In [13]:

# For every configured state: load the arrestee table, count arrests per
# offense type / year / state, attach the yearly total for that state, and
# collect the result as a pandas frame. The frames are combined into
# `all_offenses` once, after the loop.
per_state_offenses = []
for state_name, abbr in zip(state_list, state_abbr):

    arrestee = read_nibrs_csv(firstyear, lastyear, state_name, abbr, csv, arrestee_schema)
    res = arrestee.select("offense_type_id", "year", "state")

    # get the count of each type of crime per state per year
    off_types = (
        res.groupby("offense_type_id", "year", "state")
        .count()
        .withColumnRenamed("count", "num_arrests")
    )

    # get the total number of arrests per year, per state
    num_arrests = (
        res.groupby("year", "state")
        .count()
        .withColumnRenamed("count", "total_arrests_that_year")
    )

    # Joining on column *names* keeps a single `state`/`year` column, so no
    # duplicate columns have to be stripped from the pandas frame afterwards.
    # The sort runs after the join because a sort done before a join is not
    # guaranteed to survive it.
    off_types = (
        off_types.join(num_arrests, on=["state", "year"], how="inner")
        .withColumn("perc_tot_arrests", col("num_arrests") / col("total_arrests_that_year"))
        .select("offense_type_id", "year", "state",
                "num_arrests", "total_arrests_that_year", "perc_tot_arrests")
        .orderBy(desc("num_arrests"))
    )
    offenses = off_types.toPandas()

    # google colab cannot handle large datasets, so release Spark-side data
    # before the next state is loaded.
    # NOTE(review): nothing above is explicitly cached, so these calls are
    # presumably no-ops - confirm before relying on them to free memory.
    arrestee.unpersist(True)
    res.unpersist(True)
    num_arrests.unpersist(True)
    off_types.unpersist(True)

    # `offenses` is a pandas DataFrame: it has no unpersist(), so it is only
    # collected here.
    per_state_offenses.append(offenses)

# One concat at the end instead of re-copying the growing frame every iteration.
all_offenses = pd.concat(per_state_offenses)

reading in data for alabama - nibrs_arrestee...
reading in data for arizona - nibrs_arrestee...
reading in data for arkansas - nibrs_arrestee...
reading in data for colorado - nibrs_arrestee...
reading in data for connecticut - nibrs_arrestee...
reading in data for massachusetts - nibrs_arrestee...


KeyboardInterrupt: ignored

In [None]:
# Display the combined offense table produced by the per-state loop.
all_offenses

In [None]:
# Narrow the combined table to four columns, then cut one frame per offense
# type of interest.
# NOTE(review): this selection drops `num_arrests` and `perc_tot_arrests`, so
# the per-offense frames only carry the yearly total - confirm that is intended.
all_offenses = all_offenses[['offense_type_id','year','state','total_arrests_that_year']]

# Offense type ids 23 / 32 / 27 are presumably shoplifting / murder /
# aggravated assault, going by the variable names - verify against the NIBRS
# offense type lookup table.
shoplfiting_by_state = all_offenses[all_offenses['offense_type_id'].eq(23)]
murder_by_state = all_offenses[all_offenses['offense_type_id'].eq(32)]
a_assault_by_state = all_offenses[all_offenses['offense_type_id'].eq(27)]

shoplfiting_by_state

In [None]:
# Spark schema for the victim circumstances table, written as (name, type)
# pairs; every field is nullable.
victim_circumstances_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("victim_id", IntegerType()),
        ("circumstances_id", IntegerType()),
        ("justifiable_force_id", DoubleType()),
        ("state", StringType()),
        ("data_year", DoubleType()),
    ]
])

In [None]:
# Spark schema for the victim table; every field is nullable. Only five
# fields are active - the remaining ones are kept below as comments so they
# can be switched back on.
# NOTE(review): read_nibrs_csv passes the whole pandas frame to
# createDataFrame together with the schema it is given - confirm that a
# reduced field list like this one still lines up with the CSV columns.
victim_schema = StructType([
    StructField("victim_id", IntegerType(), True),
    StructField("incident_id", IntegerType(), True),
    # StructField("victim_seq_num", IntegerType(), True),
    # StructField("victim_type_id", IntegerType(), True),
    # StructField("assignment_type_id", DoubleType(), True),
    # StructField("activity_type_id", DoubleType(), True),
    # StructField("outside_agency_id", DoubleType(), True),
    # StructField("age_id", DoubleType(), True),
    # StructField("age_num", DoubleType(), True),
    # StructField("sex_code", StringType(), True),
    StructField("race_id", DoubleType(), True),
    StructField("ethnicity_id", DoubleType(), True),
    # StructField("resident_status_code", StringType(), True),
    # StructField("agency_data_year", DoubleType(), True),
    # StructField("ff_line_number", DoubleType(), True),
    # StructField("age_range_low_num", DoubleType(), True),
    # StructField("age_range_high_num", DoubleType(), True),
    StructField("state", StringType(), True),
    # StructField("data_year", DoubleType(), True),
])

In [None]:
# Spark schema for the race lookup table; both active fields are nullable.
# The disabled fields are kept as comments so they can be switched back on.
races_schema = StructType([
    StructField("race_id", IntegerType(), True),
    # StructField("race_code", StringType(), True),
    StructField("race_desc", StringType(), True),
    # StructField("start_year", DoubleType(), True),
    # StructField("end_year", DoubleType(), True),
    # StructField("notes", StringType(), True),
    # StructField("state", StringType(), True),
])