In [None]:
from pyspark import SparkContext

In [None]:
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

In [None]:
# Bare expression: display the SparkContext as the cell output.
sc

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the active SparkSession if there is one, otherwise build it.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Bare expression: display the SparkSession as the cell output.
spark

In [None]:
# Directory holding the input CSV files, relative to the notebook's working directory.
data_path = './data'
# Path of the reported-crimes CSV read by the cells below.
file_path = f'{data_path}/reported-crimes.csv'

In [None]:
from pyspark.sql.functions import to_timestamp,col,lit

# Read the CSV at file_path (first row is the header), convert the 'Date'
# strings to timestamps using the pattern below, and keep the rows whose
# Date is <= the literal '2018-11-11'.
crime_date_format = 'MM/dd/yyyy hh:mm:ss a'
raw_crimes = spark.read.csv(file_path, header=True)
rc = (
    raw_crimes
    .withColumn('Date', to_timestamp(col('Date'), crime_date_format))
    .filter(col('Date') <= lit('2018-11-11'))
)

In [None]:
# (column name, type) pairs of rc.
rc.dtypes

In [None]:
# Print rc's schema as a tree.
rc.printSchema()

In [None]:
# Column names of rc, in file order.
rc.columns

In [None]:
import pyspark.sql.types as st

In [None]:
# Explicit schema for the reported-crimes CSV.
#
# When a schema is passed to the CSV reader, fields are matched to file
# columns by position, so every file column has to be listed here in file
# order; compare against `rc.columns` from the header-based load.
#
# NOTE(review): 'Arrest' and 'District' are selected from the header-based
# frame elsewhere in this notebook but were missing from this schema, which
# shifted every later field onto the wrong column. Their positions below
# follow what is presumably the public Chicago crimes export layout --
# confirm against rc.columns.
rc_schema = st.StructType([
    st.StructField('ID', st.StringType(), True),
    st.StructField('Case Number', st.StringType(), True),
    st.StructField('Date', st.TimestampType(), True),
    st.StructField('Block', st.StringType(), True),
    st.StructField('IUCR', st.StringType(), True),
    st.StructField('Primary Type', st.StringType(), True),
    st.StructField('Description', st.StringType(), True),
    st.StructField('Location Description', st.StringType(), True),
    # The notebook compares Arrest to 'true', so it is a true/false flag like Domestic.
    st.StructField('Arrest', st.BooleanType(), True),
    st.StructField('Domestic', st.BooleanType(), True),
    st.StructField('Beat', st.StringType(), True),
    st.StructField('District', st.StringType(), True),
    st.StructField('Ward', st.StringType(), True),
    st.StructField('Community Area', st.StringType(), True),
    st.StructField('FBI Code', st.StringType(), True),
    st.StructField('X Coordinate', st.StringType(), True),
    st.StructField('Y Coordinate', st.StringType(), True),
    st.StructField('Year', st.IntegerType(), True),
    st.StructField('Updated On', st.TimestampType(), True),
    # Coordinates are numeric, not flags: BooleanType would turn every value into null.
    st.StructField('Latitude', st.DoubleType(), True),
    st.StructField('Longitude', st.DoubleType(), True),
    st.StructField('Location', st.StringType(), True)
])

In [None]:
# Load the crimes CSV with the explicit rc_schema and keep the rows whose
# Date is <= the literal '2018-11-11'.
#
# rc_schema declares 'Date' as a timestamp, so the reader itself has to be
# told how the text is laid out. Without timestampFormat the values do not
# match the reader's default pattern, 'Date' comes back null, and the filter
# below removes every row. A later to_timestamp(...) cannot repair that, so
# the conversion is done by the reader instead.
# NOTE(review): timestampFormat applies to every timestamp field, i.e. also
# 'Updated On' -- assumes it uses the same layout; TODO confirm.
rcs = (spark.read
            .csv(file_path, header=True, schema=rc_schema,
                 timestampFormat='MM/dd/yyyy hh:mm:ss a')
            .filter(col('Date') <= lit('2018-11-11')))

In [None]:
# Print the schema of the frame that was loaded with the explicit rc_schema.
rcs.printSchema()

In [None]:
# Inspecting rows: df.take(n), df.collect(), df.show(), df.limit(n), df.head()
# Referring to a single column: df.columnName, df['columnName'], df.select(col('columnName'))

In [None]:
# Listing the column names: df.columns

In [None]:
# Selecting several columns: df.select('columnName1', 'columnName2')
# or, equivalently: df[['columnName1', 'columnName2']]

In [None]:
# Select a column by its name string and preview three rows.
rc.select('IUCR').show(3)

In [None]:
# Same selection, referring to the column through attribute access on the DataFrame.
rc.select(rc.IUCR).show(3)

In [None]:
# Same selection, referring to the column through a col() expression.
rc.select(col('IUCR')).show(3)

In [None]:
# Select several columns by name and preview three rows.
rc.select('Case Number', 'Date', 'Arrest').show(3)

In [None]:
from pyspark.sql.functions import lit

In [None]:
# Add a column 'One' holding the literal 1. The result is not assigned,
# so rc itself is unchanged and only the new DataFrame object is displayed.
rc.withColumn('One', lit(1))

In [None]:
# New frame without the 'IUCR' column; rc keeps all of its columns.
_rc = rc.drop('IUCR')

In [None]:
# Rows of the crimes CSV dated 2018-11-12, loaded separately from rc.
#
# 'Date' is a timestamp after to_timestamp(), so testing it for equality with
# '2018-11-12' can only ever match rows stamped exactly at midnight. A
# half-open range [2018-11-12, 2018-11-13) keeps every row from that day.
aday_inc = (spark.read
            .csv(file_path,header=True)
            .withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))
            .filter((col('Date') >= lit('2018-11-12')) & (col('Date') < lit('2018-11-13'))))

In [None]:
# Number of rows in the extra-day frame.
aday_inc.count()

In [None]:
# Append aday_inc's rows to rc and sort by Date, newest first. The result is
# not assigned and no action (show/count) is called on it here.
rc.union(aday_inc).orderBy('Date', ascending=False)

In [None]:
# Ten most frequent 'Primary Type' values. The column is spelled
# 'Primary Type' everywhere else in this notebook; use that exact spelling so
# the cell does not rely on case-insensitive column resolution and the output
# header matches the real column name.
(rc.groupBy('Primary Type')
    .count()
    .orderBy('count', ascending=False)
    .show(10))

In [None]:
# Distinct values present in the 'Arrest' column.
rc.select('Arrest').distinct().show()

In [None]:
# Share of rows whose 'Arrest' value equals 'true', relative to all rows of rc.
arrested_rows = rc.filter(col('Arrest') == 'true').count()
all_rows = rc.select(col('Arrest')).count()
arrested_rows / all_rows

In [None]:
# Three most frequent 'Location Description' values.
location_counts = rc.groupBy('Location Description').count()
location_counts.orderBy('count', ascending=False).show(3)

In [None]:
import pyspark.sql.functions as fnc

In [None]:
# String helpers applied to 'Primary Type': lower-case, upper-case, and the
# first four characters (substring positions start at 1).
primary_type = col('Primary Type')
rc.select(
    fnc.lower(primary_type),
    fnc.upper(primary_type),
    fnc.substring('Primary Type', 1, 4),
).show(3)

In [None]:
# Smallest and largest 'Date' value in rc.
earliest_date = fnc.min(col('Date'))
latest_date = fnc.max(col('Date'))
rc.select(earliest_date, latest_date).show()

In [None]:
# The date three days before the smallest 'Date' and three days after the largest.
three_days_before_first = fnc.date_sub(fnc.min(col('Date')), 3)
three_days_after_last = fnc.date_add(fnc.max(col('Date')), 3)
rc.select(three_days_before_first, three_days_after_last).show()

In [None]:
# One-row, one-column demo frame holding a 'yyyy-MM-dd HH:mm:ss' style string.
christmas_rows = [('2019-12-25 13:30:00',)]
df = spark.createDataFrame(christmas_rows, ['Christmas'])

In [None]:
# Show the single-row demo frame.
df.show()

In [None]:
# Parse the 'Christmas' string both as a date and as a timestamp, using the
# same explicit pattern for both.
iso_like_format = 'yyyy-MM-dd HH:mm:ss'
df.select(
    fnc.to_date(col('Christmas'), iso_like_format),
    fnc.to_timestamp(col('Christmas'), iso_like_format),
).show()

In [None]:
# Rebind df to a one-row demo frame whose string uses an abbreviated month name.
christmas_rows = [('25/Dec/2019 13:30:00',)]
df = spark.createDataFrame(christmas_rows, ['Christmas'])

In [None]:
# Parse the 'Christmas' string as a date and as a timestamp with a
# day/abbreviated-month/year pattern.
month_name_format = 'dd/MMM/yyyy HH:mm:ss'
df.select(
    fnc.to_date(col('Christmas'), month_name_format),
    fnc.to_timestamp(col('Christmas'), month_name_format),
).show()

In [None]:
# Rebind df to a one-row demo frame whose string uses a 12-hour clock with an AM/PM marker.
christmas_rows = [('12/25/2019 01:30:00 PM',)]
df = spark.createDataFrame(christmas_rows, ['Christmas'])

In [None]:
# Show one row without truncating long cell values.
df.show(1, truncate=False)

In [None]:
# Parse the 'Christmas' string as a date and as a timestamp with the same
# 12-hour AM/PM pattern that is used for the crimes 'Date' column.
am_pm_format = 'MM/dd/yyyy hh:mm:ss a'
df.select(
    fnc.to_date(col('Christmas'), am_pm_format),
    fnc.to_timestamp(col('Christmas'), am_pm_format),
).show()

In [None]:
import pyspark.sql.functions as fn

In [None]:
# Rebind file_path to the police-station CSV and load it, treating the first
# row as the header. Every column is read as a string (no schema is given).
data_path = './data'
file_path = f'{data_path}/police-station.csv'
ps = spark.read.csv(path=file_path, header=True)

In [None]:
# Preview three distinct 'DISTRICT' values from the station frame.
ps.select(fn.col('DISTRICT')).distinct().show(3)

In [None]:
# Point file_path back at the crimes CSV and rebuild rc: header row used for
# column names, 'Date' parsed to a timestamp, rows kept when Date is >= the
# literal '2018-11-11'.
# NOTE(review): this rebinds rc with `>=`, whereas the first load of rc in
# this notebook filters with `<=` on the same literal -- confirm the
# direction is intended.
data_path = './data'
file_path = f'{data_path}/reported-crimes.csv'
crime_timestamp = fn.to_timestamp(fn.col('Date'), 'MM/dd/yyyy hh:mm:ss a')
crimes_with_dates = spark.read.csv(file_path, header=True).withColumn('Date', crime_timestamp)
rc = crimes_with_dates.filter(fn.col('Date') >= fn.lit('2018-11-11'))

In [None]:
# Mark rc for caching, then run a count so the rows are actually computed;
# the count is the cell's output. cache() returns the same DataFrame, so the
# two calls can be chained.
rc.cache().count()

In [None]:
# Preview three distinct 'District' values from the crimes frame.
rc.select(fn.col('District')).distinct().show(3)

In [None]:
# Left-pad 'DISTRICT' with '0' to a width of 3 and preview three rows.
ps.select(fn.lpad(fn.col('DISTRICT'), 3, '0')).show(3)

In [None]:
# Keep the zero-padded district on ps as a new column 'format_district'.
ps = ps.withColumn('format_district', fn.lpad(fn.col('DISTRICT'), 3, '0'))

In [None]:
# Names of columns earmarked for dropping (station-file columns plus the
# derived 'format_district'). Not referenced in the visible part of the notebook.
ps_col_drop = (
    'ADDRESS', 'CITY', 'STATE', 'ZIP',
    'WEBSITE', 'PHONE', 'FAX', 'TTY',
    'X COORDINATE', 'Y COORDINATE',
    'LATITUDE', 'LONGITUDE', 'LOCATION',
    'format_district',
)

In [None]:
# Number of distinct 'Primary Type' values in rc.
rc.select(fn.col('Primary Type')).distinct().count()

In [None]:
# List up to 35 distinct 'Primary Type' values in ascending order, untruncated.
rc.select(fn.col('Primary Type')).distinct().orderBy(fn.col('Primary Type')).show(35, truncate=False)

In [None]:
# Rows of rc whose 'Primary Type' is one of the three spellings of the
# non-criminal label.
non_criminal_labels = [
    'NON-CRIMINAL',
    'NON - CRIMINAL',
    'NON - CRIMINAL (SUBJECT SPECIFIED)',
]
nc = rc.filter(fn.col('Primary Type').isin(non_criminal_labels))

In [None]:
# Check which 'Primary Type' spellings actually occur in the filtered frame nc.
nc.select(fn.col('Primary Type')).distinct().orderBy(fn.col('Primary Type')).show(35, truncate=False)

In [None]:
# Row counts of nc per 'Description', most frequent first, untruncated.
nc.groupBy(fn.col('Description')).count().orderBy('count', ascending=False).show(truncate=False)

In [None]:
# Show 'Date' next to its numeric day of week and its 'E'-pattern weekday text.
crime_date = fn.col('Date')
rc.select(
    crime_date,
    fn.dayofweek(crime_date),
    fn.date_format(crime_date, 'E'),
).show(3)

In [None]:
# Row counts per weekday label ('E' pattern), highest count first.
rc.groupBy(fn.date_format(fn.col('Date'), 'E')).count().orderBy('count', ascending=False).show()

In [None]:
# Same per-weekday counts, without sorting, collected to the driver as a list of Row objects.
rc.groupBy(fn.date_format(fn.col('Date'), 'E')).count().collect()

In [None]:
# Split the per-weekday counts into two parallel lists: labels and counts.
#
# The aggregation is collected once and both lists are built from that single
# result. Running the job separately for each list does the work twice, and
# because the order of grouped rows is not guaranteed to be the same between
# runs, the labels and counts could end up misaligned.
dow_counts = rc.groupBy(fn.date_format(fn.col('Date'), 'E')).count().collect()
dow = [row[0] for row in dow_counts]
cnt = [row[1] for row in dow_counts]

In [None]:
# Weekday labels taken from the collected per-weekday counts.
dow

In [None]:
# Row counts taken from the collected per-weekday counts.
cnt