# Starting the PySpark Session

In [None]:
#pip install pyspark
from pyspark.sql import SparkSession, SQLContext
# Create (or reuse) the Spark session used by every cell below. The builder
# options set executor memory, driver memory and the memory fraction.
sparkSess = (
    SparkSession.builder
    .appName("Association Rule Mining")
    .config("spark.executor.memory", "50g")
    .config('spark.driver.memory', '50g')
    .config("spark.memory.fraction", 0.9)
    .getOrCreate()
)

# To link GCP bucket to Colab

In [None]:
# One-time Colab setup, left commented out: import the Colab auth helper,
# register the gcsfuse apt repository and its key, install gcsfuse and create
# the local `Data` directory.
# from google.colab import auth
# !echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
# !curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# !apt -qq update
# !apt -qq install gcsfuse
# !mkdir Data
# Mount the `usml-data` bucket onto the local `Data` directory.
# NOTE(review): this presumably requires gcsfuse to be installed and `Data` to
# exist already (the commented steps above) - confirm on a fresh runtime.
!gcsfuse --implicit-dirs usml-data Data

# Data Preprocessing

**Combine all CSV files and load them into a single Spark DataFrame**

In [None]:
# Glob matching every "*-street.csv" file two directory levels below the
# mounted data folder.
dataPath = '/content/Data/Data/*/*/*-street.csv'

# Read all matching files into one DataFrame; the first line of each file is
# treated as the header row and fields are comma-separated.
data = sparkSess.read.csv(
    path=dataPath,
    sep=",",
    header=True,
)

In [None]:
# Keep a handle on the DataFrame exactly as loaded, before the cells below
# drop columns and filter rows from `data`.
dataorg = data

In [None]:
# Columns removed before the analysis.
columns_to_drop = ['Crime ID', 'Latitude', 'Longitude', 'LSOA code', 'Context', 'Reported by']
data = data.drop(*columns_to_drop)

In [None]:
from pyspark.sql.functions import split

# Split the original "Month" value on '-' once and reuse both pieces:
# piece 0 becomes the new 'Year' column, piece 1 replaces 'Month'.
month_parts = split(data['Month'], '-')
data = (
    data
    .withColumn('Year', month_parts.getItem(0))
    .withColumn('Month', month_parts.getItem(1))
)

In [None]:
# Keep only rows that are not 'Anti-social behaviour' and that have an LSOA name.
is_not_asb = data['Crime type'] != 'Anti-social behaviour'
has_lsoa_name = data['LSOA name'].isNotNull()
data = data.filter(is_not_asb & has_lsoa_name)

In [None]:
from pyspark.sql.functions import regexp_replace

# Remove the literal text "On or near" from the location description.
# NOTE(review): the pattern does not include the space after the prefix, so a
# leading space presumably remains in the cleaned value - confirm this is wanted.
data = data.withColumn('Location', regexp_replace('Location', 'On or near', ''))

# Remove every " <three digits><one word character>" occurrence from the LSOA
# name. The pattern is a raw string: in a plain literal "\w" is an invalid
# Python escape sequence (a warning today, an error in future versions). The
# pattern text handed to Spark is unchanged.
data = data.withColumn('LSOA name', regexp_replace('LSOA name', r' [0-9]{3}\w', ''))

In [None]:
from pyspark.sql.functions import array_distinct, array

# Build one transaction per row from the four categorical columns.
# `array_distinct` removes repeated values inside a row: FPGrowth requires the
# items of a transaction to be unique and fails the whole job otherwise.
data = data.withColumn(
    'itemsets',
    array_distinct(
        array(
            data['Falls Within'],
            data['Location'],
            data['Crime type'],
            data['Last outcome category'],
        )
    ),
)

In [None]:
#data.describe().show()

# Exploratory Data Analysis

In [None]:
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from plotnine import ggplot, geoms

**Incidents of Reported Street Crime (Dec 2010 - Feb 2022)**

In [None]:
# Explicit figure/axes so the series is drawn on exactly this wide canvas.
fig, ax = plt.subplots(figsize=(25, 5))

# Number of rows per (Month, Year) pair, collected to pandas for plotting.
crime_over_time = (
    data
    .groupBy(["Month", "Year"])
    .count()
    .toPandas()
)

# Index by (Year, Month) and sort so the line runs in index order.
# NOTE(review): both index levels are strings, so the ordering is
# lexicographic - this assumes zero-padded month values; confirm.
# Selecting "count" explicitly always yields a Series, even for a single row.
crime_time_series = (
    crime_over_time
    .set_index(["Year", "Month"])
    .sort_index()["count"]
)

plot = crime_time_series.plot(
    ax=ax,
    kind="line",
    color="b",
    title="Incidents of Reported Street Crime (Dec 2010 - Feb 2022)",
)
# One tick per (Year, Month) entry, labelled with the index tuple.
plot.set_xticks(range(len(crime_time_series.index)))
plot.set_xticklabels(list(crime_time_series.index), rotation=90)
plt.show()

**Top 20 total Reported Crime Type and Outcome combination**

In [None]:
plt.figure(figsize=(10, 10))

# Count rows per (crime type, last outcome) pair, largest first.
# Sorting by column name avoids depending on an `F` alias for
# pyspark.sql.functions, which this notebook never imports.
outcome_counts = (
    data
    .groupBy(["Crime type", "Last outcome category"])
    .count()
    .sort("count", ascending=False)
    .toPandas()
)

# Total over every combination, computed once rather than once per row.
total_reported = outcome_counts["count"].sum()

# Share of the total (in percent) for the 20 most frequent combinations.
outcome_counts_series = (
    outcome_counts
    .set_index(["Crime type", "Last outcome category"])["count"]
    .mul(100)
    .div(total_reported)
    .head(20)
)

# Human-readable "crime type -> outcome" labels for the y axis.
index = [str(x) + " -> " + str(y) for x, y in outcome_counts_series.index]
plot = sns.barplot(x=outcome_counts_series.values, y=index, color='b')

plot.set_title('% Total Reported Crime Type and Outcome combination (Dec 2010 - Feb 2022)')
plot.set_xlabel('% of Reported Crimes')
plot.set_ylabel('Crime Type -> Outcome')

plt.show()

# Fitting the FP-Growth Algorithm and Extracting Rules

**Generating List of Itemsets**

In [None]:
# One row per distinct itemset: keep only the 'itemsets' column and drop
# repeated transactions.
itemsets = data.select('itemsets').distinct()

In [None]:
from pyspark.ml.fpm import FPGrowth

# Mining thresholds and the transaction column, gathered in one place.
fpgrowth_params = {
    'itemsCol': 'itemsets',
    'minSupport': 0.001,
    'minConfidence': 0.02,
}
fpgrowth = FPGrowth(**fpgrowth_params)

# Fit on the de-duplicated transactions.
model = fpgrowth.fit(itemsets)

In [None]:
# Rules derived by the fitted FP-growth model; the cells below read its
# 'antecedent', 'consequent', 'support', 'confidence' and 'lift' columns.
association_rules = model.associationRules

In [None]:
# association_rules.count()

In [None]:
# Keep rules with confidence above 0.2 and lift above 1, then collect them
# into a pandas DataFrame.
confident_enough = association_rules['confidence'] > 0.2
positively_associated = association_rules['lift'] > 1
rules = association_rules.filter(confident_enough & positively_associated).toPandas()

In [None]:
# Display the rules ordered from highest to lowest confidence. The sorted
# frame is only shown as the cell output; `rules` itself is not reassigned.
rules.sort_values(by = 'confidence',ascending=False)

In [None]:
# Frequent itemsets found by the model, collected to pandas; the interest
# calculation below reads its 'items' and 'freq' columns.
frequentItems = model.freqItemsets.toPandas()

**Calculating Interest**

In [None]:
# Number of transactions the model was fitted on; used to turn a frequency
# into a relative support.
itemsetsCount = itemsets.count()

# Convert every frequent itemset to a set once, instead of re-scanning the
# pandas frame row by row for each rule.
frequent_sets = [
    (frozenset(items), freq)
    for items, freq in zip(frequentItems['items'], frequentItems['freq'])
]

# One entry per rule:
# [antecedent, consequent, support, confidence, lift, interest]
# where interest = confidence - (highest frequency among the frequent itemsets
# that contain the whole consequent) / itemsetsCount.
mainList = []
for _, rule in rules.iterrows():
    consequent_items = frozenset(rule['consequent'])
    containing_freqs = [
        freq for items, freq in frequent_sets if consequent_items <= items
    ]
    if not containing_freqs:
        # Same exception type an empty max() would raise, with a usable message.
        raise ValueError(
            "no frequent itemset contains consequent %r" % (list(rule['consequent']),)
        )
    consequent_support = max(containing_freqs) / itemsetsCount
    mainList.append([
        rule['antecedent'],
        rule['consequent'],
        rule['support'],
        rule['confidence'],
        rule['lift'],
        rule['confidence'] - consequent_support,
    ])

# Visualizing the Rules

In [None]:
import pandas as pd
# Column order matches the order of the values stored in each `mainList` entry.
rule_columns = ['antecedent', 'consequent', 'support', 'confidence', 'lift', 'interest']
df = pd.DataFrame(data=mainList, columns=rule_columns)

In [None]:
# Order the rules from highest to lowest confidence. The rows keep their
# original index labels, so position-based access afterwards needs `.iloc`.
# The commented-out suffix would write the sorted rules to 'rules.csv'.
df = df.sort_values(by = 'confidence',ascending=False)#.to_csv('rules.csv')

In [None]:
import numpy as np
import networkx as nx  
# Empty directed graph; the cells below add the rule nodes, item nodes and edges.
graph_1 = nx.DiGraph()
# Node colours, filled in after the graph has been built.
cm = []
# Number of random colour values to pre-generate (one is used per rule).
N = 50
# Seeded generator so the colours are identical on every Restart & Run All.
clrs = np.random.default_rng(42).random(N)
# Names of the rule nodes: 'R0' ... 'R11'.
strs = ["R" + str(i) for i in range(12)]

In [None]:
# Add the first rules of `df` (in its current row order) to the graph:
# every antecedent item points to the rule node, and the rule node points to
# every consequent item.
# Rows are taken by position with `.iloc`, because `df` was sorted without
# resetting its index and label lookup (`df[col][i]`) would return the rows
# whose original labels are 0..11 rather than the first rows of the sorted frame.
# The count is bounded so that fewer rules than rule names does not raise.
n_rules = min(len(strs), len(df))
for i in range(n_rules):
    rule_node = "R" + str(i)
    rule = df.iloc[i]
    graph_1.add_node(rule_node)
    for a in rule['antecedent']:
        # add_edge also creates the item node when it is not present yet.
        graph_1.add_edge(a, rule_node, color=clrs[i], weight=2)
    for c in rule['consequent']:
        graph_1.add_edge(rule_node, c, color=clrs[i], weight=2)

In [None]:
# One colour per node, in graph iteration order: rule nodes (names listed in
# `strs`) are yellow, item nodes are green.
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of the colours and break the node/colour alignment.
cm = ['yellow' if node in strs else 'green' for node in graph_1]

In [None]:
edges = graph_1.edges()
# Per-edge colour and width, read back from the attributes stored on each edge.
# A separate name is used so the per-rule colour array `clrs` is not overwritten.
edge_colors = [graph_1[u][v]['color'] for u, v in edges]
weights = [graph_1[u][v]['weight'] for u, v in edges]

plt.figure(figsize=(8, 8))
# Fixed seed so the force-directed layout is the same on every run.
pos = nx.spring_layout(graph_1, k=16, scale=1, seed=42)
# Labels are drawn separately, hence with_labels=False here.
nx.draw(
    graph_1,
    pos,
    edgelist=edges,
    node_color=cm,
    edge_color=edge_colors,
    width=weights,
    font_size=8,
    with_labels=False,
)

In [None]:
# Place each label 0.1 above its node. The shifted coordinates go into a new
# dict so `pos` is left untouched and re-running this cell does not move the
# labels further up each time.
label_pos = {node: (xy[0], xy[1] + 0.1) for node, xy in pos.items()}

# NOTE(review): with an inline backend the figure drawn in the previous cell is
# presumably already rendered and closed, so these labels may appear on a new,
# empty figure - confirm, and merge the two cells if so.
nx.draw_networkx_labels(graph_1, label_pos, font_size=8)
plt.show()