In [1]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext

In [2]:
import gensim

# Load Google's pre-trained Word2Vec model (binary word2vec format).
model = gensim.models.KeyedVectors.load_word2vec_format(
    'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [3]:
# Sanity check: print the length of the embedding vector stored for 'cat'.
print (len(model['cat'].tolist()))

In [4]:
import re
def split_and_merge_vector(words,model):
  """Average the embedding vectors of the tokens found in ``words``.

  ``words`` is stripped and split on spaces, commas and slashes; empty
  tokens are discarded.  Every remaining token is looked up as
  ``model[token]`` and tokens whose lookup raises ``KeyError`` are skipped.

  Returns a list with the element-wise mean of the vectors that were found,
  or ``[]`` when no token could be looked up.
  """
  tokens = [tok for tok in re.split(' |,|/', words.strip()) if tok]
  found = []
  for tok in tokens:
    try:
      found.append(model[tok].tolist())
    except KeyError:
      # Token is not known to the model: leave it out of the average.
      pass
  if not found:
    return []
  total = len(found)
  return [sum(column)/total for column in zip(*found)]
# split_and_merge_vector(" cat catB cat ",model)

In [5]:
# Read the police incident report CSV (header row, inferred schema) and cache
# the resulting DataFrame so later cells can reuse it.
crimeDF = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')
    .cache()
)

In [6]:
# Keep only the category and description columns, ordered by category.
crimeDF_new = (
    crimeDF
    .sort("Category")
    .select('Category', 'Descript')
)

In [7]:
# Number of incidents per category, largest first.
Category_count = (
    crimeDF_new
    .groupBy("Category")
    .count()
    .sort('count', ascending=False)
    .cache()
)
# Categories with at least 100000 recorded incidents.
Important_Categories = Category_count.filter("`count` >= 100000").cache()

In [8]:
# Display the categories that passed the count >= 100000 filter.
Important_Categories.show()

In [9]:
import unicodedata
def unicodeToString(uw):
  """Return ``uw`` reduced to ASCII.

  The text is NFKD-normalised and then encoded as ASCII with the 'ignore'
  error handler, so characters that cannot be encoded are dropped.  The
  return value is whatever ``encode`` yields (a byte string).
  """
  decomposed = unicodedata.normalize('NFKD', uw)
  return decomposed.encode('ascii','ignore')

In [10]:
def category_des_lst(category):
  """Return the distinct descriptions recorded for one crime category.

  Filters ``crimeDF_new`` to the rows whose ``Category`` equals ``category``,
  keeps the distinct ``Descript`` values, sorts them, and returns them as a
  list after passing each through ``unicodeToString``.

  The previous version referenced an undefined ``assault_df`` instead of the
  frame it had just filtered, so it failed on a fresh kernel (or reused
  stale data for every category).
  """
  descriptions = (
      crimeDF_new
      .filter(crimeDF_new.Category == category)
      .select('Descript')
      .distinct()
      # Sort last so the ordering is not lost to the shuffle that
      # distinct() can trigger.
      .sort('Descript')
  )
  return [unicodeToString(row.Descript) for row in descriptions.collect()]
# Example: use the ASSAULT category to get its list of descriptions.
des_assault_lst = category_des_lst('ASSAULT')

In [11]:
def featureVectorForCategory(des_lst):
  """Return one averaged embedding vector per description in ``des_lst``.

  Each description is passed to ``split_and_merge_vector`` together with the
  notebook-level ``model``; results are returned in input order.

  NOTE: a later cell defines another ``featureVectorForCategory`` that takes
  an extra ``unique_word_lst`` argument and replaces this one once it runs.
  """
  return [split_and_merge_vector(words, model) for words in des_lst]
# Example: build the feature vectors for the ASSAULT descriptions.
feature_vecs_assualt = featureVectorForCategory(des_assault_lst)

In [12]:
from sklearn.cluster import KMeans
import numpy as np
def Kmean_cluster(num_clusters,word_vectors):
  """Cluster ``word_vectors`` with K-means and return the per-row labels.

  ``word_vectors`` is converted with ``np.array`` and fitted with
  ``KMeans(n_clusters=num_clusters, random_state=0)``; the fitted
  estimator's ``labels_`` attribute is returned.

  NOTE(review): ``split_and_merge_vector`` returns ``[]`` for a description
  with no known token; such a row would make the input ragged -- confirm
  callers never pass one.
  """
  fitted = KMeans(n_clusters=num_clusters, random_state=0).fit(np.array(word_vectors))
  return fitted.labels_
def mergeLabelWithDescription(labels,des_lst):
  """Pair each label with its description and return them as a sorted RDD.

  ``labels`` and ``des_lst`` are zipped position by position into
  ``(label, description)`` tuples, distributed with ``sc.parallelize`` and
  sorted by the label (the first tuple element).
  """
  labelled = sc.parallelize(zip(labels, des_lst))
  return labelled.sortBy(lambda pair: pair[0])
  

In [13]:
# Example: cluster the ASSAULT descriptions into 4 groups using the averaged
# embedding features, then show the descriptions grouped by cluster label.
assault_labels = Kmean_cluster(4,feature_vecs_assualt)
assault_rdd = mergeLabelWithDescription(assault_labels,des_assault_lst)
assault_rdd.groupByKey().mapValues(list).collect()

In [14]:
import re
def flatternAndCreateDict(des_lst):
  """Build the vocabulary of distinct tokens used across ``des_lst``.

  Each description is split on spaces, commas and slashes.  Empty tokens
  (produced by leading/trailing or repeated separators) are dropped, which
  matches how ``split_and_merge_vector`` tokenises descriptions, so the
  vocabulary no longer carries a '' entry that can never be matched.

  Returns the unique tokens as a sorted list, so the position of each token
  (and therefore each feature column derived from it) is reproducible
  between runs instead of following arbitrary set order.
  """
  vocabulary = set()
  for words in des_lst:
    vocabulary.update(tok for tok in re.split(' |,|/', words) if tok)
  return sorted(vocabulary)
# Example: build the vocabulary for the ASSAULT descriptions and print it.
# NOTE(review): the bare len(...) below is not the last expression of the
# cell and is not printed, so it has no visible effect.
unique_word_lst = flatternAndCreateDict(des_assault_lst)
len(unique_word_lst)
print(unique_word_lst)

In [15]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

In [16]:
def removeStopWords(unique_word_lst):
  """Return the entries of ``unique_word_lst`` that are not English stop words.

  The comparison is case-insensitive (each word is lower-cased before the
  lookup) but the surviving words are returned unchanged and in their
  original order.
  """
  english_stops = set(stopwords.words('english'))
  return [word for word in unique_word_lst if word.lower() not in english_stops]
# Drop English stop words from the vocabulary (rebinding the same name) and
# print how many entries remain.
# print(removeStopWords(['W']))
unique_word_lst = removeStopWords(unique_word_lst)
print(len(unique_word_lst))

In [17]:
def createDictVector(word_list,unique_word_lst,weight=2):
  """Return a bag-of-words indicator vector for ``word_list``.

  The result has one slot per entry of ``unique_word_lst``.  A slot holds
  ``weight`` when the corresponding vocabulary word occurs in ``word_list``
  and 0 otherwise.

  Words that are not in the vocabulary are ignored; previously they made
  ``list.index`` raise ``ValueError``.  If the vocabulary contains a word
  more than once, only its first position is set, as before.

  ``weight`` defaults to 2, the value that used to be hard-coded.
  """
  # Build the word -> first position map once instead of scanning the
  # vocabulary with list.index() for every word.
  first_position = {}
  for position, term in enumerate(unique_word_lst):
    first_position.setdefault(term, position)
  vector = [0] * len(unique_word_lst)
  for word in word_list:
    position = first_position.get(word)
    if position is not None:
      vector[position] = weight
  return vector

In [18]:
import re
def split_and_merge_vector2(words,model,unique_word_lst):
  """Return the averaged embedding of ``words`` followed by its word indicators.

  ``words`` is stripped and split on spaces, commas and slashes; empty
  tokens are dropped.  Tokens whose ``model[token]`` lookup raises
  ``KeyError`` are skipped and the remaining vectors are averaged
  element-wise.  The vector returned by
  ``createDictVector(removeStopWords(tokens), unique_word_lst)`` is then
  appended (as floats) to that average.

  Returns ``[]`` when no token is known to the model.

  Compared with the previous version, the leftover debug ``print`` is gone
  and the indicator vector is built once per description instead of once
  per token.  The numeric result is unchanged: every per-token vector
  carried the same indicator suffix, so averaging it gave the suffix back.
  """
  tokens = [tok for tok in re.split(' |,|/', words.strip()) if tok]
  embeddings = []
  for tok in tokens:
    try:
      embeddings.append(model[tok].tolist())
    except KeyError:
      # Token is not known to the model: leave it out of the average.
      continue
  if not embeddings:
    return []
  count = len(embeddings)
  mean_embedding = [sum(column)/count for column in zip(*embeddings)]
  # Built only when there is an embedding to attach it to, as before.
  indicator = createDictVector(removeStopWords(tokens), unique_word_lst)
  return mean_embedding + [float(flag) for flag in indicator]
# split_and_merge_vector2(" cat catB cat ",model,["cat","catB","dog"])[302]

In [19]:
def featureVectorForCategory(des_lst,unique_word_lst=None):
  """Return one feature vector per description in ``des_lst``.

  With ``unique_word_lst`` given, each description is converted by
  ``split_and_merge_vector2`` (averaged embedding plus word indicators for
  that vocabulary).  With ``unique_word_lst`` omitted, each description is
  converted by ``split_and_merge_vector`` (averaged embedding only).

  This definition replaces an earlier one-argument function of the same
  name; making the vocabulary optional keeps the earlier call form working
  after this cell has run.  The notebook-level ``model`` is used for the
  lookups and results are returned in input order.
  """
  if unique_word_lst is None:
    return [split_and_merge_vector(words, model) for words in des_lst]
  return [split_and_merge_vector2(words, model, unique_word_lst) for words in des_lst]
# Example: build the combined feature vectors for the ASSAULT descriptions
# and print the length of the first one.
feature_vecs_assualt2 = featureVectorForCategory(des_assault_lst,unique_word_lst)
print(len(feature_vecs_assualt2[0]))

In [20]:
# Example: cluster the ASSAULT descriptions into 5 groups using the combined
# feature vectors, then show the descriptions grouped by cluster label.
# This rebinds assault_labels and assault_rdd from the earlier example.
assault_labels = Kmean_cluster(5,feature_vecs_assualt2)
assault_rdd = mergeLabelWithDescription(assault_labels,des_assault_lst)
assault_rdd.groupByKey().mapValues(list).collect()