## MODELING

In [1]:
import os
import sys
import numpy
import pandas
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from pymining import itemmining, assocrules
from fp_growth import find_frequent_itemsets as get_freq_itemset

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'ggplot' style sheet for every plot.
plt.style.use('ggplot')
# Raise the pandas display limits to 10000 rows and 100 columns.
pandas.set_option('display.max_rows', 10000, "display.max_columns", 100)

In [2]:
def to_tuple(trans_list):
    """Convert an iterable of transactions into a tuple of tuples."""
    return tuple(tuple(trans) for trans in trans_list)
# percentage = lambda x, tot=len(transactions): (x * 1.0) / tot

def only_rules_with(columns, dataframe, rules, only_cons=False):
    """Keep the rules that mention at least one value from every given column.

    columns   -- column names of `dataframe` whose values must appear in a rule
    dataframe -- frame the column values are read from
    rules     -- iterable of rules; rule[0] is the antecedent set and rule[1]
                 the consequent set
    only_cons -- when True only the consequent is inspected, otherwise the
                 union of antecedent and consequent is

    Returns a new list holding the matching rules in their original order.
    """
    value_sets = [set(dataframe[name].values) for name in columns]

    def mentions_every_column(rule):
        # Items of the rule that are compared against each column's values.
        items = rule[1] if only_cons else rule[0] | rule[1]
        return all(items & values for values in value_sets)

    return [rule for rule in rules if mentions_every_column(rule)]

def print_that(filepath, rules):
    """Write every rule to `filepath` as formatted by `rule_to_string`.

    The file is opened in 'w+' mode, so any previous content is discarded.
    """
    with open(filepath, 'w+') as out:
        out.writelines(rule_to_string(rule) for rule in rules)
        
def rule_to_string(rule):
    """Format a rule as one newline-terminated line of text.

    rule[0] and rule[1] are rendered as lists (antecedent and consequent),
    rule[2] is labelled "supp" and rule[3] is labelled "conf".
    """
    antecedent = str(list(rule[0]))
    consequent = str(list(rule[1]))
    pieces = [
        antecedent, "  ->  ", consequent,
        ", supp: ", str(rule[2]),
        ", conf: ", str(rule[3]),
        "\n",
    ]
    return "".join(pieces)

### Sampled Dataset

In [3]:
# Load the sampled dataset: tab-separated file, first column used as the index.
df = pandas.read_csv("./dataset/crimes_census_5poi_sampled100.csv", index_col=0, sep="\t")
# Show (rows, columns) as the cell output.
df.shape

(2954, 21)

In [4]:
# without predicted values
# df = df.drop(["ARPopDen", "ARPerAA", "ARPerM1724", "ARPerSF"], axis=1)
df = df.drop(["BLOCKID10", "Street_Nam", "ARPerRMI5L", "ARPer3MU", "ARHeteInx", "ARPerHOwn"], axis=1)
df = df.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
transactions = [row.tolist() for i, row in df.iterrows()]

print df.shape
df.describe()

(2954, 10)


Unnamed: 0,ARPCIncome,ARPerHEdu,ARPerWork,ARPopDen,ARPerAA,ARPerM1724,ARPerSF,NIBRSclass,Place2,Report_Dat
count,2954,2954,2954,2954,2954,2954,2954,2954,2954,2954
unique,7,7,7,7,7,7,7,52,60,36
top,inc-1/7,edu-4/7,empl-1/7,popden-5/7,afro-1/7,youngm-4/7,sinpar-6/7,Drug/Narcotic Violations,Private Residence,07-2x
freq,528,482,590,535,596,678,585,100,511,109


### Frequent Itemset generation

In [5]:
# Minimum support handed to the itemset miner and to the rule miner for the sampled dataset.
# NOTE(review): looks like an absolute transaction count (rule supports are printed as
# integers further down) — confirm against the pymining documentation.
min_supp = 10

#### Relim
[Paper 1](https://pdfs.semanticscholar.org/cb3e/76d1773d08545f21daf28cc87b051604aa95.pdf)
[Paper 2](http://www.borgelt.net/papers/relim.pdf)

In [6]:
%%time
# Turn the transactions into nested tuples and build the input structure used by Relim.
relim_input = itemmining.get_relim_input(to_tuple(transactions))
# Mine the itemsets using min_supp as the minimum support.
relim_itemsets = itemmining.relim(relim_input, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets), "\n"

number of frequent itemsets 21140 

CPU times: user 872 ms, sys: 52 ms, total: 924 ms
Wall time: 871 ms


### Rules generation

In [7]:
# Minimum confidence handed to the rule miner for the sampled dataset.
min_conf = 0.60

In [8]:
%%time
# Derive association rules from the mined itemsets using the thresholds set above.
rules = assocrules.mine_assoc_rules(relim_itemsets, min_support=min_supp, min_confidence=min_conf)
# Number of rules produced.
print len(rules)

175934
CPU times: user 3.48 s, sys: 88 ms, total: 3.57 s
Wall time: 3.48 s


#### Filtering the rules
Keep only the rules that contain a crime (a `NIBRSclass` value) anywhere in the rule and, separately, the rules that have a crime in the consequent (RHS).

In [9]:
# Rules whose antecedent or consequent contains at least one value of the NIBRSclass column.
rules_with_crimes = only_rules_with(["NIBRSclass"], df, rules)
print "# of rules with crimes:", len(rules_with_crimes)
# Rules whose consequent contains at least one value of the NIBRSclass column.
rules_with_crimes_in_cons = only_rules_with(["NIBRSclass"], df, rules, only_cons=True)
print "# of rules with crimes in consequent:", len(rules_with_crimes_in_cons)

# of rules with crimes: 48218
# of rules with crimes in consequent: 4173


In [10]:
# printing some rules
for rule in rules_with_crimes_in_cons[:2]:
    print rule_to_string(rule)

['popden-1/7', 'School - Primary or Secondary', 'youngm-3/7']  ->  ['sinpar-7/7', 'inc-1/7', 'edu-6/7', 'afro-6/7', 'empl-1/7', 'Disorderly Conduct'], supp: 23, conf: 0.741935483871

['School - Primary or Secondary', 'youngm-3/7', 'empl-1/7']  ->  ['sinpar-7/7', 'inc-1/7', 'edu-6/7', 'afro-6/7', 'popden-1/7', 'Disorderly Conduct'], supp: 23, conf: 0.741935483871



In [11]:
# print_that('./dataset/result_file.txt', rules_with_crimes)
# print_that('./dataset/result_file_only_cons.txt', rules_with_crimes_in_cons)

---

### Entire dataset

In [12]:
# Load the entire dataset: tab-separated file, first column used as the index.
df_entire = pandas.read_csv("./dataset/crimes_census_5poi.csv", index_col=0, sep="\t")
# Show (rows, columns) as the cell output.
df_entire.shape

(19106, 21)

In [13]:
# without predicted values
# df_entire = df_entire.drop(["ARPopDen", "ARPerAA", "ARPerM1724", "ARPerSF"], axis=1)
df_entire = df_entire.drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerHOwn", 
                            "ARPerRMI5L", "ARPer3MU"], axis=1)
df_entire = df_entire.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
transactions_entire = [row.tolist() for i, row in df_entire.iterrows()]

print df.shape
df.describe()

(2954, 10)


Unnamed: 0,ARPCIncome,ARPerHEdu,ARPerWork,ARPopDen,ARPerAA,ARPerM1724,ARPerSF,NIBRSclass,Place2,Report_Dat
count,2954,2954,2954,2954,2954,2954,2954,2954,2954,2954
unique,7,7,7,7,7,7,7,52,60,36
top,inc-1/7,edu-4/7,empl-1/7,popden-5/7,afro-1/7,youngm-4/7,sinpar-6/7,Drug/Narcotic Violations,Private Residence,07-2x
freq,528,482,590,535,596,678,585,100,511,109


### Frequent Itemset generation

In [14]:
# Minimum support for the entire dataset.
# This re-binds the same `min_supp` name used for the sampled dataset, so any earlier
# cell re-run after this one will pick up 70 instead of 10.
min_supp = 70

In [15]:
%time
relim_input_entire = itemmining.get_relim_input(to_tuple(transactions_entire))
relim_itemsets_entire = itemmining.relim(relim_input_entire, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets_entire), "\n"

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs
number of frequent itemsets 17086 



### Rules generation

In [16]:
# Minimum confidence handed to the rule miner for the entire dataset
# (re-binds the `min_conf` name used for the sampled dataset, with the same value).
min_conf = 0.60

In [17]:
%%time
# Derive association rules from the itemsets of the entire dataset using the thresholds set above.
rules_entire = assocrules.mine_assoc_rules(relim_itemsets_entire, min_support=min_supp, min_confidence=min_conf)
# Number of rules produced.
print len(rules_entire)

101352
CPU times: user 2.49 s, sys: 52 ms, total: 2.54 s
Wall time: 2.47 s


In [18]:
# Rules whose antecedent or consequent contains at least one value of the NIBRSclass column.
rules_with_crimes_entire = only_rules_with(["NIBRSclass"], df_entire, rules_entire)
print "# of rules with crimes:", len(rules_with_crimes_entire)
# Rules whose consequent contains at least one value of the NIBRSclass column.
rules_with_crimes_in_cons_entire = only_rules_with(["NIBRSclass"], df_entire, rules_entire, only_cons=True)
print "# of rules with crimes in consequent:", len(rules_with_crimes_in_cons_entire)

# of rules with crimes: 23332
# of rules with crimes in consequent: 1461


In [19]:
# Print the first three rules that have a crime in the consequent, formatted by rule_to_string.
for rule in rules_with_crimes_in_cons_entire[:3]:
    print rule_to_string(rule)

['Department Store', 'empl-6/7', 'inc-6/7', 'youngm-1/7', 'sinpar-2/7', 'popden-1/7', 'edu-4/7']  ->  ['afro-1/7', 'Shoplifting'], supp: 78, conf: 0.838709677419

['Department Store', 'youngm-1/7', 'sinpar-2/7', 'inc-6/7', 'popden-1/7', 'edu-4/7']  ->  ['afro-1/7', 'empl-6/7', 'Shoplifting'], supp: 78, conf: 0.838709677419

['Department Store', 'sinpar-2/7', 'inc-6/7', 'popden-1/7', 'edu-4/7']  ->  ['afro-1/7', 'youngm-1/7', 'empl-6/7', 'Shoplifting'], supp: 78, conf: 0.838709677419



---
### Evaluation

In [20]:
def lift(rule_supp, ant_supp, cons_supp):
    """Return rule_supp / (ant_supp * cons_supp) as a float.

    rule_supp -- support of the whole rule (antecedent and consequent together)
    ant_supp  -- support of the antecedent
    cons_supp -- support of the consequent

    The ratio equals the usual lift measure only when the three supports are
    relative frequencies (fractions of all transactions), not raw counts.
    """
    independent_supp = ant_supp * cons_supp
    # Multiplying by 1.0 forces true division even for integer arguments.
    return (rule_supp * 1.0) / independent_supp

def conviction(cons_supp, rule_conf):
    """Return the conviction of a rule: (1 - cons_supp) / (1 - rule_conf).

    cons_supp -- relative support of the consequent (fraction of all transactions)
    rule_conf -- confidence of the rule

    A rule with confidence 1.0 is never wrong, so its conviction is infinite;
    float("inf") is returned in that case instead of raising ZeroDivisionError.
    """
    if rule_conf == 1.0:
        return float("inf")
    return (1.0 - cons_supp) / (1.0 - rule_conf)

In [21]:
for rule in rules:
    l = lift(relim_itemsets[rule[0] | rule[1]], relim_itemsets[rule[0]], relim_itemsets[rule[1]])
    if l > 1:
        print l
    # c = conviction(relim_itemsets[rule[1]], rule[3])