In [1]:
%matplotlib inline

import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr


In [2]:
def tra2rel(fileinput, fileoutput, delimiter=',', has_header=True):
    data = open(fileinput, 'r')
    if has_header:
        data.readline()
    baskets = defaultdict(list)

    for row in data:
        basket_id = row.replace('\r\n', '').split(delimiter)[0]
        item_id = row.replace('\r\n', '').split(delimiter)[1]
        baskets[basket_id].append(item_id)

    data.close()

    out = open(fileoutput, 'w')
    for k, v in baskets.iteritems():
        s = '%s' % k
        for item in v:
            s += ',%s' % item
        out.write('%s\n' % s)
    out.close()
    
    return baskets

In [3]:
baskets = tra2rel('transactions.csv', 'baskets.csv', delimiter=',', has_header=True)

In [4]:
baskets.keys()[:10]

['2663064031721',
 '2663064031720',
 '2663064031723',
 '2596064017113',
 '2596064017112',
 '2596064017111',
 '2596064017110',
 '2566064065708',
 '2566064065709',
 '2596064017115']

In [5]:
baskets['2663064031721']

['1108', '1140', '2017', '4180', '5030', '5064']

In [16]:
import fim
from fim import apriori

In [12]:
baskets_lists = [b for b in baskets.values()]

In [13]:
baskets_lists[:3]

[['1108', '1140', '2017', '4180', '5030', '5064'],
 ['1122', '2243', '2551', '3569', '3664', '3000442'],
 ['437', '559']]

In [74]:
itemsets = apriori(baskets_lists[:100], supp=2, zmin=2, target='a') 

In [75]:
itemsets

[(('1658', '2650'), 2),
 (('3000442', '2551'), 2),
 (('2061', '3672'), 2),
 (('3086', '3087'), 2),
 (('5086', '2729', '445'), 2),
 (('5086', '2729'), 2),
 (('5086', '445'), 2),
 (('2674', '2650'), 2),
 (('608', '3087'), 2),
 (('2665', '2052'), 2),
 (('2665', '3828'), 2),
 (('2665', '3087'), 2),
 (('5072', '920'), 2),
 (('5025', '445'), 2),
 (('2058', '3448'), 2),
 (('2058', '445'), 2),
 (('3749', '3448'), 2),
 (('2080', '437'), 2),
 (('632', '2650'), 2),
 (('147', '1640'), 2),
 (('147', '2243'), 2),
 (('5069', '3750'), 2),
 (('441', '2193', '2504'), 2),
 (('441', '2193'), 2),
 (('441', '2504'), 2),
 (('2532', '4658'), 2),
 (('954', '3448'), 2),
 (('954', '4049'), 2),
 (('4029', '207'), 2),
 (('577', '2198'), 2),
 (('1257', '2009'), 2),
 (('4180', '2050'), 2),
 (('2494', '3448'), 2),
 (('1281', '2089'), 2),
 (('3690', '2727', '2729'), 2),
 (('3690', '2727'), 2),
 (('3690', '2729'), 2),
 (('4047', '2727', '2729'), 2),
 (('4047', '2727'), 2),
 (('4047', '2729'), 2),
 (('2495', '4776'), 2)

In [22]:
rules = apriori(baskets_lists[:100], supp=3, zmin=2, target='r', 
                conf=90, report='ascl') 

In [23]:
rules

[('4049', ('3449', '2504'), 3, 0.03, 1.0, 20.0),
 ('2504', ('3449', '4049'), 3, 0.03, 1.0, 25.0),
 ('3449', ('2504', '4049'), 3, 0.03, 1.0, 33.333333333333336),
 ('2504', ('3449',), 3, 0.03, 1.0, 25.0),
 ('4049', ('3449',), 3, 0.03, 1.0, 20.0),
 ('2089', ('2443', '445'), 3, 0.03, 1.0, 8.333333333333334)]

In [18]:
help(fim.apriori)

Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an i

In [None]:
# Calling external C function

In [24]:
import subprocess

In [25]:
def call_apriori(fileinput, fileoutput, delimiter=',', target_type='s', 
                 min_nbr_items=1, min_sup=2, min_conf=2):
    # apriori
    # -t# {m: maximal, c: closed, s: frequent, r: association rules}
    # -m# minimum number of items per item set/association rule
    # -s# minimum support of an item set, positive: percentage, negative: absolute
    # -c# minimum confidence rule percentage
    # -b# line delimiter (,)
    # The default additional information output format for rules is " (%X, %C)"
    # %X relative body set support as a percentage
    # %C rule confidence as a percentage
    # %L lift

    if target_type == 'r':
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, '-m%s' % min_nbr_items, 
                    '-s%s' % min_sup, '-c%s' % min_conf, '-v (%X, %C, %L)', 
                    fileinput, fileoutput]
    else:
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, 
                           '-m%s' % min_nbr_items, '-s%s' % min_sup, fileinput, fileoutput]

    ret = subprocess.call(call_cmd,  stdout=open('apriori_stdout.txt', 'w'), 
                          stderr=open('apriori_stderr.txt', 'w'))
    return ret

In [26]:
delimiter=','
target_type='s'
min_nbr_items=3
min_sup=1

ret_val = call_apriori('baskets.csv', 'freq_patterns.txt', 
                       delimiter, target_type, 
                       min_nbr_items, min_sup)

In [27]:
delimiter=','
target_type='r'
min_nbr_items=3
min_sup=2
min_conf=25

ret_val = call_apriori('baskets.csv', 'rules.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)

In [29]:
def read_rules(filename):
    data = open(filename, 'r')
    rules = list()
    for row in data:
        fileds = row.rstrip('\n\r').split(' <- ')
        cons = fileds[0]
        other = fileds[1].split(' (')
        ant = other[0].split(' ')
        other2 = other[1].split(', ')
        sup = float(other2[0])
        conf = float(other2[1])
        lift = float(other2[2].replace(')', ''))
        rule = {
            'ant': ant,
            'cons': cons,
            'sup': sup,
            'conf': conf,
            'lift': lift
        }
        rules.append(rule)
    data.close()
    return rules

In [30]:
rules = read_rules('rules.txt')
for r in rules[:3]:
    print r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf']

['2729', '2650'] --> 2727  lift 495.026  conf 26.9805
['3448', '2443'] --> 2193  lift 312.362  conf 26.0954
['2193', '2443'] --> 3448  lift 343.622  conf 25.9445


In [31]:
rules[0]

{'ant': ['2729', '2650'],
 'conf': 26.9805,
 'cons': '2727',
 'lift': 495.026,
 'sup': 2.13263}

In [32]:
# Frequent Pattern Mining on Titanic Dataset

In [33]:
df = pd.read_csv("titanic_train.csv")

In [34]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
def clean_data(df, drop_passenger_id):
    
    # Get the unique values of Sex
    sexes = sorted(df['Sex'].unique())
    
    # Generate a mapping of Sex from a string to a number representation    
    genders_mapping = dict(zip(sexes, range(0, len(sexes) + 1)))
    
    #print genders_mapping

    # Transform Sex from a string to a number representation
    df['Sex_Val'] = df['Sex'].map(genders_mapping).astype(int)
    
    # Get the unique values of Embarked
    #embarked_locs = sorted(df['Embarked'].unique())

    # Generate a mapping of Embarked from a string to a number representation        
    #embarked_locs_mapping = dict(zip(embarked_locs, 
                                     #range(0, len(embarked_locs) + 1)))
    
    
    embarked_locs = sorted(df['Embarked'].unique())
    embarked_locs_mapping = dict(zip(embarked_locs, range(0, len(embarked_locs) + 1)))
    
    #print embarked_locs_mapping
    
    
    df['Embarked_Val'] = df['Embarked'].map(embarked_locs_mapping).astype(int)
    if len(df[df['Embarked'].isnull()] > 0):
        df.replace({'Embarked_Val' : { embarked_locs_mapping[np.nan] : embarked_locs_mapping['S']}}, 
               inplace=True)
    
    ## Transform Embarked from a string to dummy variables
    #df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked_Val')], axis=1)
    
    ## Fill in missing values of Embarked
    ## Since the vast majority of passengers embarked in 'S': 3, 
    ## we assign the missing values in Embarked to 'S':
    #if len(df[df['Embarked'].isnull()] > 0):
    #    df.replace({'Embarked_Val' : 
    #                   { embarked_locs_mapping[np.nan] : embarked_locs_mapping['S'] 
    #                   }
    #               }, 
    #               inplace=True)
    
    # Fill in missing values of Fare with the average Fare
    if len(df[df['Fare'].isnull()] > 0):
        avg_fare = df['Fare'].mean()
        df.replace({ None: avg_fare }, inplace=True)
    
    # To keep Age in tact, make a copy of it called AgeFill 
    # that we will use to fill in the missing ages:
    df['AgeFill'] = df['Age']

    # Determine the Age typical for each passenger class by Sex_Val.  
    # We'll use the median instead of the mean because the Age 
    # histogram seems to be right skewed.
    df['AgeFill'] = df['AgeFill'] \
                        .groupby([df['Sex_Val'], df['Pclass']]) \
                        .apply(lambda x: x.fillna(x.median()))
            
    # Define a new feature FamilySize that is the sum of 
    # Parch (number of parents or children on board) and 
    # SibSp (number of siblings or spouses):
    df['FamilySize'] = df['SibSp'] + df['Parch']
    
    # Drop the columns we won't use:
    df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)
    
    # Drop the Age column since we will be using the AgeFill column instead.
    # Drop the SibSp and Parch columns since we will be using FamilySize.
    # Drop the PassengerId column since it won't be used as a feature.
    df = df.drop(['Age', 'SibSp', 'Parch'], axis=1)
    
    if drop_passenger_id:
        df = df.drop(['PassengerId'], axis=1)
    
    return df

In [36]:
df2 = clean_data(df, drop_passenger_id=False)

In [37]:
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Fare,Sex_Val,Embarked_Val,AgeFill,FamilySize
0,1,0,3,7.25,1,3,22.0,1
1,2,1,1,71.2833,0,1,38.0,1
2,3,1,3,7.925,0,3,26.0,0
3,4,1,1,53.1,0,3,35.0,1
4,5,0,3,8.05,1,3,35.0,0


In [40]:
df2['AgeGroup'] = pd.cut(df2['AgeFill'], bins=range(0, 105, 10), 
                         right=False, labels=range(0, 100, 10))

In [41]:
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Fare,Sex_Val,Embarked_Val,AgeFill,FamilySize,AgeGroup
0,1,0,3,7.25,1,3,22.0,1,20
1,2,1,1,71.2833,0,1,38.0,1,30
2,3,1,3,7.925,0,3,26.0,0,20
3,4,1,1,53.1,0,3,35.0,1,30
4,5,0,3,8.05,1,3,35.0,0,30


In [42]:
df2['FareGroup'] = pd.cut(df2['Fare'], bins=range(0, 520, 10), 
                          right=False, labels=range(0, 510, 10))

In [43]:
df2.drop(['AgeFill', 'Fare'], axis=1, inplace=True)
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex_Val,Embarked_Val,FamilySize,AgeGroup,FareGroup
0,1,0,3,1,3,1,20,0
1,2,1,1,0,1,1,30,70
2,3,1,3,0,3,0,20,0
3,4,1,1,0,3,1,30,50
4,5,0,3,1,3,0,30,0


In [44]:
df3 = df2
df3['Survived'] = df2['Survived'].astype(str) + '_S'
df3['Pclass'] = df2['Pclass'].astype(str) + '_P'
df3['Sex_Val'] = df2['Sex_Val'].map({1: 'M', 0: 'F'}).astype(str)
df3['Embarked_Val'] = df2['Embarked_Val'].map({2:'Q', 1:'C', 3:'S'}).astype(str)
df3['FamilySize'] = df2['FamilySize'].astype(str) + '_FS'
df3['AgeGroup'] = df2['AgeGroup'].astype(str) + '_A'
df3['FareGroup'] = df2['FareGroup'].astype(str) + '_F'

In [45]:
df3.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex_Val,Embarked_Val,FamilySize,AgeGroup,FareGroup
0,1,0_S,3_P,M,S,1_FS,20_A,0.0_F
1,2,1_S,1_P,F,C,1_FS,30_A,70.0_F
2,3,1_S,3_P,F,S,0_FS,20_A,0.0_F
3,4,1_S,1_P,F,S,1_FS,30_A,50.0_F
4,5,0_S,3_P,M,S,0_FS,30_A,0.0_F


In [46]:
df3.to_csv('titanic_for_patterns.csv', sep=',', header=False)

In [47]:
delimiter=','
target_type='s'
min_nbr_items=2
min_sup=2
min_conf=2

ret_val = call_apriori('titanic_for_patterns.csv', 'titanic_freq_patterns.txt', 
                       delimiter, target_type, min_nbr_items, min_sup, min_conf)

In [48]:
delimiter=','
target_type='r'
min_nbr_items=2
min_sup=2
min_conf=25

ret_val = call_apriori('titanic_for_patterns.csv', 'titanic_rules.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)

In [49]:
rules = read_rules('titanic_rules.txt')
for r in rules[:3]:
    print r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf']

['0_A', '3_P', '0_S', 'S'] --> 6_FS  lift 2062.5  conf 27.7778
['0_A', '0_S', 'S'] --> 6_FS  lift 1953.95  conf 26.3158
['30.0_F', 'F', 'S'] --> 6_FS  lift 2735.53  conf 36.8421


In [50]:
rulse_cons_S = list()
for r in rules:
    if r['cons'].endswith('_S'):
        rulse_cons_S.append(r)

In [51]:
print len(rulse_cons_S)

892


In [52]:
sorted_rules_cons_S = sorted(rulse_cons_S, 
                             key=lambda r: r['conf'], reverse=True)

In [53]:
for r in sorted_rules_cons_S[:10]:
    print r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf']

['0_A', '2_FS', 'S'] --> 1_S  lift 260.526  conf 100.0
['1_FS', 'C', '1_P', 'F'] --> 1_S  lift 260.526  conf 100.0
['1_FS', '1_P', 'F', 'S'] --> 1_S  lift 260.526  conf 100.0
['1_FS', '1_P', 'F'] --> 1_S  lift 260.526  conf 100.0
['10.0_F', '2_P', '20_A', 'M', 'S'] --> 0_S  lift 162.295  conf 100.0
['10.0_F', '2_P', '20_A', 'M'] --> 0_S  lift 162.295  conf 100.0
['10.0_F', '20_A', '0_FS', 'M', 'S'] --> 0_S  lift 162.295  conf 100.0
['10.0_F', '20_A', 'M', 'S'] --> 0_S  lift 162.295  conf 100.0
['2_P', '20_A', '0_FS', 'M', 'S'] --> 0_S  lift 162.295  conf 100.0
['2_P', '20_A', '0_FS', 'M'] --> 0_S  lift 162.295  conf 100.0


In [54]:
df3.values[0]

array([1, '0_S', '3_P', 'M', 'S', '1_FS', '20_A', '0.0_F'], dtype=object)

In [55]:
passenger_test = df3.values[0]

In [56]:
for r in rules:
    if set(r['ant']) < set(passenger_test) and r['cons'].endswith('_S'):
        print r['ant'], '-->', r['cons']

['1_FS', '20_A', '3_P', 'S'] --> 1_S
['1_FS', '20_A', '3_P'] --> 1_S
['1_FS', '20_A', 'S'] --> 1_S
['1_FS', '20_A'] --> 1_S
['1_FS', '3_P', 'S'] --> 1_S
['1_FS', '3_P'] --> 1_S
['1_FS', 'S'] --> 1_S
['1_FS'] --> 1_S
['1_FS', '20_A', '3_P', 'M'] --> 0_S
['1_FS', '20_A', '3_P', 'S'] --> 0_S
['1_FS', '20_A', '3_P'] --> 0_S
['1_FS', '20_A', 'M', 'S'] --> 0_S
['1_FS', '20_A', 'M'] --> 0_S
['1_FS', '20_A', 'S'] --> 0_S
['1_FS', '20_A'] --> 0_S
['1_FS', '3_P', 'M', 'S'] --> 0_S
['1_FS', '3_P', 'M'] --> 0_S
['1_FS', '3_P', 'S'] --> 0_S
['1_FS', '3_P'] --> 0_S
['1_FS', 'M', 'S'] --> 0_S
['1_FS', 'M'] --> 0_S
['1_FS', 'S'] --> 0_S
['1_FS'] --> 0_S
['0.0_F', '20_A', '3_P', 'M', 'S'] --> 0_S
['0.0_F', '20_A', '3_P', 'M'] --> 0_S
['0.0_F', '20_A', '3_P', 'S'] --> 0_S
['0.0_F', '20_A', '3_P'] --> 0_S
['0.0_F', '20_A', 'M', 'S'] --> 0_S
['0.0_F', '20_A', 'M'] --> 0_S
['0.0_F', '20_A', 'S'] --> 0_S
['0.0_F', '20_A'] --> 0_S
['0.0_F', '3_P', 'M', 'S'] --> 0_S
['0.0_F', '3_P', 'M'] --> 0_S
['0.0_F', '3_

In [60]:
titanic_baskets_list = list()
for row in df3.values:
    titanic_baskets_list.append(list(row))

In [72]:
rules = apriori(titanic_baskets_list, supp=5, zmin=2, target='r', conf=90, report='ascl') 

In [73]:
for rule in rules:
    if rule[0] == '1_S':
        print rule

('1_S', ('2_P', 'F', 'S'), 61, 0.06846240179573512, 0.9104477611940298, 2.3719560094265515)
('1_S', ('2_P', 'F'), 70, 0.07856341189674523, 0.9210526315789473, 2.399584487534626)
('1_S', ('1_P', 'F', 'S'), 48, 0.05387205387205387, 0.96, 2.5010526315789474)
('1_S', ('1_P', 'F'), 91, 0.10213243546576879, 0.9680851063829787, 2.5221164613661813)
