<span>
<img src="http://www.sobigdata.eu/sites/default/files/logo-SoBigData-DEFINITIVO.png" width="180px" align="right"/>
</span>
<span>
<b>Author:</b> <a href="http://kdd.isti.cnr.it/people/riccardo-guidotti">Riccardo Guidotti</a><br/>
<b>Python version:</b>  3.x<br/>
</span>

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pattern mining on relational data

In [3]:
# Load the relational transaction table; the preview below shows one
# (SCONTRINO_ID, COD_MKT_ID) pair per row.
df = pd.read_csv(
    "data/small_transactions.csv",
    sep=',',
    skipinitialspace=True,
)
df.head()

Unnamed: 0,SCONTRINO_ID,COD_MKT_ID
0,2558064013053,1580
1,2558064013053,1661
2,2558064013053,2068
3,2558064013053,2556
4,2558064013053,2650


From relational to transactional

In [4]:
# Collapse the relational table into one list of item codes per SCONTRINO_ID.
dt = (
    df.groupby('SCONTRINO_ID')['COD_MKT_ID']
      .apply(list)
)
dt.head()

SCONTRINO_ID
2558064013053                 [1580, 1661, 2068, 2556, 2650, 4225]
2558064013054    [437, 1278, 1614, 2089, 2243, 2245, 2443, 2551...
2558064013055                         [151, 595, 2650, 4600, 4872]
2558064013056             [142, 437, 2499, 2515, 3458, 3675, 4044]
2558064013057                                          [437, 3087]
Name: COD_MKT_ID, dtype: object

In [7]:
# Plain list of transactions (each one a list of item codes), as expected by apriori.
baskets = dt.tolist()

In [8]:
# Preview the first ten transactions.
baskets[:10]

[[1580, 1661, 2068, 2556, 2650, 4225],
 [437, 1278, 1614, 2089, 2243, 2245, 2443, 2551, 3448, 6172],
 [151, 595, 2650, 4600, 4872],
 [142, 437, 2499, 2515, 3458, 3675, 4044],
 [437, 3087],
 [445, 446, 2050, 2650, 5046],
 [483, 920, 1461, 1488, 2241, 3682, 4069, 4079, 4109, 4844, 4847],
 [1581, 2650, 2731, 3087, 4176],
 [384, 560, 2065, 2243, 2499, 4041],
 [607, 2122, 2650, 4655]]

In [9]:
from fim import apriori

Extract frequent patterns

In [10]:
# target='a' -> all frequent item sets (see help(apriori): "s/a sets/all").
# supp is the minimum support and zmin the minimum item-set size
# (per the PyFIM docs a positive supp is a percentage -- confirm there).
itemsets = apriori(
    baskets,
    target='a',
    supp=1,
    zmin=2,
)

In [11]:
# Report how many frequent item sets were mined.
n_itemsets = len(itemsets)
print('Number of itemsets:', n_itemsets)

Number of itemsets: 108


In [12]:
# Each entry is (item tuple, absolute support); show the first ten.
itemsets[0:10]

[((597, 599), 15),
 ((1661, 2650), 14),
 ((4228, 2650), 13),
 ((599, 622), 16),
 ((2057, 920), 13),
 ((4655, 2650), 14),
 ((2193, 445), 17),
 ((2193, 920), 20),
 ((446, 445), 21),
 ((446, 2650), 21)]

Extract association rules

In [13]:
# target='r' -> association rules (see help(apriori)).
# report='ascl' appends, in this order: absolute support, relative support,
# confidence and lift to every rule tuple.
# conf=60 is the minimum confidence (presumably in percent -- see PyFIM docs).
rules = apriori(
    baskets,
    target='r',
    supp=1,
    zmin=2,
    conf=60,
    report='ascl',
)

In [14]:
# Report how many rules passed the support/confidence thresholds.
n_rules = len(rules)
print('Number of rule:', n_rules)

Number of rule: 9


In [17]:
# Rule layout: (consequent, antecedent tuple, absolute support,
# relative support, confidence, lift); the last four fields follow
# the report='ascl' string passed to apriori.
rules[0]

(597, (624,), 9, 0.007377049180327869, 0.6, 26.142857142857142)

Print interesting rules

In [18]:
# Show at most ten rules with lift (r[5]) above 2 and confidence (r[4])
# above 0.7; indices follow the report='ascl' layout of the rule tuples.
count = 0
for r in rules:
    if not (r[5] > 2 and r[4] > 0.7):
        continue  # not interesting enough, try the next rule
    print(r)
    count += 1
    if count >= 10:
        break

(622, (624,), 11, 0.009016393442622951, 0.7333333333333333, 19.44927536231884)
(920, (2193, 445), 12, 0.009836065573770493, 0.7058823529411765, 5.382352941176471)


In [19]:
# Print the built-in documentation of apriori (parameters such as target,
# supp, zmin and report used in the cells above).
help(apriori)

Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an i

In [20]:
# First transaction: a list of item codes.
baskets[0]

[1580, 1661, 2068, 2556, 2650, 4225]

In [21]:
# First two fields of the first rule: (consequent, antecedent tuple).
rules[0][:2]

(597, (624,))

In [23]:
# Antecedent of the first rule as a set, ready for subset tests against baskets.
set(rules[0][1])

{624}

In [25]:
# Find the first basket that contains the antecedent of rules[0] right here,
# so this cell no longer relies on a loop variable `b` left behind by a cell
# that appears later in the notebook (NameError on Restart & Run All).
# Falls back to an empty basket when nothing matches.
b = next(
    (basket for basket in baskets if set(rules[0][1]) <= set(basket)),
    [],
)
set(b)

{597, 599, 615, 624, 1278, 1579, 2243, 3690, 4435}

First basket satisfying the first rule

In [22]:
# Antecedent of the first rule; rule tuples are (consequent, antecedent, ...).
# Built once instead of on every iteration.
antecedent = set(rules[0][1])

# Print the first basket that contains every antecedent item.
# NOTE: only the antecedent is checked, not the consequent.
# The loop variable stays named `b` because the `set(b)` cell reads it.
for b in baskets:
    # `<=` (subset) instead of `<` (proper subset): a basket consisting of
    # exactly the antecedent items must match as well.
    if antecedent <= set(b):
        print(b)
        break

[597, 599, 615, 624, 1278, 1579, 2243, 3690, 4435]


In [26]:
# Toy rule in the same (consequent, antecedent tuple) layout as the mined rules.
r = ('10-20', ('male',))

In [27]:
# Toy basket of categorical items to test the rule against.
b = ['male', '1st class', '?']

In [28]:
# The basket matches when it contains every antecedent item. Use `<=`
# (subset) rather than `<` (proper subset) so that a basket equal to the
# antecedent also counts as a match.
set(r[1]) <= set(b)

True

Pattern mining on categorical data

In [39]:
# Load the Titanic passenger table (replaces the transaction frame in `df`).
df = pd.read_csv(
    "data/titanic.csv",
    sep=',',
    skipinitialspace=True,
)

In [40]:
# Preview the first five passengers.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
# Number of missing values per column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Correct missing values

In [42]:
# Embarked: replace missing values with the most frequent port.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Age: replace missing values with the median age of passengers sharing the
# same Sex and Pclass. transform('median') returns a Series aligned with df's
# own index; the previous groupby(...).apply(...) form yields a result indexed
# by the group keys on pandas >= 2.0, which cannot be assigned back to the
# column.
age_median_by_group = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(age_median_by_group)

# Family size = siblings/spouses + parents/children + the passenger.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Re-check the remaining missing values.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64

Remove useless columns

In [43]:
# Columns excluded from the mining step (SibSp and Parch are already
# summarised by FamilySize).
column2drop = [
    'PassengerId',
    'Name',
    'Cabin',
    'SibSp',
    'Parch',
    'Ticket',
]
df = df.drop(columns=column2drop)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,7.25,S,2
1,1,1,female,38.0,71.2833,C,2
2,1,3,female,26.0,7.925,S,1
3,1,1,female,35.0,53.1,S,2
4,0,3,male,35.0,8.05,S,1


Apply binning to continuous features

In [44]:
# Discretise Age and Fare into 10 equal-width, left-closed bins
# (right=False). Values are truncated to integers before binning.
for source_col in ('Age', 'Fare'):
    df[source_col + 'Bin'] = pd.cut(
        df[source_col].astype(int), 10, right=False)

# The raw continuous columns are no longer needed once binned.
df = df.drop(columns=['Age', 'Fare'])

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,0,3,male,S,2,"[16.0, 24.0)","[0.0, 51.2)"
1,1,1,female,C,2,"[32.0, 40.0)","[51.2, 102.4)"
2,1,3,female,S,1,"[24.0, 32.0)","[0.0, 51.2)"
3,1,1,female,S,2,"[32.0, 40.0)","[51.2, 102.4)"
4,0,3,male,S,1,"[32.0, 40.0)","[0.0, 51.2)"


Remap values

In [45]:
# Replace numeric codes with readable labels so every value can serve as a
# self-describing item in a basket.
# NOTE: run once -- the mappings expect the original integer codes.
survival_labels = {0: 'Not Survived', 1: 'Survived'}
class_labels = {1: '1st', 2: '2nd', 3: '3rd'}
df['Survived'] = df['Survived'].map(survival_labels).astype(str)
df['Pclass'] = df['Pclass'].map(class_labels).astype(str)

# Tag the remaining columns with a suffix naming the feature they come from.
for col, suffix in [('FamilySize', '_Family'),
                    ('AgeBin', '_Age'),
                    ('FareBin', '_Fare')]:
    df[col] = df[col].astype(str) + suffix

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,Not Survived,3rd,male,S,2_Family,"[16.0, 24.0)_Age","[0.0, 51.2)_Fare"
1,Survived,1st,female,C,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
2,Survived,3rd,female,S,1_Family,"[24.0, 32.0)_Age","[0.0, 51.2)_Fare"
3,Survived,1st,female,S,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
4,Not Survived,3rd,male,S,1_Family,"[32.0, 40.0)_Age","[0.0, 51.2)_Fare"


In [46]:
# One basket per passenger: the row's values as a list of string items.
baskets = df.to_numpy().tolist()

In [47]:
# First passenger expressed as a basket of categorical items.
baskets[0]

['Not Survived',
 '3rd',
 'male',
 'S',
 '2_Family',
 '[16.0, 24.0)_Age',
 '[0.0, 51.2)_Fare']

In [25]:
# Mine all frequent item sets (target='a') of at least two items (zmin=2)
# from the passenger baskets, with minimum support supp=1.
itemsets = apriori(
    baskets,
    target='a',
    supp=1,
    zmin=2,
)

In [26]:
# Report how many frequent item sets were mined.
n_itemsets = len(itemsets)
print('Number of itemsets:', n_itemsets)

Number of itemsets: 2146


In [27]:
# Each entry is (item tuple, absolute support); show the first ten.
itemsets[0:10]

[(('[204.8, 256.0)_Fare', '1st'), 11),
 (('[64.0, 72.0)_Age', 'Not Survived', 'male'), 11),
 (('[64.0, 72.0)_Age', 'Not Survived'), 11),
 (('[64.0, 72.0)_Age', 'male'), 11),
 (('7_Family', '3rd', 'S', '[0.0, 51.2)_Fare'), 12),
 (('7_Family', '3rd', 'S'), 12),
 (('7_Family', '3rd', '[0.0, 51.2)_Fare'), 12),
 (('7_Family', '3rd'), 12),
 (('7_Family', 'S', '[0.0, 51.2)_Fare'), 12),
 (('7_Family', 'S'), 12)]

In [28]:
# Association rules (target='r') with a higher support threshold than the
# item-set mining above (supp=10 instead of 1) and conf=60.
# report='ascl' appends absolute support, relative support, confidence and
# lift to every rule tuple.
rules = apriori(
    baskets,
    target='r',
    supp=10,
    zmin=2,
    conf=60,
    report='ascl',
)

In [29]:
# Report how many rules passed the support/confidence thresholds.
n_rules = len(rules)
print('Number of rule:', n_rules)

Number of rule: 590


In [30]:
# Print only the rules whose consequent (first tuple field) is 'Survived'.
for r in rules:
    if r[0] != 'Survived':
        continue
    print(r)

('Survived', ('[51.2, 102.4)_Fare',), 68, 0.07631874298540965, 0.6601941747572816, 1.719979560551865)
('Survived', ('[16.0, 24.0)_Age', 'female'), 68, 0.07631874298540965, 0.68, 1.7715789473684211)
('Survived', ('1st', 'female'), 91, 0.10213243546576879, 0.9680851063829787, 2.5221164613661813)
('Survived', ('1st',), 136, 0.1526374859708193, 0.6296296296296297, 1.6403508771929824)
('Survived', ('female', '1_Family', '[0.0, 51.2)_Fare'), 73, 0.0819304152637486, 0.73, 1.901842105263158)
('Survived', ('female', '1_Family'), 99, 0.1111111111111111, 0.7857142857142857, 2.0469924812030076)
('Survived', ('female', 'S', '[0.0, 51.2)_Fare'), 100, 0.1122334455667789, 0.6329113924050633, 1.6489007328447702)
('Survived', ('female', 'S'), 142, 0.15937149270482603, 0.6926829268292682, 1.8046213093709884)
('Survived', ('female', '[0.0, 51.2)_Fare'), 153, 0.1717171717171717, 0.6681222707423581, 1.7406343369340382)
('Survived', ('female',), 233, 0.2615039281705948, 0.7420382165605095, 1.9332048273550118

In [31]:
# Antecedent of the first Titanic rule as a set.
set(rules[0][1])

{'[40.0, 48.0)_Age'}