# _Association rules_

In [1]:
import pandas as pd

import pyfpgrowth

In [2]:
spect = pd.read_csv('SPECT.train', header=None, index_col=False,
                    names=['final_diagnosis'] + ['partial_diagnosis_{:02d}'.format(i) for i in range(21)])

spect.head()

Unnamed: 0,final_diagnosis,partial_diagnosis_00,partial_diagnosis_01,partial_diagnosis_02,partial_diagnosis_03,partial_diagnosis_04,partial_diagnosis_05,partial_diagnosis_06,partial_diagnosis_07,partial_diagnosis_08,...,partial_diagnosis_11,partial_diagnosis_12,partial_diagnosis_13,partial_diagnosis_14,partial_diagnosis_15,partial_diagnosis_16,partial_diagnosis_17,partial_diagnosis_18,partial_diagnosis_19,partial_diagnosis_20
0,1,0,0,0,1,0,0,0,1,1,...,0,1,1,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,1,...,0,1,1,0,0,0,0,0,0,0
2,1,1,0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,1,0,0,0,0,0,0,0,1,0,...,0,1,0,1,1,0,0,0,0,0


In [3]:
spect.shape

(80, 22)

In [4]:
spect.get_dtype_counts()

int64    22
dtype: int64

In [5]:
spect.final_diagnosis.value_counts()

1    40
0    40
Name: final_diagnosis, dtype: int64

In [6]:
# pd.Series, содержащая списки предварительных диагнозов каждого пациента
partial_diag = spect.drop('final_diagnosis', 1).apply(lambda r:r.index[1 == r].tolist(), axis=1)

partial_diag.head()

0    [partial_diagnosis_03, partial_diagnosis_07, p...
1    [partial_diagnosis_02, partial_diagnosis_03, p...
2    [partial_diagnosis_00, partial_diagnosis_02, p...
3    [partial_diagnosis_10, partial_diagnosis_19, p...
4    [partial_diagnosis_07, partial_diagnosis_12, p...
dtype: object

In [7]:
# объединяем с финальным диагнозом (0|1), чтобы получиь список списков в формате pyfpgrowth
diag = partial_diag.combine(spect['final_diagnosis'], lambda x,y:['sick' if y > 0 else 'healthy'] + x)

diag[:5]

0    [sick, partial_diagnosis_03, partial_diagnosis...
1    [sick, partial_diagnosis_02, partial_diagnosis...
2    [sick, partial_diagnosis_00, partial_diagnosis...
3    [sick, partial_diagnosis_10, partial_diagnosis...
4    [sick, partial_diagnosis_07, partial_diagnosis...
dtype: object

In [8]:
# найдём все правила, которые встречаются не менее 5 раз
patterns = pyfpgrowth.find_frequent_patterns(diag.tolist(), 5)

def dict_head(m, n):
    return {k: m[k] for k in list(m)[:n]}

print( len(patterns) )

dict_head(patterns, 10)

1174


{('partial_diagnosis_14',): 6,
 ('partial_diagnosis_07', 'partial_diagnosis_14'): 5,
 ('partial_diagnosis_12', 'partial_diagnosis_14'): 5,
 ('partial_diagnosis_14', 'sick'): 5,
 ('partial_diagnosis_07', 'partial_diagnosis_12', 'partial_diagnosis_14'): 5,
 ('partial_diagnosis_07', 'partial_diagnosis_14', 'sick'): 5,
 ('partial_diagnosis_12', 'partial_diagnosis_14', 'sick'): 5,
 ('partial_diagnosis_07',
  'partial_diagnosis_12',
  'partial_diagnosis_14',
  'sick'): 5,
 ('partial_diagnosis_01', 'partial_diagnosis_17'): 5,
 ('partial_diagnosis_01', 'partial_diagnosis_17', 'sick'): 5}

In [9]:
# сохраним паттерны, содержащие финальный диагноз

healthy_patterns = {}
sick_patterns = {}

for k, v in patterns.items():
    if 'healthy' in k:
        healthy_patterns[k]=v
    if 'sick' in k:
        sick_patterns[k]=v

len(healthy_patterns), len(sick_patterns)

(14, 532)

In [10]:
# найдём все следствия из правил, которые происходят не реже чем в 70% случаев
rules = pyfpgrowth.generate_association_rules(patterns, 0.7)

print( len(rules) )

dict_head(rules, 5)

1047


{('partial_diagnosis_14',): (('partial_diagnosis_07',
   'partial_diagnosis_12',
   'sick'),
  0.8333333333333334),
 ('partial_diagnosis_07',
  'partial_diagnosis_14'): (('partial_diagnosis_12', 'sick'), 1.0),
 ('partial_diagnosis_12',
  'partial_diagnosis_14'): (('partial_diagnosis_07', 'sick'), 1.0),
 ('partial_diagnosis_14',
  'sick'): (('partial_diagnosis_07', 'partial_diagnosis_12'), 1.0),
 ('partial_diagnosis_07',
  'partial_diagnosis_12',
  'partial_diagnosis_14'): (('sick',), 1.0)}

In [11]:
# нас интересуют только правила, которые в выводе содержат финальный диагноз

healthy_rules = {}
sick_rules = {}

for rule, assoc in rules.items():
    if 'healthy' in assoc[0]:
        healthy_rules[rule]=assoc        
    if 'sick' in assoc[0]:
        sick_rules[rule]=assoc

In [12]:
len(healthy_rules)

0

In [13]:
len(sick_rules)

354

# _Выводы_ 

### 1.
Существуют комбинации предварительных диагнозов, которые в 100% случаев влекут за собой _положительный_ финальный диагноз (по крайней мере, для нашего ограниченного датасета).

In [14]:
sick_rules

{('partial_diagnosis_14',): (('partial_diagnosis_07',
   'partial_diagnosis_12',
   'sick'),
  0.8333333333333334),
 ('partial_diagnosis_07',
  'partial_diagnosis_14'): (('partial_diagnosis_12', 'sick'), 1.0),
 ('partial_diagnosis_12',
  'partial_diagnosis_14'): (('partial_diagnosis_07', 'sick'), 1.0),
 ('partial_diagnosis_07',
  'partial_diagnosis_12',
  'partial_diagnosis_14'): (('sick',), 1.0),
 ('partial_diagnosis_01',
  'partial_diagnosis_17'): (('partial_diagnosis_11',
   'partial_diagnosis_15',
   'sick'),
  1.0),
 ('partial_diagnosis_11',
  'partial_diagnosis_17'): (('partial_diagnosis_15', 'sick'), 1.0),
 ('partial_diagnosis_15',
  'partial_diagnosis_17'): (('partial_diagnosis_11', 'sick'), 1.0),
 ('partial_diagnosis_01',
  'partial_diagnosis_11',
  'partial_diagnosis_17'): (('partial_diagnosis_15', 'sick'), 1.0),
 ('partial_diagnosis_01',
  'partial_diagnosis_15',
  'partial_diagnosis_17'): (('partial_diagnosis_11', 'sick'), 1.0),
 ('partial_diagnosis_01',
  'partial_diagnosi

### 2.
Есть один предварительный диагноз, который в большинстве случаев _единственный_ приводил к _положительному_ финальному диагнозу.

In [15]:
for rule, assoc in rules.items():
    if len(rule) == 1:
        if 'sick' in assoc[0]:
            print(rule, assoc)

('partial_diagnosis_14',) (('partial_diagnosis_07', 'partial_diagnosis_12', 'sick'), 0.8333333333333334)


In [16]:
spect[['final_diagnosis','partial_diagnosis_14']][spect.partial_diagnosis_14 == 1]

Unnamed: 0,final_diagnosis,partial_diagnosis_14
4,1,1
8,1,1
14,1,1
34,1,1
39,1,1
50,0,1


### 3.
Некоторые предварительные диагнозы чаще встречаются в комбинациях, приводящих к _положительному_ финальному диагнозу.

In [17]:
from collections import Counter

Counter(list(zip(*sick_rules.keys()))[0]).most_common()

[('partial_diagnosis_00', 99),
 ('partial_diagnosis_02', 84),
 ('partial_diagnosis_01', 36),
 ('partial_diagnosis_06', 27),
 ('partial_diagnosis_03', 25),
 ('partial_diagnosis_07', 19),
 ('partial_diagnosis_04', 18),
 ('partial_diagnosis_12', 13),
 ('partial_diagnosis_11', 10),
 ('partial_diagnosis_10', 6),
 ('partial_diagnosis_05', 5),
 ('partial_diagnosis_08', 4),
 ('partial_diagnosis_16', 2),
 ('partial_diagnosis_09', 2),
 ('partial_diagnosis_14', 1),
 ('partial_diagnosis_15', 1),
 ('partial_diagnosis_13', 1),
 ('partial_diagnosis_19', 1)]

### 4.
Относительно _отрицательного_ финального диагноза никаких закономерностей установить не удалось.

Если взглянуть на паттерны, в которых попадается _отрицательный_ финальный диагноз (**healthy**), то можно видеть, что, с одной стороны, существуют предварительные диагнозы, которые в достаточно большом проценте случаев _не_ приводят к положительному финальному диагнозу. В то же время, очевдно, не существует какой-то комбинации предварительных диагнозов, которые однозначно приводили бы к _отрицательному_ финальному диагнозу.

In [18]:
healthy_patterns

{('healthy', 'partial_diagnosis_18'): 5,
 ('healthy', 'partial_diagnosis_19'): 5,
 ('healthy', 'partial_diagnosis_11'): 5,
 ('healthy', 'partial_diagnosis_02'): 6,
 ('healthy', 'partial_diagnosis_06'): 5,
 ('healthy', 'partial_diagnosis_09'): 8,
 ('healthy', 'partial_diagnosis_04', 'partial_diagnosis_09'): 7,
 ('healthy', 'partial_diagnosis_00', 'partial_diagnosis_09'): 8,
 ('healthy',
  'partial_diagnosis_00',
  'partial_diagnosis_04',
  'partial_diagnosis_09'): 7,
 ('healthy', 'partial_diagnosis_04'): 10,
 ('healthy', 'partial_diagnosis_00', 'partial_diagnosis_04'): 10,
 ('healthy', 'partial_diagnosis_12'): 5,
 ('healthy', 'partial_diagnosis_00'): 11,
 ('healthy',): 40}

In [19]:
spect[['final_diagnosis','partial_diagnosis_00','partial_diagnosis_04']][spect.partial_diagnosis_04 == 1]

Unnamed: 0,final_diagnosis,partial_diagnosis_00,partial_diagnosis_04
2,1,1,1
10,1,1,1
11,1,1,1
12,1,1,1
18,1,1,1
19,1,1,1
21,1,1,1
22,1,1,1
29,1,1,1
32,1,1,1
