# Практическая работа №4

### Используемые библиотеки

In [436]:
import pandas as pd
import mlxtend.frequent_patterns as fp
import mlxtend.preprocessing as pp
from itertools import product, permutations

### Исходные данные

`data1.csv`:
```csv
tid,itemset
t1,ACD
t2,BCD
t3,AC
t4,ABD
t5,ABCD
t6,BCD
```

`data2.csv`:
```csv
itemset,length
 ,6
A,6
B,5
C,4
D,3
AB,5
AC,4
AD,3
BC,3
BD,2
CD,2
ABC,3
ABD,2
ACD,2
BCD,1
ABCD,1
```

`data3.csv`:
```csv
id,sequence
s1,AATACAAGAAC
s2,GTATGGTGAT
s3,AACATGGCCAA
s4,AAGCGTGGTCAA
```

## Задание 1

In [437]:
frame = pd.read_csv('data1.csv').apply(lambda x: x.apply(list) if x.name == 'itemset' else x)
print(f"Исходный набор данных:")
frame

Исходный набор данных:


Unnamed: 0,tid,itemset
0,t1,"[A, C, D]"
1,t2,"[B, C, D]"
2,t3,"[A, C]"
3,t4,"[A, B, D]"
4,t5,"[A, B, C, D]"
5,t6,"[B, C, D]"


### Кодирование данных в виде матрицы

In [438]:
te = pp.TransactionEncoder()
frame = pd.DataFrame(te.fit_transform(frame['itemset'].tolist()), columns=te.columns_)
frame

Unnamed: 0,A,B,C,D
0,True,False,True,True
1,False,True,True,True
2,True,False,True,False
3,True,True,False,True
4,True,True,True,True
5,False,True,True,True


#### NB! Здесь и далее принимаем уровни поддержки деленными на длину набора входных данных.

In [439]:
frame = fp.fpgrowth(frame, min_support=1.0/6.0, use_colnames=True)
frame['length'] = frame['itemsets'].apply(lambda x: len(x))
frame

Unnamed: 0,support,itemsets,length
0,0.833333,(D),1
1,0.833333,(C),1
2,0.666667,(A),1
3,0.666667,(B),1
4,0.666667,"(C, D)",2
5,0.5,"(A, C)",2
6,0.5,"(A, D)",2
7,0.333333,"(B, A)",2
8,0.333333,"(A, C, D)",3
9,0.333333,"(B, A, D)",3


In [440]:
min_gens = []
for item in frame.itertuples():
    gen = True
    for part in frame.itertuples():
        gen = gen and ((part[2] == item[2]) or not (part[2].issubset(item[2]) and part[1] == item[1]))
    if gen:
        min_gens += [item]
print(len(min_gens))
frame.loc[frame['itemsets'].isin(map(lambda x: x[2], min_gens))]

11


Unnamed: 0,support,itemsets,length
0,0.833333,(D),1
1,0.833333,(C),1
2,0.666667,(A),1
3,0.666667,(B),1
4,0.666667,"(C, D)",2
5,0.5,"(A, C)",2
6,0.5,"(A, D)",2
7,0.333333,"(B, A)",2
8,0.333333,"(A, C, D)",3
10,0.166667,"(B, A, C)",3


## Задание 2

In [441]:
frame = pd.read_csv('data2.csv').apply(lambda x: x.apply(lambda y: list(y) if y != ' ' else []) if x.name == 'itemset' else x)
print(f"Исходный набор данных:")
frame

Исходный набор данных:


Unnamed: 0,itemset,length
0,[],6
1,[A],6
2,[B],5
3,[C],4
4,[D],3
5,"[A, B]",5
6,"[A, C]",4
7,"[A, D]",3
8,"[B, C]",3
9,"[B, D]",2


In [442]:
closed = []
for item in frame.itertuples():
    cls = True
    for part in frame.itertuples():
        cls = cls and ((part[0] == item[0]) or not set(item[1]).issubset(part[1]) or part[2] != item[2])
    if cls:
        closed += [item]
frame.loc[frame['itemset'].isin([x[1] for x in closed])]

Unnamed: 0,itemset,length
1,[A],6
5,"[A, B]",5
6,"[A, C]",4
7,"[A, D]",3
11,"[A, B, C]",3
12,"[A, B, D]",2
13,"[A, C, D]",2
15,"[A, B, C, D]",1


#### NB! Выводимый набор - такой, поддержка которого может быть выведена из поддержек его поднаборов.

`BCD` - невыводимый набор, так как его поднаборы имеют поддержки:
3 (BC), 2 (BD), 2 (CD), а собственная его поддержка равна 1.

`ABCD` - выводимый набор, так как его поднаборы имеют поддержки:
3 (ABC), 2 (ABD), 2 (ACD), 1 (BCD), а собственная его поддержка равна 1.

## Задание 3

In [443]:
frame = pd.read_csv('data3.csv').apply(lambda x: x.apply(list) if x.name == 'sequence' else x)
print(f"Исходный набор данных:")
frame

Исходный набор данных:


Unnamed: 0,id,sequence
0,s1,"[A, A, T, A, C, A, A, G, A, A, C]"
1,s2,"[G, T, A, T, G, G, T, G, A, T]"
2,s3,"[A, A, C, A, T, G, G, C, C, A, A]"
3,s4,"[A, A, G, C, G, T, G, G, T, C, A, A]"


In [444]:
def k_comb(k):
    return len([p for p in product(('A', 'C', 'T', 'G'), repeat=k)])

for i in range(1, 12, 2):
    print(f"k = {i}: {k_comb(i)}")
print(f"k = n: 4^n")

k = 1: 4
k = 3: 64
k = 5: 1024
k = 7: 16384
k = 9: 262144
k = 11: 4194304
k = n: 4^n


In [445]:
def smart_enum(lst):
    container = {}
    for elem in lst:
        if elem in container:
            container[elem] += 1
        else:
            container[elem] = 1
    return [(p, l) for p in container for l in range(0, container[p])]

frame = frame.apply(lambda x: x.apply(smart_enum) if x.name == 'sequence' else x)
frame

Unnamed: 0,id,sequence
0,s1,"[(A, 0), (A, 1), (A, 2), (A, 3), (A, 4), (A, 5..."
1,s2,"[(G, 0), (G, 1), (G, 2), (G, 3), (T, 0), (T, 1..."
2,s3,"[(A, 0), (A, 1), (A, 2), (A, 3), (A, 4), (C, 0..."
3,s4,"[(A, 0), (A, 1), (A, 2), (A, 3), (G, 0), (G, 1..."


In [446]:
te = pp.TransactionEncoder()
frame = pd.DataFrame(te.fit_transform(frame['sequence'].tolist()), columns=te.columns_)
frame

Unnamed: 0,"(A, 0)","(A, 1)","(A, 2)","(A, 3)","(A, 4)","(A, 5)","(A, 6)","(C, 0)","(C, 1)","(C, 2)","(G, 0)","(G, 1)","(G, 2)","(G, 3)","(T, 0)","(T, 1)","(T, 2)","(T, 3)"
0,True,True,True,True,True,True,True,True,True,False,True,False,False,False,True,False,False,False
1,True,True,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True
2,True,True,True,True,True,False,False,True,True,True,True,True,False,False,True,False,False,False
3,True,True,True,True,False,False,False,True,True,False,True,True,True,True,True,True,False,False


In [447]:
frame = fp.fpgrowth(frame, min_support=1.0, use_colnames=True)
frame

Unnamed: 0,support,itemsets
0,1.0,"((T, 0))"
1,1.0,"((G, 0))"
2,1.0,"((A, 1))"
3,1.0,"((A, 0))"
4,1.0,"((G, 0), (T, 0))"
5,1.0,"((T, 0), (A, 1))"
6,1.0,"((T, 0), (A, 0))"
7,1.0,"((G, 0), (A, 1))"
8,1.0,"((G, 0), (A, 0))"
9,1.0,"((A, 0), (A, 1))"


In [448]:
frame = frame.apply(lambda x: x.apply(lambda y: [z[0] for z in y]) if x.name == 'itemsets' else x)
frame

Unnamed: 0,support,itemsets
0,1.0,[T]
1,1.0,[G]
2,1.0,[A]
3,1.0,[A]
4,1.0,"[G, T]"
5,1.0,"[T, A]"
6,1.0,"[T, A]"
7,1.0,"[G, A]"
8,1.0,"[G, A]"
9,1.0,"[A, A]"


In [449]:
frame = frame.apply(lambda x: x.apply(tuple) if x.name == 'itemsets' else x).drop_duplicates()
frame

Unnamed: 0,support,itemsets
0,1.0,"(T,)"
1,1.0,"(G,)"
2,1.0,"(A,)"
4,1.0,"(G, T)"
5,1.0,"(T, A)"
7,1.0,"(G, A)"
9,1.0,"(A, A)"
10,1.0,"(G, T, A)"
12,1.0,"(T, A, A)"
13,1.0,"(G, A, A)"
