In [1]:
import pickle
from sklearn.model_selection import train_test_split
import spacy
# Shared spaCy pipeline: loaded once here and reused by _hypernym_extraction_3 further down.
nlp = spacy.load('en_core_web_sm')

In [2]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from typing import Iterable, List, Set, Callable, Optional, Union, Sequence
from grasp import GrASP, CustomAttribute

## Load the data
- Download and unzip the spam dataset **if you have not done this before**

In [None]:
import os
import urllib.request

# NOTE(review): plain-HTTP download with no checksum; verify the archive if integrity matters.
url = 'http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip'
filename = './data/smsspamcollection.zip'

# urlretrieve does not create missing parent directories, so make sure ./data exists first.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Only download when the archive is not already on disk, so re-running the
# notebook is cheap and does not need network access.
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

In [None]:
!unzip ./data/smsspamcollection.zip -d ./data

- Load the data

In [3]:
def get_data(path='data/SMSSpamCollection.txt', encoding=None):
    """Load the SMS spam dataset into parallel lists of texts and labels.

    Every non-blank line of the file must consist of a label (``ham`` or
    ``spam``), a TAB character, and the message text. Blank lines are skipped.

    Args:
        path: Location of the dataset file. Defaults to the path used
            throughout this notebook.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is a list of message strings and
        ``labels`` a list of ints (0 for ham, 1 for spam), both in file order.

    Raises:
        ValueError: If a non-blank line contains no TAB separator.
        Exception: If a label is neither ``ham`` nor ``spam``.
    """
    label_ids = {'ham': 0, 'spam': 1}
    texts, labels = [], []
    # NOTE(review): sequences such as 'Â£' in the saved outputs suggest the file
    # may have been decoded with the wrong codec - confirm and pass `encoding`.
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first TAB only; the message itself may contain TABs.
            label, sep, text = line.partition('\t')
            if not sep:
                raise ValueError(f"Missing tab separator on line {line_no}: {line!r}")
            if label not in label_ids:
                raise Exception(f"Invalid label - {label}")
            texts.append(text)
            labels.append(label_ids[label])
    return texts, labels

In [4]:
# Load every message, then hold out 20% as the test set and, from the remaining
# 80%, another 20% as the validation set. random_state is fixed so that both
# splits are reproducible across runs.
texts, labels = get_data()
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
# Cell output: total messages, total spam (labels are 0/1), test-set size, spam in the test set.
len(texts), sum(labels), len(X_test), sum(y_test)

(5574, 747, 1115, 138)

In [5]:
# Split the training texts by label: spam (label 1) is the positive class,
# ham (label 0) the negative class.
positive = [text for text, label in zip(X_train, y_train) if label]
negative = [text for text, label in zip(X_train, y_train) if not label]

## Example 1

In [6]:
# Build the GrASP engine for Example 1; only these two settings are overridden.
grasp_model = GrASP(
    gaps_allowed=0,
    num_patterns=100,
)

In [7]:
# Fit GrASP to the dataset
# `positive` holds the spam training texts and `negative` the ham ones (built in the cell above).
# NOTE: this step is slow - the saved progress bars below add up to several minutes.
the_patterns = grasp_model.fit_transform(positive, negative)

  0%|          | 0/488 [00:00<?, ?it/s]

Step 1: Create augmented texts


100%|██████████| 488/488 [00:35<00:00, 13.56it/s]
100%|██████████| 3079/3079 [02:26<00:00, 20.99it/s]


Step 2: Find frequent attributes


  0%|          | 4/1336 [00:00<00:39, 33.70it/s]

Total number of candidate alphabet = 1336, such as ['SPACY:DEP-ROOT', 'SPACY:POS-VERB', 'SPACY:POS-NOUN', 'SPACY:DEP-punct', 'SPACY:POS-PUNCT']
Step 3: Find alphabet set


100%|██████████| 1336/1336 [00:52<00:00, 25.49it/s]


Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100


  0%|          | 0/100 [00:00<?, ?it/s]

Total number of alphabet = 100
['SPACY:POS-NUM', 'SPACY:DEP-compound', 'SPACY:NER-ORG', 'SPACY:POS-PROPN', 'SPACY:NER-DATE', 'TEXT:call', 'HYPERNYM:communication.n.02', 'SPACY:DEP-amod', 'TEXT:i', 'SPACY:POS-SYM', 'TEXT:free', 'TEXT:txt', 'TEXT:claim', 'TEXT:!', 'HYPERNYM:message.n.02', 'TEXT:mobile', 'SPACY:DEP-pobj', 'HYPERNYM:transferred_property.n.01', 'HYPERNYM:abstraction.n.06', 'TEXT:to', 'TEXT:prize', 'HYPERNYM:relation.n.01', 'HYPERNYM:win.v.01', 'SPACY:DEP-appos', 'SENTIMENT:pos', 'HYPERNYM:act.n.02', 'HYPERNYM:object.n.01', 'SPACY:NER-PERSON', 'TEXT:your', 'SPACY:POS-ADJ', 'TEXT:or', 'HYPERNYM:symbol.n.01', 'TEXT:text', 'HYPERNYM:entity.n.01', 'HYPERNYM:statement.n.01', 'TEXT:stop', 'SPACY:DEP-nmod', 'HYPERNYM:converse.v.01', 'HYPERNYM:book.n.01', 'TEXT:-', 'TEXT:150p', 'TEXT:guaranteed', 'TEXT:urgent', 'TEXT:win', 'HYPERNYM:written_communication.n.01', 'TEXT:+', 'SPACY:POS-PRON', 'TEXT:16', 'HYPERNYM:cash.n.02', 'HYPERNYM:artifact.n.01', 'HYPERNYM:creation.n.02', 'TEXT:from

100%|██████████| 100/100 [02:18<00:00,  1.38s/it]


Length 2 / 5; New candidates = 14950
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 1
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Do whatever you want . You know what the rules are . We had a talk earlier this week about what had to start happening , you showing responsibility . Yet , every week it 's can i bend the rule this way ? What about that way ? Do whatever . I 'm tired of having thia same argument with you every week . And a   & lt;#&gt ;   movie DOESNT inlude the previews . You 're still getting in after [7m[36m1:['SPACY:POS-NUM'][0m . 
-------------------------
[5m[32m[MATCH][0m: I not free today i haf [7m[36m2:['SPACY:POS

100%|██████████| 73/73 [01:41<00:00,  1.39s/it]


Length 3 / 5; New candidates = 14337
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 1
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: I luv u soo much u donÂ’t understand how special u r [7m[36m2:['SPACY:POS-NUM'][0m me ring u 2morrow luv u xxx 
-------------------------
[5m[32m[MATCH][0m: Derp . Which is worse , a dude who always wants to party or a dude who files a complaint about the [7m[36mthree:['SPACY:POS-NUM'][0m drug abusers he lives with 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: You have been selected to stay in [7m[36m1:['SPACY:POS-NUM'][0m of 250 top British hot

100%|██████████| 22/22 [00:26<00:00,  1.20s/it]


Length 4 / 5; New candidates = 4357
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


0it [00:00, ?it/s]

Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 1
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: is your hamster dead ? Hey so tmr i meet you at [7m[36m1:['SPACY:POS-NUM'][0m pm orchard mrt ? 
-------------------------
[5m[32m[MATCH][0m: I thk [7m[36m50:['SPACY:POS-NUM'][0m shd be ok he said plus minus 10 .. Did Ã¼ leave a line in between paragraphs ? 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: Not heard from U4 a while . Call [7m[36m4:['SPACY:POS-NUM'][0m rude chat private line 01223585334 to cum . Wan 2C pics of me gettin shagged then text PIX to 8552 . 2End send STOP 8552 SAM xxx 
-------------------------
[5m[32m[MATCH][0m: [7m[36m449050000301:['SPACY:POS-NUM'][0m You have won a Â£2,000 price ! To claim , call 09050000301 . 
-------------------------
Pattern: [['SPACY:NER-DATE




In [8]:
# Show every mined pattern, preceded by its zero-based rank
for idx, p in enumerate(the_patterns):
    print(f'Rank {idx}', p, sep='\n')

Rank 0
Pattern: [['SPACY:POS-NUM']]
Window size: 1
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: [7m[36mOne:['SPACY:POS-NUM'][0m of best dialogue in cute reltnship .. ! ! " Wen i Die , Do nt Come Near My Body .. ! ! Bcoz My Hands May Not Come 2 Wipe Ur Tears Off That Time .. !Gud ni8 
-------------------------
[5m[32m[MATCH][0m: Hi i m having the most relaxing time ever ! we have to get up at [7m[36m7:['SPACY:POS-NUM'][0m am every day ! was the party good the other night ? I get home tomorrow at 5ish . 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: [7m[36m83039:['SPACY:POS-NUM'][0m 62735=Â£450 UK Break AccommodationVouchers terms & conditions apply . 2 claim you mustprovide your claim number which is 15541 
-------------------------
[5m[32m[MATCH][0m: WINNER ! As a valued network customer you hvae been selected to receive a Â£900 r

Rank 28
Pattern: [['HYPERNYM:abstraction.n.06'], ['HYPERNYM:entity.n.01']]
Window size: 2
Class: Negative
Precision: 0.593
Match: 637 (17.9%)
Gain = 0.077
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Yeah if we do have to get a random dude we need to change our [7m[36minfo:['HYPERNYM:abstraction.n.06'][0m [7m[36msheets:['HYPERNYM:entity.n.01'][0m to PARTY   & lt;#&gt ; /7 NEVER STUDY just to be safe 
-------------------------
[5m[32m[MATCH][0m: Playin [7m[36mspace:['HYPERNYM:abstraction.n.06'][0m [7m[36mpoker:['HYPERNYM:entity.n.01'][0m , u ? 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: Had your mobile 11 months or more ? [7m[36mU:['HYPERNYM:abstraction.n.06'][0m [7m[36mR:['HYPERNYM:entity.n.01'][0m entitled to Update to the latest colour mobiles with camera for Free ! Call The Mobile Update Co FREE on 08002986030 
-------------------------
[5m[32m[MATCH][0m: Ur cash - balance is cur



In [9]:
# One compact row per pattern: rank, supported class, coverage (%), precision, pattern id.
for idx, p in enumerate(the_patterns):
    print(
        f'{idx:>3} {p.support_class}',
        f'{round(p.coverage*100, 1):>4}',
        f'{p.precision:.3f}',
        f'{p.get_pattern_id()}',
        sep='   ',
    )

  0 Negative   26.2   0.526   [['SPACY:POS-NUM']]
  1 Positive    7.8   0.861   [['SPACY:NER-DATE', 'SPACY:POS-NUM']]
  2 Positive   12.6   0.641   [['SPACY:DEP-compound'], ['SPACY:POS-PROPN']]
  3 Positive   16.3   0.542   [['SPACY:DEP-compound', 'SPACY:POS-PROPN']]
  4 Negative   38.7   0.680   [['SPACY:DEP-compound']]
  5 Negative   21.8   0.559   [['SPACY:DEP-compound'], ['HYPERNYM:abstraction.n.06']]
  6 Negative   19.0   0.527   [['SPACY:NER-ORG']]
  7 Positive    8.6   0.708   [['HYPERNYM:abstraction.n.06'], ['SPACY:POS-PROPN']]
  8 Positive    8.7   0.694   [['SPACY:DEP-compound', 'SPACY:POS-PROPN'], ['HYPERNYM:abstraction.n.06']]
  9 Positive   15.6   0.508   [['HYPERNYM:abstraction.n.06', 'SPACY:POS-PROPN']]
 10 Positive    3.8   1.000   [['TEXT:call'], ['SPACY:POS-NUM']]
 11 Negative   37.8   0.690   [['SPACY:POS-PROPN']]
 12 Positive    7.1   0.743   [['SPACY:DEP-pobj', 'SPACY:POS-NUM']]
 13 Negative   18.3   0.548   [['SPACY:NER-DATE']]
 14 Positive   11.8   0.562   [['SPA

## Example 2

- Create a custom attribute, Hypernym3, which uses the word's own synset together with only those hypernyms that are at most three levels above it

In [7]:
# Memo table for _get_all_hypernyms: (str(synset), levels above) -> set of hypernym synsets.
HYPERNYM_DICT = {}

def _get_anvr_pos(penn_tag: str): # Return Literal['a', 'n', 'v', 'r', None]
    if penn_tag.startswith('JJ'): # Adjective
        return 'a'
    elif penn_tag.startswith('NN'): # Noun
        return 'n'
    elif penn_tag.startswith('VB'): # Verb
        return 'v'
    elif penn_tag.startswith('RB'): # Adverb
        return 'r'
    else: # Invalid wordnet type
        return None 
    
def _get_all_hypernyms(synset: nltk.corpus.reader.wordnet.Synset, above: int) -> Set[nltk.corpus.reader.wordnet.Synset]:
    """Collect the hypernyms of ``synset`` that lie at most ``above`` levels up.

    The synset itself is not part of the result. Results are memoised in the
    module-level HYPERNYM_DICT under the key ``(str(synset), above)``; the
    returned set is the cached object, so callers must not mutate it.
    """
    # Zero levels requested: nothing above the synset is wanted.
    if above == 0:
        return set()

    cache_key = (str(synset), above)
    if cache_key in HYPERNYM_DICT:
        return HYPERNYM_DICT[cache_key]

    collected = set()
    for parent in synset.hypernyms():
        # Everything reachable from the parent with one level less, then the parent itself.
        collected |= _get_all_hypernyms(parent, above - 1)
        collected.add(parent)
    HYPERNYM_DICT[cache_key] = collected
    return collected
    
def _hypernym_extraction_3(text: str, tokens: List[str]) -> List[Set[str]]:
    """Extract, per token, the names of its WordNet synset and nearby hypernyms.

    ``text`` is run through the shared spaCy pipeline ``nlp``. Each resulting
    token that is an adjective, noun, verb or adverb is disambiguated with
    ``lesk`` using ``tokens`` as context. If a synset is found, the token's
    entry holds the name of that synset plus the names of all hypernyms at
    most three levels above it; otherwise the entry is an empty set.

    Args:
        text: The raw input text.
        tokens: The tokens of ``text``, used as the context sentence for lesk.

    Returns:
        A list with one set of synset names (e.g. ``'dog.n.01'``) per token
        produced by ``nlp(text)``.

    NOTE(review): the result is aligned with spaCy's tokenisation of ``text``,
    not with ``tokens`` itself - this presumably relies on the caller having
    produced ``tokens`` with the same pipeline; confirm against GrASP.
    """
    ans = []
    for t in nlp(text):
        names = set()  # a fresh set per token; stays empty when no synset is found
        pos = _get_anvr_pos(t.tag_)
        # lesk is only consulted for parts of speech that WordNet covers.
        synset = lesk(tokens, t.text, pos) if pos is not None else None
        if synset is not None:
            # This version of hypernym extraction includes the synset of the word itself.
            # Synset.name() is used instead of slicing the repr string, which
            # depended on the exact "Synset('...')" formatting.
            names.add(synset.name())
            names.update(ss.name() for ss in _get_all_hypernyms(synset, 3))
        ans.append(names)
    return ans

# Register the extractor above as a GrASP custom attribute named 'HYPERNYM3'.
Hypernym3Attribute = CustomAttribute(
    name='HYPERNYM3',
    extraction_function=_hypernym_extraction_3,
)

In [19]:
# Example 2 engine: built-in attributes limited to TEXT, POS and SENTIMENT,
# plus the custom HYPERNYM3 attribute defined above.
grasp_model_2 = GrASP(
    gaps_allowed=2,
    num_patterns=100,
    include_standard=['TEXT', 'POS', 'SENTIMENT'],
    include_custom=[Hypernym3Attribute],
    correlation_threshold=0.5,
    alphabet_size=100,
)

In [20]:
# Mine patterns with the Example 2 engine on the same spam (positive) / ham (negative) training texts.
the_patterns_2 = grasp_model_2.fit_transform(positive, negative)

  1%|          | 3/488 [00:00<00:19, 25.27it/s]

Step 1: Create augmented texts


100%|██████████| 488/488 [00:17<00:00, 27.74it/s]
100%|██████████| 3079/3079 [01:27<00:00, 35.10it/s]


Step 2: Find frequent attributes


  1%|          | 8/1215 [00:00<00:17, 68.54it/s]

Total number of candidate alphabet = 1215, such as ['SPACY:POS-VERB', 'SPACY:POS-NOUN', 'SPACY:POS-PUNCT', 'SPACY:POS-PRON', 'SPACY:POS-ADV']
Step 3: Find alphabet set


100%|██████████| 1215/1215 [00:29<00:00, 41.46it/s]


Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


  0%|          | 0/100 [00:00<?, ?it/s]

Finding top k: 100 / 100
Total number of alphabet = 100
['SPACY:POS-NUM', 'SPACY:POS-PROPN', 'TEXT:call', 'TEXT:i', 'SPACY:POS-SYM', 'TEXT:free', 'TEXT:txt', 'TEXT:claim', 'TEXT:!', 'TEXT:mobile', 'HYPERNYM3:cost.n.01', 'SPACY:POS-ADP', 'HYPERNYM3:message.n.02', 'TEXT:to', 'TEXT:prize', 'HYPERNYM3:communication.n.02', 'HYPERNYM3:win.v.01', 'SENTIMENT:pos', 'HYPERNYM3:symbol.n.01', 'HYPERNYM3:statement.n.01', 'TEXT:your', 'SPACY:POS-ADJ', 'TEXT:or', 'TEXT:text', 'TEXT:stop', 'TEXT:-', 'TEXT:150p', 'TEXT:guaranteed', 'TEXT:urgent', 'HYPERNYM3:textbook.n.01', 'HYPERNYM3:act.n.02', 'TEXT:win', 'HYPERNYM3:oppose.v.01', 'TEXT:+', 'HYPERNYM3:written_communication.n.01', 'SPACY:POS-PRON', 'TEXT:16', 'TEXT:cash', 'HYPERNYM3:abstraction.n.06', 'TEXT:from', 'HYPERNYM3:assertion.n.01', 'TEXT:reply', 'TEXT:now', 'TEXT:.', 'TEXT:tone', 'TEXT:18', 'HYPERNYM3:acquisition.n.02', 'SPACY:POS-DET', 'TEXT:nokia', 'TEXT:a', 'HYPERNYM3:user.n.01', 'TEXT:our', 'HYPERNYM3:mobile.n.02', 'HYPERNYM3:minute.n.01',

100%|██████████| 100/100 [00:57<00:00,  1.73it/s]


Length 2 / 5; New candidates = 14950
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Hey . What happened ? U switch off ur cell [7m[36md:['SPACY:POS-NUM'][0m whole day . This is nt good . Now if u do care , give me a call tomorrow . 
-------------------------
[5m[32m[MATCH][0m: Ok no problem ... Yup i 'm going to sch at [7m[36m4:['SPACY:POS-NUM'][0m if i rem correctly ... 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: URGENT ! : Your Mobile No . was awarded a Â£2,000 Bonus Caller Prize on 02/09/03 ! This is o

100%|██████████| 68/68 [01:02<00:00,  1.10it/s]


Length 3 / 5; New candidates = 13344
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


  0%|          | 0/36 [00:00<?, ?it/s]

Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: THANX4 TODAY CER IT WAS NICE [7m[36m2:['SPACY:POS-NUM'][0m CATCH UP BUT WE AVE 2 FIND MORE TIME MORE OFTEN OH WELL TAKE CARE C U SOON.C 
-------------------------
[5m[32m[MATCH][0m: Hi i wo n't b ard [7m[36m4:['SPACY:POS-NUM'][0m christmas . But do enjoy n merry x'mas . 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: Message Important information for O2 user . Today is your lucky day ! [7m[36m2:['SPACY:POS-NUM'][0m find out why log onto http://www.urawinner.com there is a fantastic surprise awaiting you 
-------------------------
[5m[32m[MATCH][0m: Your [7m[36m2004:['SPACY:POS-NUM'][0m account for 07XXXXXXXXX shows 786 unredeemed points . To claim call 08719181259 I

100%|██████████| 36/36 [00:31<00:00,  1.13it/s]


Length 4 / 5; New candidates = 7137
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


  0%|          | 0/5 [00:00<?, ?it/s]

Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: All boys made fun of me today . Ok i have no problem . I just sent [7m[36mone:['SPACY:POS-NUM'][0m message just for fun 
-------------------------
[5m[32m[MATCH][0m: Me also da , i feel yesterday night   wait til [7m[36m2day:['SPACY:POS-NUM'][0m night dear . 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: FREE entry into our Â£250 weekly comp just send the word ENTER to [7m[36m84128:['SPACY:POS-NUM'][0m NOW . 18 T&C www.textcomp.com cust care 08712405020 . 
-------------------------
[5m[32m[MATCH][0m: Urgent ! Please call [7m[36m09061743810:['SPACY:POS-NUM'][0m from landline . Your ABTA complimentary 4 * Tenerife Holiday or # 5000 cash await collection SAE T&Cs Box 

100%|██████████| 5/5 [00:03<00:00,  1.28it/s]


Length 5 / 5; New candidates = 994
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: The guy ( kadeem ) has n't been selling since the break , I know [7m[36mone:['SPACY:POS-NUM'][0m other guy but he 's paranoid as fuck and does n't like selling without me there and I ca n't be up there til late tonight 
-------------------------
[5m[32m[MATCH][0m: If u dun drive then how i go [7m[36m2:['SPACY:POS-NUM'][0m sch . 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: URGENT This is our 2nd attempt to contact U. Your Â£90

In [21]:
# Show every Example 2 pattern, preceded by its zero-based rank
for idx, p in enumerate(the_patterns_2):
    print(f'Rank {idx}', p, sep='\n')

Rank 0
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Ãœ come lt [7m[36m25:['SPACY:POS-NUM'][0m n pass to me lar 
-------------------------
[5m[32m[MATCH][0m: Ill be at yours in about [7m[36m3:['SPACY:POS-NUM'][0m mins but look out for me 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: URGENT ! We are trying to contact you . Last weekends draw shows that you have won a Â£900 prize GUARANTEED . Call [7m[36m09061701851:['SPACY:POS-NUM'][0m . Claim code K61 . Valid 12hours only 
-------------------------
[5m[32m[MATCH][0m: Phony Â£350 award - Todays Voda numbers ending XXXX are selected to receive a Â£350 award . If you have a match please call [7m[36m08712300220:['SPACY:POS-NUM'][0m quoting claim code 3100 standard rates app 
-------------------------
Rank 1
Pattern

- Sort the patterns by their information gain

In [22]:
# Patterns in the order returned by fit_transform, numbered from 1:
# rank, supported class, coverage (%), precision, pattern id.
for idx, p in enumerate(the_patterns_2):
    print(
        f'{idx+1:>3} {p.support_class}',
        f'{round(p.coverage*100, 1):>4}',
        f'{p.precision:.3f}',
        f'{p.get_pattern_id()}',
        sep='   ',
    )

  1 Negative   26.2   0.526   [['SPACY:POS-NUM']]
  2 Positive    8.9   0.806   [['SPACY:POS-PROPN'], ['SPACY:POS-NUM']]
  3 Positive    5.1   0.984   [['TEXT:call'], ['SPACY:POS-NUM']]
  4 Positive   14.5   0.579   [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
  5 Positive   11.8   0.613   [['SPACY:POS-ADP'], ['SPACY:POS-NUM']]
  6 Positive    6.8   0.811   [['SPACY:POS-NUM'], ['SPACY:POS-PROPN']]
  7 Negative   18.1   0.526   [['SPACY:POS-NOUN'], ['SPACY:POS-PROPN']]
  8 Positive    5.4   0.856   [['TEXT:to'], ['SPACY:POS-NUM']]
  9 Negative   37.8   0.690   [['SPACY:POS-PROPN']]
 10 Positive    7.6   0.702   [['SPACY:POS-NOUN'], ['SPACY:POS-NOUN'], ['SPACY:POS-NUM']]
 11 Positive    7.1   0.722   [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
 12 Positive    5.7   0.797   [['TEXT:.'], ['SPACY:POS-NUM']]
 13 Positive    6.7   0.735   [['SPACY:POS-NUM'], ['SPACY:POS-NUM']]
 14 Negative   20.0   0.585   [['SPACY:POS-PROPN'], ['SPACY:POS-NOUN']]
 15 Positive    8.8   0.597

In [25]:
# Same table, explicitly ordered by the selection metric (highest first), with the metric as an extra column.
print('  #    class Cov(%)    Prec   Gain     Pattern')
for idx, p in enumerate(sorted(the_patterns_2, key=lambda pattern: pattern.metric, reverse=True)):
    print(
        f'{idx+1:>3} {p.support_class}',
        f'{round(p.coverage*100, 1):>4}',
        f'{p.precision:.3f}',
        f'{p.metric:.3f}    {p.get_pattern_id()}',
        sep='   ',
    )

  #    class Cov(%)    Prec   Gain     Pattern
  1 Negative   26.2   0.526   0.223    [['SPACY:POS-NUM']]
  2 Positive    8.9   0.806   0.175    [['SPACY:POS-PROPN'], ['SPACY:POS-NUM']]
  3 Positive    5.1   0.984   0.152    [['TEXT:call'], ['SPACY:POS-NUM']]
  4 Positive   14.5   0.579   0.148    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
  5 Positive   11.8   0.613   0.129    [['SPACY:POS-ADP'], ['SPACY:POS-NUM']]
  6 Positive    6.8   0.811   0.129    [['SPACY:POS-NUM'], ['SPACY:POS-PROPN']]
  7 Negative   18.1   0.526   0.119    [['SPACY:POS-NOUN'], ['SPACY:POS-PROPN']]
  8 Positive    5.4   0.856   0.114    [['TEXT:to'], ['SPACY:POS-NUM']]
  9 Negative   37.8   0.690   0.112    [['SPACY:POS-PROPN']]
 10 Positive    7.6   0.702   0.105    [['SPACY:POS-NOUN'], ['SPACY:POS-NOUN'], ['SPACY:POS-NUM']]
 11 Positive    7.1   0.722   0.104    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
 12 Positive    5.7   0.797   0.101    [['TEXT:.'], ['SPACY:POS-NUM']]
 13 Positive 

- Sort the patterns by their precision

In [23]:
# Highest precision first; negating the key keeps ties in their original order.
for idx, p in enumerate(sorted(the_patterns_2, key=lambda pattern: -pattern.precision)):
    print(
        f'{idx+1:>3} {p.support_class}',
        f'{round(p.coverage*100, 1):>4}',
        f'{p.precision:.3f}',
        f'{p.get_pattern_id()}',
        sep='   ',
    )

  1 Positive    2.2   1.000   [['TEXT:claim']]
  2 Positive    1.6   1.000   [['TEXT:prize']]
  3 Positive    1.2   1.000   [['TEXT:call'], ['SPACY:POS-NUM'], ['SPACY:POS-ADP']]
  4 Positive    1.2   1.000   [['SPACY:POS-PROPN'], ['SPACY:POS-NOUN'], ['TEXT:.'], ['SPACY:POS-NUM']]
  5 Positive    1.9   0.985   [['TEXT:.'], ['TEXT:call'], ['SPACY:POS-NUM']]
  6 Positive    5.1   0.984   [['TEXT:call'], ['SPACY:POS-NUM']]
  7 Negative   37.2   0.983   [['TEXT:i']]
  8 Positive    1.6   0.982   [['HYPERNYM3:call.v.28'], ['SPACY:POS-NUM']]
  9 Positive    1.5   0.982   [['SPACY:POS-NOUN'], ['SPACY:POS-NOUN'], ['TEXT:to'], ['SPACY:POS-NUM']]
 10 Positive    2.0   0.957   [['TEXT:call'], ['SPACY:POS-NUM'], ['SPACY:POS-NOUN']]
 11 Positive    2.4   0.942   [['TEXT:mobile']]
 12 Positive    2.7   0.938   [['TEXT:txt']]
 13 Positive    1.7   0.934   [['SPACY:POS-NUM'], ['TEXT:now']]
 14 Positive    3.4   0.926   [['SPACY:POS-PROPN'], ['SPACY:POS-ADP'], ['SPACY:POS-NUM']]
 15 Positive    2.2   0.

In [29]:
# Precision-ordered table again, this time with a header and the metric column.
print('  #    class Cov(%)    Prec    Gain    Pattern')
for idx, p in enumerate(sorted(the_patterns_2, key=lambda pattern: pattern.precision, reverse=True)):
    print(
        f'{idx+1:>3} {p.support_class}',
        f'{round(p.coverage*100, 1):>4}',
        f'{p.precision:.3f}',
        f'{p.metric:.3f}    {p.get_pattern_id()}',
        sep='   ',
    )

  #    class Cov(%)    Prec    Gain    Pattern
  1 Positive    2.2   1.000   0.064    [['TEXT:claim']]
  2 Positive    1.6   1.000   0.046    [['TEXT:prize']]
  3 Positive    1.2   1.000   0.036    [['TEXT:call'], ['SPACY:POS-NUM'], ['SPACY:POS-ADP']]
  4 Positive    1.2   1.000   0.036    [['SPACY:POS-PROPN'], ['SPACY:POS-NOUN'], ['TEXT:.'], ['SPACY:POS-NUM']]
  5 Positive    1.9   0.985   0.053    [['TEXT:.'], ['TEXT:call'], ['SPACY:POS-NUM']]
  6 Positive    5.1   0.984   0.152    [['TEXT:call'], ['SPACY:POS-NUM']]
  7 Negative   37.2   0.983   0.067    [['TEXT:i']]
  8 Positive    1.6   0.982   0.044    [['HYPERNYM3:call.v.28'], ['SPACY:POS-NUM']]
  9 Positive    1.5   0.982   0.043    [['SPACY:POS-NOUN'], ['SPACY:POS-NOUN'], ['TEXT:to'], ['SPACY:POS-NUM']]
 10 Positive    2.0   0.957   0.051    [['TEXT:call'], ['SPACY:POS-NUM'], ['SPACY:POS-NOUN']]
 11 Positive    2.4   0.942   0.060    [['TEXT:mobile']]
 12 Positive    2.7   0.938   0.067    [['TEXT:txt']]
 13 Positive    1.7   0

## Example 3
Use F_0.05 as the metric for selecting patterns (set via `gain_criteria`), instead of the default criterion used in Examples 1 and 2.

In [8]:
# Example 3 engine: same settings as Example 2, but patterns are scored with 'F_0.05'.
grasp_model_3 = GrASP(
    gaps_allowed=2,
    num_patterns=100,
    include_standard=['TEXT', 'POS', 'SENTIMENT'],
    include_custom=[Hypernym3Attribute],
    correlation_threshold=0.5,
    alphabet_size=100,
    gain_criteria='F_0.05',
)

In [9]:
# Mine patterns with the Example 3 engine on the same spam (positive) / ham (negative) training texts.
the_patterns_3 = grasp_model_3.fit_transform(positive, negative)

  0%|          | 0/488 [00:00<?, ?it/s]

Step 1: Create augmented texts


100%|██████████| 488/488 [00:20<00:00, 23.58it/s]
100%|██████████| 3079/3079 [01:27<00:00, 35.30it/s]


Step 2: Find frequent attributes


  1%|          | 7/1215 [00:00<00:19, 62.64it/s]

Total number of candidate alphabet = 1215, such as ['SPACY:POS-VERB', 'SPACY:POS-NOUN', 'SPACY:POS-PUNCT', 'SPACY:POS-PRON', 'SPACY:POS-ADV']
Step 3: Find alphabet set


100%|██████████| 1215/1215 [00:36<00:00, 33.39it/s]


Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


  0%|          | 0/100 [00:00<?, ?it/s]

Finding top k: 100 / 100
Total number of alphabet = 100
['TEXT:claim', 'TEXT:prize', 'TEXT:i', 'TEXT:won', 'TEXT:...', 'TEXT:150p', 'TEXT:guaranteed', 'TEXT:my', 'HYPERNYM3:assertion.n.01', 'TEXT:;', 'TEXT:tone', 'TEXT:18', 'HYPERNYM3:prize.n.01', 'TEXT:..', 'TEXT:but', 'HYPERNYM3:claim.v.03', 'SPACY:POS-SPACE', 'TEXT:awarded', 'TEXT:â£1000', 'TEXT:150ppm', "TEXT:'m", 'HYPERNYM3:outgo.n.01', 'HYPERNYM3:television_equipment.n.01', 'HYPERNYM3:match.v.01', 'TEXT:urgent', 'TEXT:16', 'TEXT:ok', 'HYPERNYM3:corner.n.08', 'TEXT:it', 'TEXT:come', 'TEXT:me', 'TEXT:that', 'TEXT:10p', 'TEXT:collection', 'TEXT:how', 'TEXT:he', "TEXT:'ll", 'TEXT:got', 'TEXT:service', 'TEXT:mobile', "TEXT:n't", 'TEXT:did', 'TEXT:txt', 'TEXT:going', 'HYPERNYM3:make.v.03', 'HYPERNYM3:then.r.03', 'TEXT:home', 'TEXT:nt', 'HYPERNYM3:not.r.01', 'TEXT:later', 'TEXT:lor', 'HYPERNYM3:cause.v.01', 'TEXT:da', 'TEXT:do', 'TEXT:rate', 'HYPERNYM3:mobile.n.03', 'SPACY:POS-INTJ', 'SPACY:POS-PRON', 'HYPERNYM3:search.v.04', 'TEXT:in',

100%|██████████| 100/100 [00:33<00:00,  2.97it/s]


Length 2 / 5; New candidates = 14950
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['TEXT:claim']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 77 (2.2%)
Gain = 0.064
Metric (F_0.05) = 0.987
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: URGENT ! We are trying to contact you . Last weekends draw shows that you have won a Â£900 prize GUARANTEED . Call 09061701851 . [7m[36mClaim:['TEXT:claim'][0m code K61 . Valid 12hours only 
-------------------------
[5m[32m[MATCH][0m: You have won a guaranteed Â£200 award or even Â£1000 cashto [7m[36mclaim:['TEXT:claim'][0m UR award call free on 08000407165 ( 18 + ) 2 stop getstop on 88222 PHP . RG21 4JX 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Positive:


100%|██████████| 30/30 [00:18<00:00,  1.64it/s]


Length 3 / 5; New candidates = 5872
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


  0%|          | 0/1 [00:00<?, ?it/s]

Finding top k: 100 / 100
Example of current patterns
Pattern: [['TEXT:claim']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 77 (2.2%)
Gain = 0.064
Metric (F_0.05) = 0.987
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: URGENT ! We are trying to contact U Todays draw shows that you have won a Â£800 prize GUARANTEED . Call 09050000460 from land line . [7m[36mClaim:['TEXT:claim'][0m J89 . po box245c2150pm 
-------------------------
[5m[32m[MATCH][0m: + 123 Congratulations - in this week 's competition draw u have won the Â£1450 prize to [7m[36mclaim:['TEXT:claim'][0m just call 09050002311 b4280703 . T&Cs / stop SMS 08718727868 . Over 18 only 150ppm 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Positive:

Pattern: [['TEXT:prize']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 56 (1.6%)
Gain = 0.046
Metric (F_0.05) = 0.981
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: Please call our customer service

100%|██████████| 1/1 [00:00<00:00,  3.40it/s]


Length 4 / 5; New candidates = 199
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


0it [00:00, ?it/s]

Finding top k: 100 / 100
Example of current patterns
Pattern: [['TEXT:claim']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 77 (2.2%)
Gain = 0.064
Metric (F_0.05) = 0.987
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: URGENT ! We are trying to contact U. Todays draw shows that you have won a Â£800 prize GUARANTEED . Call 09050001808 from land line . [7m[36mClaim:['TEXT:claim'][0m M95 . Valid12hrs only 
-------------------------
[5m[32m[MATCH][0m: Congratulations - Thanks to a good friend U have WON the Â£2,000 Xmas prize . 2 [7m[36mclaim:['TEXT:claim'][0m is easy , just call 08712103738 NOW ! Only 10p per minute . BT - national - rate 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Positive:

Pattern: [['TEXT:prize']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 56 (1.6%)
Gain = 0.046
Metric (F_0.05) = 0.981
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: URGENT ! Your Mobile number has been award




In [10]:
# Show every Example 3 pattern, preceded by its zero-based rank
for idx, p in enumerate(the_patterns_3):
    print(f'Rank {idx}', p, sep='\n')

Rank 0
Pattern: [['TEXT:claim']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 77 (2.2%)
Gain = 0.064
Metric (F_0.05) = 0.987
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: Final Chance ! [7m[36mClaim:['TEXT:claim'][0m ur Â£150 worth of discount vouchers today ! Text YES to 85023 now ! SavaMob , member offers mobile ! T Cs SavaMob POBOX84 , M263UZ . Â£3.00 Subs 16 
-------------------------
[5m[32m[MATCH][0m: Uâ€ ™ ve Bin Awarded Â£50 to Play 4 Instant Cash . Call 08715203028 To [7m[36mClaim:['TEXT:claim'][0m . EVERY 9th Player Wins Min Â£50-Â£500 . OptOut 08718727870 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Positive:

Rank 1
Pattern: [['TEXT:prize']]
Window size: 3
Class: Positive
Precision: 1.000
Match: 56 (1.6%)
Gain = 0.046
Metric (F_0.05) = 0.981
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: URGENT ! Your mobile No 077xxx WON a Â£2,000 Bonus Caller [7m[36mPrize:['TEXT:prize'][0m on 02/06/03 

In [18]:
# Metric-ordered table for Example 3; the metric column header is taken from the engine's own setting.
metric_name = grasp_model_3.gain_criteria
print(f'  #    class Cov(%)    Prec  {metric_name}    Pattern')
for idx, p in enumerate(sorted(the_patterns_3, key=lambda pattern: pattern.metric, reverse=True)):
    print(
        f'{idx+1:>3} {p.support_class}',
        f'{round(p.coverage*100, 1):>4}',
        f'{p.precision:.3f}',
        f'{p.metric:.3f}    {p.get_pattern_id()}',
        sep='   ',
    )

  #    class Cov(%)    Prec  F_0.05    Pattern
  1 Positive    2.2   1.000   0.987    [['TEXT:claim']]
  2 Positive    1.6   1.000   0.981    [['TEXT:prize']]
  3 Negative   37.2   0.983   0.980    [['TEXT:i']]
  4 Positive    1.3   1.000   0.978    [['TEXT:won']]
  5 Negative   12.6   0.989   0.975    [['TEXT:...']]
  6 Negative    8.2   0.997   0.973    [['TEXT:i'], ['SPACY:POS-PRON']]
  7 Positive    1.1   1.000   0.971    [['TEXT:150p']]
  8 Positive    1.1   1.000   0.971    [['TEXT:guaranteed']]
  9 Negative   10.7   0.987   0.970    [['TEXT:my']]
 10 Negative   18.0   0.977   0.967    [['SPACY:POS-PRON'], ['SPACY:POS-PRON']]
 11 Negative   11.4   0.983   0.967    [['SPACY:POS-INTJ'], ['SPACY:POS-PRON']]
 12 Positive    0.9   1.000   0.966    [['HYPERNYM3:assertion.n.01']]
 13 Negative    5.4   1.000   0.964    [['TEXT:;']]
 14 Positive    0.8   1.000   0.963    [['TEXT:tone']]
 15 Positive    0.8   1.000   0.963    [['TEXT:18']]
 16 Positive    0.8   1.000   0.962    [['HYPERNYM