In [2]:
# Enable IPython's autoreload extension in mode 2 (reload all imported modules
# before running each cell) -- presumably so that edits to the `grasp` source
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import grasp
from grasp import GrASP, CustomAttribute, remove_specialized_patterns
from sklearn.model_selection import train_test_split

## Load the data
- Download and unzip the spam dataset **if you have not done this before**

In [None]:
import os
import urllib.request

url = 'http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip'
filename = './data/smsspamcollection.zip'

# urlretrieve does not create missing parent directories, so make sure the
# target folder exists before writing into it.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Only hit the network when the archive is not already on disk, so that
# re-running the whole notebook does not download the dataset again.
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

In [None]:
# Extract the downloaded archive into ./data (needs the `unzip` command-line tool on the PATH).
!unzip ./data/smsspamcollection.zip -d ./data

- Load the data

In [4]:
def get_data(path='data/SMSSpamCollection.txt', encoding=None):
    """Load the SMS spam corpus into parallel lists of texts and labels.

    Every non-blank line of the file is expected to hold a label (``ham`` or
    ``spam``), a tab character, and then the message text. Leading/trailing
    whitespace of each line is stripped before parsing, and only the first
    tab is treated as the separator.

    Args:
        path: Location of the tab-separated corpus file. Defaults to the
            path used by this notebook.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default; pass e.g. ``'utf-8'`` explicitly if non-ASCII
            characters come out garbled.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists, where each label
        is ``0`` for ham and ``1`` for spam.

    Raises:
        ValueError: If a non-blank line has no tab separator, or if its
            label is neither ``ham`` nor ``spam``.
    """
    label_to_id = {'ham': 0, 'spam': 1}
    texts, labels = [], []
    # The context manager guarantees the file is closed, even on a parse error.
    with open(path, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            label, separator, text = line.partition('\t')
            if not separator:
                raise ValueError(f"Line {line_no} has no tab separator - {line!r}")
            if label not in label_to_id:
                raise ValueError(f"Invalid label - {label}")
            texts.append(text)
            labels.append(label_to_id[label])
    return texts, labels

In [5]:
# Read the corpus, hold out 20% of it for testing, then hold out 20% of the
# remaining training portion for validation. Both splits use the same seed.
texts, labels = get_data()
split_options = dict(test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)
# Displayed summary: (#messages, #spam, #test messages, #spam among the test messages)
(len(texts), sum(labels), len(X_test), sum(y_test))

(5574, 747, 1115, 138)

In [6]:
# Partition the training messages by label: truthy labels (spam, encoded as 1)
# form the positive class, falsy labels (ham, encoded as 0) the negative class.
positive = [message for message, label in zip(X_train, y_train) if label]
negative = [message for message, label in zip(X_train, y_train) if not label]

## Run GrASP

In [7]:
# Custom attribute built from grasp's hypernym helpers, registered under the name HYPERNYM3
hypernym_attribute = CustomAttribute(
    name = 'HYPERNYM3',
    extraction_function = grasp.get_custom_hypernym_extraction_function(above = 3),
    translation_function = grasp._hypernym_translation,
)

# Create the GrASP engine
grasp_model = GrASP(
    gaps_allowed = 2,
    num_patterns = 100,
    include_standard = ['TEXT', 'POS', 'SENTIMENT'],
    include_custom = [hypernym_attribute],
    correlation_threshold = 0.5,
    alphabet_size = 200,
)

In [8]:
# Fit GrASP to the dataset, passing the positive (spam) examples first and the negative (ham) examples second.
# NOTE: this is the slow step of the notebook -- the progress bars recorded in
# the output of this cell add up to several minutes.
# The returned patterns are what the following cells print, filter and export.
the_patterns = grasp_model.fit_transform(positive, negative)

  0%|          | 0/488 [00:00<?, ?it/s]

Step 1: Create augmented texts


100%|██████████| 488/488 [00:17<00:00, 27.72it/s]
100%|██████████| 3079/3079 [01:21<00:00, 37.93it/s]


Step 2: Find frequent attributes


  0%|          | 5/1215 [00:00<00:26, 45.58it/s]

Total number of candidate alphabet = 1215, such as ['SPACY:POS-VERB', 'SPACY:POS-NOUN', 'SPACY:POS-PUNCT', 'SPACY:POS-PRON', 'SPACY:POS-ADV']
Step 3: Find alphabet set


100%|██████████| 1215/1215 [00:27<00:00, 43.56it/s]


Finding top k: 20 / 200
Finding top k: 40 / 200
Finding top k: 60 / 200
Finding top k: 80 / 200
Finding top k: 100 / 200
Finding top k: 120 / 200
Finding top k: 140 / 200
Finding top k: 160 / 200
Finding top k: 180 / 200
Finding top k: 200 / 200


  0%|          | 0/200 [00:00<?, ?it/s]

Total number of alphabet = 200
['SPACY:POS-NUM', 'SPACY:POS-PROPN', 'TEXT:call', 'TEXT:i', 'SPACY:POS-SYM', 'TEXT:free', 'TEXT:txt', 'TEXT:claim', 'TEXT:!', 'TEXT:mobile', 'HYPERNYM3:cost.n.01', 'SPACY:POS-ADP', 'HYPERNYM3:message.n.02', 'TEXT:to', 'TEXT:prize', 'HYPERNYM3:communication.n.02', 'HYPERNYM3:win.v.01', 'SENTIMENT:pos', 'HYPERNYM3:symbol.n.01', 'HYPERNYM3:statement.n.01', 'TEXT:your', 'SPACY:POS-ADJ', 'TEXT:or', 'TEXT:text', 'TEXT:stop', 'TEXT:-', 'TEXT:150p', 'TEXT:guaranteed', 'TEXT:urgent', 'HYPERNYM3:textbook.n.01', 'HYPERNYM3:act.n.02', 'TEXT:win', 'HYPERNYM3:oppose.v.01', 'TEXT:+', 'HYPERNYM3:written_communication.n.01', 'SPACY:POS-PRON', 'TEXT:16', 'TEXT:cash', 'HYPERNYM3:abstraction.n.06', 'TEXT:from', 'HYPERNYM3:assertion.n.01', 'TEXT:reply', 'TEXT:now', 'TEXT:.', 'TEXT:tone', 'TEXT:18', 'HYPERNYM3:acquisition.n.02', 'SPACY:POS-DET', 'TEXT:nokia', 'TEXT:a', 'HYPERNYM3:user.n.01', 'TEXT:our', 'HYPERNYM3:mobile.n.02', 'HYPERNYM3:minute.n.01', 'HYPERNYM3:person.n.01',

100%|██████████| 200/200 [03:04<00:00,  1.08it/s]


Length 2 / 5; New candidates = 59900
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Yar i wanted [7m[36m2:['SPACY:POS-NUM'][0m scold u yest but late already ... I where got zhong se qing you ? If u ask me b4 he ask me then i 'll go out w u all lor . N u still can act so real . 
-------------------------
[5m[32m[MATCH][0m: So u gon na get deus [7m[36mex:['SPACY:POS-NUM'][0m ? 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: Ur cash - balance is currently [7m[36m500:['SPACY:POS-NUM'][0m pounds - to maximize u

100%|██████████| 70/70 [02:12<00:00,  1.89s/it]


Length 3 / 5; New candidates = 27723
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Will be office around [7m[36m4:['SPACY:POS-NUM'][0m pm . Now i am going hospital . 
-------------------------
[5m[32m[MATCH][0m: My trip was ok but quite tiring lor . Uni starts today but it 's ok [7m[36m4:['SPACY:POS-NUM'][0m me cos i 'm not taking any modules but jus concentrating on my final yr project . 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: Call Germany for only [7m[36m1:['SPACY:POS-NUM'][0m pence per minute !

100%|██████████| 36/36 [00:59<00:00,  1.66s/it]


Length 4 / 5; New candidates = 14334
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100


  0%|          | 0/4 [00:00<?, ?it/s]

Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: Alex knows a guy who sells mids but he 's down in south tampa and I do n't think I could set it up before like [7m[36m8:['SPACY:POS-NUM'][0m 
-------------------------
[5m[32m[MATCH][0m: [7m[36m8:['SPACY:POS-NUM'][0m at the latest , g 's still there if you can scrounge up some ammo and want to give the new ak a try 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: SMS AUCTION You have won a Nokia [7m[36m7250i:['SPACY:POS-NUM'][0m . This is what you get when you win our FREE auction . To take part send Nokia to 86021 now . HG / Suite342/2Lands Row / W1JHL 16 + 
-------------------------
[5m[32m[MATCH][0m: Gr8 Poly tones [7m[36m4:['SPACY:POS-NUM'][0m ALL mobs direct 2u 

100%|██████████| 4/4 [00:06<00:00,  1.70s/it]


Length 5 / 5; New candidates = 1595
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Example of current patterns
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: After my work ah ... Den [7m[36m6:['SPACY:POS-NUM'][0m plus lor ... U workin oso rite ... Den go orchard lor , no other place to go liao ... 
-------------------------
[5m[32m[MATCH][0m: Hi Jon , Pete here , I ve bin [7m[36m2:['SPACY:POS-NUM'][0m Spain recently & hav sum dinero left , Bill said u or ur Â‘rents mayb interested in it , I hav 12,000pes , so around Â£48 , tb , James . 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m:

In [9]:
# Show every learned pattern in full, preceded by its 1-based rank
for rank, pattern in enumerate(the_patterns, start=1):
    print(f'Rank {rank}')
    print(pattern)

Rank 1
Pattern: [['SPACY:POS-NUM']]
Window size: 3
Class: Negative
Precision: 0.526
Match: 936 (26.2%)
Gain = 0.223
Metric (global) = 0.223
[5m[7m[32mExamples[0m ~ Class Negative:
[5m[32m[MATCH][0m: [7m[36mOne:['SPACY:POS-NUM'][0m of the joys in lifeis waking up each daywith thoughts that somewhereSomeone cares enough tosend a warm morning greeting .. - 
-------------------------
[5m[32m[MATCH][0m: I will reach before [7m[36mten:['SPACY:POS-NUM'][0m morning 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Negative:
[5m[32m[MATCH][0m: Your free ringtone is waiting to be collected . Simply text the password " MIX " to [7m[36m85069:['SPACY:POS-NUM'][0m to verify . Get Usher and Britney . FML , PO Box 5249 , MK17 92H. 450Ppw 16 
-------------------------
[5m[32m[MATCH][0m: [7m[36m449050000301:['SPACY:POS-NUM'][0m You have won a Â£2,000 price ! To claim , call 09050000301 . 
-------------------------
Rank 2
Pattern: [['SPACY:POS-PROPN'], ['

Rank 64
Pattern: [['TEXT:!'], ['SPACY:POS-NUM']]
Window size: 4
Class: Positive
Precision: 0.851
Match: 87 (2.4%)
Gain = 0.047
Metric (global) = 0.047
[5m[7m[32mExamples[0m ~ Class Positive:
[5m[32m[MATCH][0m: Congrats [7m[36m!:['TEXT:!'][0m [7m[36m1:['SPACY:POS-NUM'][0m year special cinema pass for 2 is yours . call 09061209465 now ! C Suprman V , Matrix3 , StarWars3 , etc all 4 FREE ! bx420-ip4 - 5we . 150pm . Do nt miss out ! 
-------------------------
[5m[32m[MATCH][0m: Ur ringtone service has changed [7m[36m!:['TEXT:!'][0m [7m[36m25:['SPACY:POS-NUM'][0m Free credits ! Go to club4mobiles.com to choose content now ! Stop ? txt CLUB STOP to 87070 . 150p / wk Club4 PO Box1146 MK45 2WT 
-------------------------
[5m[7m[31mCounterexamples[0m ~ Not class Positive:
[5m[32m[MATCH][0m: make that 3 [7m[36m!:['TEXT:!'][0m [7m[36m4:['SPACY:POS-NUM'][0m fucks sake ? ! x 
-------------------------
[5m[32m[MATCH][0m: I m sorry bout last nite it wasnÂ’t ur fau

In [10]:
# One-line-per-pattern summary table of all learned patterns
print('  #    class Cov(%)    Prec  Gain    Pattern')
for rank, pattern in enumerate(the_patterns, start=1):
    coverage_pct = round(pattern.coverage*100, 1)
    print(f'{rank:>3} {pattern.support_class}   {coverage_pct:>4}   {pattern.precision:.3f}   {pattern.metric:.3f}    {pattern.get_pattern_id()}')

  #    class Cov(%)    Prec  Gain    Pattern
  1 Negative   26.2   0.526   0.223    [['SPACY:POS-NUM']]
  2 Positive    8.9   0.806   0.175    [['SPACY:POS-PROPN'], ['SPACY:POS-NUM']]
  3 Positive    5.1   0.984   0.152    [['TEXT:call'], ['SPACY:POS-NUM']]
  4 Positive   14.5   0.579   0.148    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
  5 Positive   11.8   0.613   0.129    [['SPACY:POS-ADP'], ['SPACY:POS-NUM']]
  6 Positive    6.8   0.811   0.129    [['SPACY:POS-NUM'], ['SPACY:POS-PROPN']]
  7 Negative   18.1   0.526   0.119    [['SPACY:POS-NOUN'], ['SPACY:POS-PROPN']]
  8 Positive    5.4   0.856   0.114    [['TEXT:to'], ['SPACY:POS-NUM']]
  9 Negative   37.8   0.690   0.112    [['SPACY:POS-PROPN']]
 10 Positive    7.6   0.702   0.105    [['SPACY:POS-NOUN'], ['SPACY:POS-NOUN'], ['SPACY:POS-NUM']]
 11 Positive    7.1   0.722   0.104    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
 12 Positive    5.7   0.797   0.101    [['TEXT:.'], ['SPACY:POS-NUM']]
 13 Positive   

## Post-process the patterns

In [11]:
# Keep only the patterns whose precision is at least 0.70
min_precision = 0.70
selected_patterns = [pattern for pattern in the_patterns if pattern.precision >= min_precision]
print(f'No. of remaining patterns = {len(selected_patterns)}')

No. of remaining patterns = 54


In [12]:
# Prune redundant patterns: a pattern p2 is removed when the set also contains
# a pattern p1 such that p2 is a specialization of p1 and p2 scores lower than
# p1 on the chosen metric (precision here).
def by_precision(pattern):
    """Return the precision of ``pattern``, used as the comparison metric."""
    return pattern.precision

selected_patterns = remove_specialized_patterns(selected_patterns, metric = by_precision)
print(f'No. of remaining patterns = {len(selected_patterns)}')

No. of remaining patterns = 44


In [13]:
# Order the surviving patterns from highest to lowest precision and tabulate them
selected_patterns = sorted(selected_patterns, key = lambda pattern: pattern.precision, reverse = True)
print('  #    class Cov(%)    Prec  Gain    Pattern')
for rank, pattern in enumerate(selected_patterns, start=1):
    coverage_pct = round(pattern.coverage*100, 1)
    print(f'{rank:>3} {pattern.support_class}   {coverage_pct:>4}   {pattern.precision:.3f}   {pattern.metric:.3f}    {pattern.get_pattern_id()}')

  #    class Cov(%)    Prec  Gain    Pattern
  1 Positive    2.2   1.000   0.064    [['TEXT:claim']]
  2 Positive    1.6   1.000   0.046    [['TEXT:prize']]
  3 Positive    1.9   0.985   0.053    [['TEXT:.'], ['TEXT:call'], ['SPACY:POS-NUM']]
  4 Positive    5.1   0.984   0.152    [['TEXT:call'], ['SPACY:POS-NUM']]
  5 Negative   37.2   0.983   0.067    [['TEXT:i']]
  6 Positive    1.5   0.982   0.043    [['SPACY:POS-NOUN'], ['SPACY:POS-NOUN'], ['TEXT:to'], ['SPACY:POS-NUM']]
  7 Positive    2.4   0.942   0.060    [['TEXT:mobile']]
  8 Positive    2.7   0.938   0.067    [['TEXT:txt']]
  9 Positive    1.7   0.934   0.041    [['SPACY:POS-NUM'], ['TEXT:now']]
 10 Positive    3.4   0.926   0.082    [['SPACY:POS-PROPN'], ['SPACY:POS-ADP'], ['SPACY:POS-NUM']]
 11 Positive    2.2   0.923   0.052    [['HYPERNYM3:communication.n.02'], ['SPACY:POS-NUM']]
 12 Positive    2.2   0.922   0.051    [['HYPERNYM3:win.v.01', 'SENTIMENT:pos']]
 13 Positive    1.6   0.914   0.037    [['SPACY:POS-PROPN'], [

## Save the patterns to a json file
This JSON file can be used as input to the web demo tool for exploring the learned patterns and the training data.

In [14]:
# Write the selected patterns, together with a free-text note on how they were chosen, to a JSON file
grasp_model.to_json(
    'case_study_1.json',
    patterns = selected_patterns,
    comment = 'Rank and group patterns based on precision. The minimum precision was set at 0.70',
)

100%|██████████| 44/44 [00:00<00:00, 298.09it/s]
100%|██████████| 44/44 [00:00<00:00, 261.05it/s]


Successfully dump the results to case_study_1.json
