In [18]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
import re, string

In [4]:
data=pd.read_csv('glossary_training_data2.csv')

In [5]:
data.head()

Unnamed: 0,glossary1,glossary2,is_related,is_duplicate
0,Postal Code,Postal Index numbers,1,1
1,Postal Code,Zip Code,1,1
2,Postal Code,PIN code,1,1
3,Gender Identity / Expression,Sexual Orientation,1,0
4,Credit Score,Credit Card,1,0


In [6]:
data.shape

(816, 4)

In [178]:
def cleaned_text(text):
    clean = re.sub("\n"," ",text)
    clean=clean.lower()
    clean=re.sub(r"[~.,%/:;?_&+*=!-]"," ",clean)
    clean=re.sub("[^a-z]"," ",clean)
    clean=clean.lstrip()
    clean=clean.rstrip()
    clean=re.sub("\s{2,}"," ",clean)
    return clean
data["cleaned_glossary1"]=data["glossary1"].apply(cleaned_text)
data["cleaned_glossary2"]=data["glossary2"].apply(cleaned_text)
data.head()

Unnamed: 0,glossary1,glossary2,is_related,is_duplicate,cleaned_glossary1,cleaned_glossary2
0,Postal Code,Postal Index numbers,1,1,postal code,postal index numbers
1,Postal Code,Zip Code,1,1,postal code,zip code
2,Postal Code,PIN code,1,1,postal code,pin code
3,Gender Identity / Expression,Sexual Orientation,1,0,gender identity expression,sexual orientation
4,Credit Score,Credit Card,1,0,credit score,credit card


In [21]:
related=data[data['is_related']==1]

In [22]:
related.shape #427 related pairs

(427, 6)

In [24]:
related.head()

Unnamed: 0,glossary1,glossary2,is_related,is_duplicate,cleaned_glossary1,cleaned_glossary2
0,Postal Code,Postal Index numbers,1,1,postal code,postal index numbers
1,Postal Code,Zip Code,1,1,postal code,zip code
2,Postal Code,PIN code,1,1,postal code,pin code
3,Gender Identity / Expression,Sexual Orientation,1,0,gender identity expression,sexual orientation
4,Credit Score,Credit Card,1,0,credit score,credit card


In [31]:
gloss1=list(related['cleaned_glossary1'].apply(lambda x: str(x)))
gloss2=list(related['cleaned_glossary2'].apply(lambda x: str(x)))

In [52]:
gloss1[:10]

['postal code',
 'postal code',
 'postal code',
 'gender identity expression',
 'credit score',
 'benefits and entitlements data ',
 'benefits and entitlements data ',
 'benefits and entitlements data ',
 'benefits and entitlements data ',
 'currency ']

In [40]:
gloss2[:10]

['postal index numbers ',
 'zip code',
 'pin code ',
 'sexual orientation',
 'credit card',
 'benefits ',
 'entitlements ',
 'compensation ',
 'unemployment insurance',
 'bills']

In [55]:
#create related pair list
pair=[]
for i in range(len(gloss1)):
    pair_lst=[]
    pair_lst.append(gloss1[i].rstrip())  #.rstrip() remove the space after text
    pair_lst.append(gloss2[i].rstrip())
    pair.append(pair_lst)

In [57]:
pair[:10]

[['postal code', 'postal index numbers'],
 ['postal code', 'zip code'],
 ['postal code', 'pin code'],
 ['gender identity expression', 'sexual orientation'],
 ['credit score', 'credit card'],
 ['benefits and entitlements data', 'benefits'],
 ['benefits and entitlements data', 'entitlements'],
 ['benefits and entitlements data', 'compensation'],
 ['benefits and entitlements data', 'unemployment insurance'],
 ['currency', 'bills']]

In [58]:
#convert list to dataframe with boolean values
te=TransactionEncoder()
te_array=te.fit(pair).transform(pair)
df=pd.DataFrame(te_array, columns=te.columns_)

In [61]:
df.head(10)

Unnamed: 0,aadhaar india,aba routing number,academic interests,academic records,academic transcripts,access token,account age,account balance,account information,account number,...,weight,western astrology,willingness to purchase a good,work measurement,workers compensation claims,workplace policy,written signatures,x pixel,zip code,zodiac
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [73]:
#find frequently occurring itemsets using apriory algorithm
from mlxtend.frequent_patterns import apriori
frequent_itemsets_ap = apriori(df, min_support=0.0001, use_colnames=True)

In [74]:
print(frequent_itemsets_ap)

      support                                           itemsets
0    0.004684                                    (aadhaar india)
1    0.004684                               (aba routing number)
2    0.002342                               (academic interests)
3    0.002342                                 (academic records)
4    0.009368                             (academic transcripts)
5    0.002342                                     (access token)
6    0.009368                                      (account age)
7    0.004684                                  (account balance)
8    0.002342                              (account information)
9    0.004684                                   (account number)
10   0.009368                                 (account password)
11   0.002342                         (account password history)
12   0.004684                               (ach routing number)
13   0.002342                                (additional salary)
14   0.002342            

In [75]:
#find frequently occurring itemsets using F-P Growth
from mlxtend.frequent_patterns import fpgrowth
frequent_itemsets_fp=fpgrowth(df, min_support=0.0001, use_colnames=True)

In [76]:
print(frequent_itemsets_fp)

      support                                           itemsets
0    0.007026                                      (postal code)
1    0.002342                             (postal index numbers)
2    0.004684                                         (zip code)
3    0.002342                                         (pin code)
4    0.007026                       (gender identity expression)
5    0.002342                               (sexual orientation)
6    0.009368                                     (credit score)
7    0.007026                                      (credit card)
8    0.009368                   (benefits and entitlements data)
9    0.004684                                         (benefits)
10   0.002342                                     (entitlements)
11   0.007026                                     (compensation)
12   0.002342                           (unemployment insurance)
13   0.009368                                         (currency)
14   0.002342            

In [119]:
#mine the association rules
from mlxtend.frequent_patterns import association_rules
rules_ap = association_rules(frequent_itemsets_ap, metric="confidence", min_threshold=0.1)


rules_fp = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.1)


In [120]:
print(rules_ap.head(10))

               antecedents              consequents  antecedent support  \
0          (aadhaar india)  (identification number)            0.004684   
1  (identification number)          (aadhaar india)            0.002342   
2          (aadhaar india)        (passport number)            0.004684   
3     (aba routing number)     (ach routing number)            0.004684   
4     (ach routing number)     (aba routing number)            0.004684   
5     (aba routing number)   (check routing number)            0.004684   
6   (check routing number)     (aba routing number)            0.002342   
7   (academic transcripts)     (academic interests)            0.009368   
8     (academic interests)   (academic transcripts)            0.002342   
9   (academic transcripts)       (academic records)            0.009368   

   consequent support   support  confidence    lift  leverage  conviction  
0            0.002342  0.002342        0.50  213.50  0.002331    1.995316  
1            0.004684 

In [121]:
print(rules_fp.head())

              antecedents             consequents  antecedent support  \
0           (postal code)  (postal index numbers)            0.007026   
1  (postal index numbers)           (postal code)            0.002342   
2           (postal code)              (zip code)            0.007026   
3              (zip code)           (postal code)            0.004684   
4        (state province)              (zip code)            0.007026   

   consequent support   support  confidence        lift  leverage  conviction  
0            0.002342  0.002342    0.333333  142.333333  0.002325    1.496487  
1            0.007026  0.002342    1.000000  142.333333  0.002325         inf  
2            0.004684  0.002342    0.333333   71.166667  0.002309    1.492974  
3            0.007026  0.002342    0.500000   71.166667  0.002309    1.985948  
4            0.004684  0.002342    0.333333   71.166667  0.002309    1.492974  


In [122]:
rules_fp['consequents'][rules_fp.antecedents=='{zip code}']

Series([], Name: consequents, dtype: object)

In [123]:
df_rules_fp=pd.DataFrame(rules_fp)

In [124]:
df_rules_fp.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(postal code),(postal index numbers),0.007026,0.002342,0.002342,0.333333,142.333333,0.002325,1.496487
1,(postal index numbers),(postal code),0.002342,0.007026,0.002342,1.0,142.333333,0.002325,inf
2,(postal code),(zip code),0.007026,0.004684,0.002342,0.333333,71.166667,0.002309,1.492974
3,(zip code),(postal code),0.004684,0.007026,0.002342,0.5,71.166667,0.002309,1.985948
4,(state province),(zip code),0.007026,0.004684,0.002342,0.333333,71.166667,0.002309,1.492974
5,(zip code),(state province),0.004684,0.007026,0.002342,0.5,71.166667,0.002309,1.985948
6,(postal code),(pin code),0.007026,0.002342,0.002342,0.333333,142.333333,0.002325,1.496487
7,(pin code),(postal code),0.002342,0.007026,0.002342,1.0,142.333333,0.002325,inf
8,(sexual orientation),(gender identity expression),0.002342,0.007026,0.002342,1.0,142.333333,0.002325,inf
9,(gender identity expression),(sexual orientation),0.007026,0.002342,0.002342,0.333333,142.333333,0.002325,1.496487


In [126]:
df_rules_fp['antecedents'][0]

frozenset({'postal code'})

In [129]:
df_rules_fp[df_rules_fp['antecedents']=={'postal code'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(postal code),(postal index numbers),0.007026,0.002342,0.002342,0.333333,142.333333,0.002325,1.496487
2,(postal code),(zip code),0.007026,0.004684,0.002342,0.333333,71.166667,0.002309,1.492974
6,(postal code),(pin code),0.007026,0.002342,0.002342,0.333333,142.333333,0.002325,1.496487


In [133]:
df_rules_fp[df_rules_fp['antecedents']=={'social security number'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
16,(social security number),(credit card),0.009368,0.007026,0.002342,0.25,35.583333,0.002276,1.323966
199,(social security number),(individual taxpayer identification number),0.009368,0.009368,0.002342,0.25,26.6875,0.002254,1.320843
203,(social security number),(national identification number),0.009368,0.009368,0.002342,0.25,26.6875,0.002254,1.320843
205,(social security number),(nationality),0.009368,0.018735,0.002342,0.25,13.34375,0.002166,1.308353


In [134]:
df_rules_fp[df_rules_fp['antecedents']=={'credit card'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
15,(credit card),(credit score),0.007026,0.009368,0.002342,0.333333,35.583333,0.002276,1.485948
17,(credit card),(social security number),0.007026,0.009368,0.002342,0.333333,35.583333,0.002276,1.485948
452,(credit card),(payment history),0.007026,0.004684,0.002342,0.333333,71.166667,0.002309,1.492974


In [146]:
df_rules_fp[df_rules_fp['antecedents']=={'bank account information'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
535,(bank account information),(bank account number),0.004684,0.016393,0.002342,0.5,30.5,0.002265,1.967213
537,(bank account information),(routing number),0.004684,0.002342,0.002342,0.5,213.5,0.002331,1.995316


In [151]:
df_rules_fp[df_rules_fp['antecedents']=={'voiceprint'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
304,(voiceprint),(voice recognition),0.004684,0.01171,0.002342,0.5,42.7,0.002287,1.976581
306,(voiceprint),(fingerprint),0.004684,0.01171,0.002342,0.5,42.7,0.002287,1.976581


In [149]:
df_rules_fp[df_rules_fp['antecedents']=={'height'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
267,(height),(physical descriptions),0.009368,0.009368,0.002342,0.25,26.6875,0.002254,1.320843
269,(height),(weight),0.009368,0.01171,0.004684,0.5,42.7,0.004574,1.976581
271,(height),(vertical distance),0.009368,0.002342,0.002342,0.25,106.75,0.00232,1.330211


In [152]:
df_rules_fp[df_rules_fp['antecedents']=={'age'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
815,(age),(marital status),0.002342,0.009368,0.002342,1.0,106.75,0.00232,inf


In [161]:
df_rules_fp[df_rules_fp['antecedents']=={'social security number'}]['consequents']

16                                   (credit card)
199    (individual taxpayer identification number)
203               (national identification number)
205                                  (nationality)
Name: consequents, dtype: object

In [162]:
def display_output(input_term):
    return df_rules_fp[df_rules_fp['antecedents']=={input_term}]['consequents']

In [163]:
a='social security number'
display_output(a)

16                                   (credit card)
199    (individual taxpayer identification number)
203               (national identification number)
205                                  (nationality)
Name: consequents, dtype: object

In [180]:
a=input()
display_output(a)

 social security number


16                                   (credit card)
199    (individual taxpayer identification number)
203               (national identification number)
205                                  (nationality)
Name: consequents, dtype: object