### Libraries

In [1]:
import re
import pandas as pd
import numpy as np

### Reading the given Text files
- Converting them to lower case
- One can also _automate_ this process by reading from the directory. 


In [2]:
def _read_story(path):
    """Return the full text of the UTF-8 file at `path`, converted to lower case.

    The file is opened in a `with` block so its handle is closed as soon as
    the text has been read, rather than being left open for the rest of the
    kernel session.
    """
    with open(path, 'r', encoding='utf-8') as story_file:
        return story_file.read().lower()


# One lower-cased string per story; the variable names are used by later cells.
cinderella = _read_story('cinderella.txt')
jackandthebeanstalk = _read_story("jackandthebeanstalk.txt")
thefarmerandthebadger = _read_story("thefarmerandthebadger.txt")
theprincessandthepea = _read_story("theprincessandthepea.txt")

### Remove punctuation and get unique words

In [3]:
# A single compiled pattern shared by all four stories: it captures runs of
# 3 to 10 letters (first letter either case, the rest lower case) that are
# delimited by word boundaries, so punctuation and digits are left out.
word_pattern = re.compile(r'(\b[A-Za-z][a-z]{2,9}\b)')

# Cinderella: every matched word, then the sorted set of distinct words
cinderella_words = word_pattern.findall(cinderella)
cinderella_words_uniq = sorted(set(cinderella_words))

# Jack and the Bean Stalk
jackandthebeanstalk_words = word_pattern.findall(jackandthebeanstalk)
jackandthebeanstalk_words_uniq = sorted(set(jackandthebeanstalk_words))

# The Farmer and the Badger
thefarmerandthebadger_words = word_pattern.findall(thefarmerandthebadger)
thefarmerandthebadger_words_uniq = sorted(set(thefarmerandthebadger_words))

# The Princess and the Pea
theprincessandthepea_words = word_pattern.findall(theprincessandthepea)
theprincessandthepea_words_uniq = sorted(set(theprincessandthepea_words))

### Creating Bag Of Words

In [4]:
# Vocabulary of the whole collection: the union of the four per-story
# vocabularies, as an alphabetically sorted list.
words_bag = sorted(
    set(cinderella_words_uniq)
    | set(jackandthebeanstalk_words_uniq)
    | set(thefarmerandthebadger_words_uniq)
    | set(theprincessandthepea_words_uniq)
)

# Number of distinct terms across all stories
len(words_bag)

1126

### Boolean term-document incidence matrix

In [5]:
# One row per vocabulary term and one column per story; the cells start out
# empty and are filled in by a later cell.
story_columns = [
    'cinderella',
    'jackandthebeanstalk',
    'thefarmerandthebadger',
    'theprincessandthepea',
]
term_doc_df = pd.DataFrame(index=words_bag, columns=story_columns)
term_doc_df.head()

Unnamed: 0,cinderella,jackandthebeanstalk,thefarmerandthebadger,theprincessandthepea
able,,,,
about,,,,
above,,,,
abundantly,,,,
account,,,,


In [6]:
# Fill the incidence matrix one story column at a time.
#
# Each column becomes a boolean array that is True where the term (the row
# label) occurs in that story's vocabulary. Assigning whole columns avoids
# writing into the `row` objects yielded by `iterrows()`, which pandas does not
# guarantee to propagate back to the DataFrame, and it removes the need for a
# follow-up `fillna(False)` because every cell is set explicitly.
story_vocabularies = {
    'cinderella': cinderella_words_uniq,
    'jackandthebeanstalk': jackandthebeanstalk_words_uniq,
    'thefarmerandthebadger': thefarmerandthebadger_words_uniq,
    'theprincessandthepea': theprincessandthepea_words_uniq,
}
for story, vocabulary in story_vocabularies.items():
    term_doc_df[story] = term_doc_df.index.isin(vocabulary)

term_doc_df.head(10)

Unnamed: 0,cinderella,jackandthebeanstalk,thefarmerandthebadger,theprincessandthepea
able,True,False,True,False
about,False,True,True,True
above,True,False,False,False
abundantly,True,False,False,False
account,True,False,True,False
across,False,False,True,False
adjust,True,False,False,False
admired,True,False,False,False
advised,True,False,False,False
afraid,False,False,True,False


In [7]:
# Summarise each story column: for these True/False columns describe() reports
# the row count, the number of distinct values, the most common value and how
# often it occurs.
term_doc_df.describe()

Unnamed: 0,cinderella,jackandthebeanstalk,thefarmerandthebadger,theprincessandthepea
count,1126,1126,1126,1126
unique,2,2,2,2
top,True,False,True,False
freq,635,944,583,976


### Perform the following Boolean Operations

In [8]:
# term1 AND term2: stories that contain both 'animal' and 'beautiful'
animal_docs = term_doc_df.loc['animal', :]
beautiful_docs = term_doc_df.loc['beautiful', :]
animal_docs & beautiful_docs

cinderella               False
jackandthebeanstalk      False
thefarmerandthebadger     True
theprincessandthepea     False
dtype: bool

In [9]:
# term1 AND NOT (term2 OR term3): stories that contain 'badger' but neither
# 'animal' nor 'country'
badger_docs = term_doc_df.loc['badger', :]
animal_or_country_docs = term_doc_df.loc['animal', :] | term_doc_df.loc['country', :]
badger_docs & ~animal_or_country_docs

cinderella               False
jackandthebeanstalk      False
thefarmerandthebadger    False
theprincessandthepea     False
dtype: bool

### Creating posting list

In [11]:
# Build the posting list of every term: the names of the story columns whose
# incidence flag is True for that term.
#
# All lists are computed first and then attached as one object-dtype column.
# This avoids mutating the DataFrame cell by cell while iterating over it, and
# avoids the placeholder column that had to be cast to `str` before it could
# hold lists.
doc_columns = [col for col in term_doc_df.columns if col != 'posting_list']

# `.eq(True)` yields a plain boolean table; anything that is not True
# (including missing values) counts as "term absent".
incidence_flags = term_doc_df[doc_columns].eq(True).to_numpy()

term_doc_df['posting_list'] = pd.Series(
    [
        [doc for doc, present in zip(doc_columns, flags) if present]
        for flags in incidence_flags
    ],
    index=term_doc_df.index,
    dtype=object,
)

term_doc_df.head(10)

Unnamed: 0,cinderella,jackandthebeanstalk,thefarmerandthebadger,theprincessandthepea,posting_list
able,True,False,True,False,"[cinderella, thefarmerandthebadger]"
about,False,True,True,True,"[jackandthebeanstalk, thefarmerandthebadger, t..."
above,True,False,False,False,[cinderella]
abundantly,True,False,False,False,[cinderella]
account,True,False,True,False,"[cinderella, thefarmerandthebadger]"
across,False,False,True,False,[thefarmerandthebadger]
adjust,True,False,False,False,[cinderella]
admired,True,False,False,False,[cinderella]
advised,True,False,False,False,[cinderella]
afraid,False,False,True,False,[thefarmerandthebadger]


In [13]:
# Show the full row for the term 'account': its per-story incidence flags and
# its posting list.
term_doc_df.loc['account']

cinderella                                              True
jackandthebeanstalk                                    False
thefarmerandthebadger                                   True
theprincessandthepea                                   False
posting_list             [cinderella, thefarmerandthebadger]
Name: account, dtype: object

In [12]:
# Show the full row for the term 'asked': its per-story incidence flags and
# its posting list.
term_doc_df.loc['asked']

cinderella                                                            True
jackandthebeanstalk                                                   True
thefarmerandthebadger                                                 True
theprincessandthepea                                                  True
posting_list             [cinderella, jackandthebeanstalk, thefarmerand...
Name: asked, dtype: object