# Crawling

In [104]:
from pathlib import Path

In [1]:
# data = {keys=file_name, value=file.read}
data = {}

for doc_path in list(Path('data/documents').iterdir()):
    if doc_path.suffix != '.txt':
        continue
        
    with open(doc_path) as f:
        doc_name = doc_path.stem.replace('_', ' ').title()
        # dic_name[key] = value
        data[doc_name] = f.read()

NameError: name 'Path' is not defined

# Text PreProcessing

In [131]:
from abc import ABC, abstractmethod
from unidecode import unidecode  
import string

class TextProcessor(ABC):
    
    @abstractmethod
    def transform(self, text):
        pass



class ConvertCase(TextProcessor):
    def __init__(self, conversion='lower'):
        self.conversion = conversion
    
    def transform(self, text):
        if self.conversion == 'lower':
            return text.lower()
        elif self.conversion == 'upper':
            return text.upper()
        elif self.conversion == 'title':
            return text.title()
        

        
class RemoveDigit(TextProcessor):
    
    def transform(self, text):
        return ''.join(filter(lambda char: not char.isdigit(), text))

    
    
# import string

class RemovePunkt(TextProcessor):
    
    def transform(self, text):
        return ''.join(filter(lambda char: char not in string.punctuation, text))
    
    
    
class RemoveSpace(TextProcessor):
    se
    def transform(self, text):
        return ' '.join(text.split())
    
    
    
# from unidecode import unidecode  

class StripAccent(TextProcessor):
    
    def transform(self, text):
        return unidecode(text)



class TextPipeline(TextProcessor):
    def __init__(self, *args):
        self.transformers = args
        
    def transform(self, text):
        for tf in self.transformers:
            text = tf.transform(text)
            
        return text
    
    def __repr__(self):
        transformers = '\n '.join([f'Step_{i+1}: {tf.__class__.__name__}' for i, tf in enumerate(self.transformers)])  # enumerate('iterable') --> (0, i)
        return f'Pipeline:\n {transformers}'



In [149]:
pipe = TextPipeline(ConvertCase(),
    StripAccent(),
    RemoveDigit(),
    RemovePunkt(),
    RemoveSpace(),
)                   


In [165]:
# Solution 1

# Processed All Content 
# Creat Dictionary By Dictionary Comprehension

# processed_data = {
#     doc_name: pipe.transform(content) for doc_name, content in data.items()
# }

# Stop Words

In [None]:
# Other Solution - for modular Programming


# self.stop_words_path = stop_words_path
# self.stop_words = self.read_stopwords()
  
    
# def read_stopwords(self):
#         stop_words = open(self.stop_words_path).read().split('\n')
#         stop_words = set(map(pipe.transform, stop_words))
#         print(stop_words)
#         return stop_words


In [256]:
stop_words = open('data/stop_words.txt').read().split('\n')
stop_words = set(map(pipe.transform, stop_words))

# Indexing

In [106]:
# index = {'Word': {'file_name', }, 
#          'Azadi': {'Mahsa amini', 'Mohsen Shekari', ....}
#           }

In [107]:
# index = {}

# for doc_name, doc_content in data.items():
#     words = doc_content.split()
    
#     for word in words:
#         if word in index:
#             index[word].add(doc_name)
#         else:
#             index[word] = {doc_name}


In [261]:
index = {}

for doc_name, doc_content in data.items():
    for word in  doc_content.split():
        word = pipe.transform(word)
        
        # Empty Words
        if not word:
            continue
            
        # Ignore Stop Words
        if word in stop_words:
            continue
        
        # Add to index
        if word in index:
            index[word].add(doc_name)
        else:
            index[word] = {doc_name}

In [296]:
# Print 
from termcolor import colored

def print_success(text):
    print(colored(text, 'green'))

def print_warning(text):
    print(colored(text, 'yelow'))
        
def print_error(text):
    print(colored(text, 'red'))

# Search

In [302]:
while True:
    # Get User Input
    search_input = input('Enter to find a Document: ( Q to Quit )\n')
    search_input = pipe.transform(search_input)
    
    # Quit Search
    if search_input.upper() == 'Q':
        break
    
    
    # Get Input Tokens
    search_tokens = search_input.split()
    
    # Get Relevant Documents
    docs = []
    for token in search_tokens:
        # Ignore Stop Words In Input Search
        if token in stop_words:
            continue
        docs.extend(index.get(token, '*' ))
    

    for doc in docs:
        if doc == '*':
            print_error('Not Result')
        else:
            print_success(f' - {doc}')

Enter to find a Document: ( Q to Quit )
 zan


[31mNot Result[0m


Enter to find a Document: ( Q to Quit )
 zendegi


[31mNot Result[0m


Enter to find a Document: ( Q to Quit )
 azadi


[31mNot Result[0m


Enter to find a Document: ( Q to Quit )
 women


[32m - Mahsa Amini[0m


Enter to find a Document: ( Q to Quit )
 woman


[32m - Mahsa Amini[0m
[32m - Hadis Najafi[0m


Enter to find a Document: ( Q to Quit )
 life


[31mNot Result[0m


Enter to find a Document: ( Q to Quit )
 freedome


[31mNot Result[0m


Enter to find a Document: ( Q to Quit )
 free


[31mNot Result[0m


Enter to find a Document: ( Q to Quit )
 q
