### ToDos:
* Add command-line arguments for the input and output folder paths.
* Add logging.
* Add tests.

In [8]:
import logging
import os
import re

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [24]:
class LocalFilesSystem(object):
    """Read input files from one directory and write results to another.

    Every method is best-effort: an I/O failure is logged as a warning and
    reported through the return value (``None`` or ``[]``) instead of being
    raised. Programming errors such as a ``TypeError`` from a non-string
    argument are not caught and propagate to the caller.
    """

    _log = logging.getLogger(__name__)

    def __init__(self, input_dir, output_dir):
        """Store the two directory paths; nothing on disk is touched here.

        Args:
            input_dir: Directory scanned by ``list_files_in_input``.
            output_dir: Directory that ``save_file`` writes into.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir

    def load_file(self, file_name):
        """Return the lines of a text file, or ``None`` if it cannot be read.

        Args:
            file_name: Path opened exactly as given (it is NOT joined with
                ``input_dir``).

        Returns:
            A list of lines with their line endings preserved, or ``None``
            when the file cannot be opened or decoded.
        """
        try:
            with open(file_name, 'r') as f:
                return f.readlines()
        except (OSError, ValueError) as err:
            # ValueError also covers UnicodeDecodeError raised while reading.
            self._log.warning("Could not read %r: %s", file_name, err)
            return None

    def save_file(self, data, file_name):
        """Write ``data`` to ``file_name`` inside the output directory.

        The output directory is created first when it does not exist yet, so
        a fresh run does not silently lose every result.

        Args:
            data: Text to write.
            file_name: File name relative to ``output_dir``.

        Returns:
            ``None`` in every case; failures are logged as warnings.
        """
        final_file_name = os.path.join(self.output_dir, file_name)
        try:
            # An empty output_dir means "current directory": nothing to create.
            if self.output_dir:
                os.makedirs(self.output_dir, exist_ok=True)
            with open(final_file_name, "w") as f:
                f.write(data)
        except (OSError, ValueError) as err:
            self._log.warning("Could not write %r: %s", final_file_name, err)
        return None

    def list_files_in_input(self):
        """Return the paths of the regular files directly inside ``input_dir``.

        Returns:
            Paths (``input_dir`` joined with each entry name) sorted by entry
            name, so repeated runs process files in the same order.
            Sub-directories are left out because they cannot be loaded as
            text. An empty list is returned when the directory cannot be
            listed.
        """
        try:
            entry_names = sorted(os.listdir(self.input_dir))
        except (OSError, ValueError) as err:
            self._log.warning("Could not list %r: %s", self.input_dir, err)
            return []
        candidate_paths = (
            os.path.join(self.input_dir, name) for name in entry_names
        )
        return [path for path in candidate_paths if os.path.isfile(path)]
            
class PreProcessing(object):
    """Reduce text files to lists of lower-cased, lemmatized noun tokens."""

    def __init__(self):
        """Build the NLTK helpers used by ``get_filtered_token_list``."""
        self.wordnet_lemmatizer = WordNetLemmatizer()
        # Tag prefix to keep: any POS tag that starts with "NN" is retained.
        self.tag_only = 'NN'
        # Matches every character that is not an ASCII letter; matches are
        # deleted from each token before it is stored.
        self.pattern_2_save = re.compile(r'[^a-zA-Z]')
        # A set gives O(1) membership tests in the per-token filter.
        self.stop_words = set(stopwords.words('english'))

    def convert_list_2_str(self, token_list):
        """Join the tokens into one string, one token per line."""
        return '\n'.join(token_list)

    def preprocessing_folder(self, l):
        """Process every input file and save its tokens as ``<n>.txt``.

        Args:
            l: File-system helper providing ``list_files_in_input()``,
                ``load_file(path)`` and ``save_file(data, name)``.

        ``n`` is the 1-based position of the file in the listing. A file
        whose ``load_file`` result is ``None`` is skipped (its number is not
        reused), instead of aborting the whole run.
        """
        all_files_in_folder = l.list_files_in_input()
        for count, each_file in enumerate(all_files_in_folder, 1):
            data = l.load_file(each_file)
            if data is None:
                logging.getLogger(__name__).warning(
                    "Skipping unreadable input file %r", each_file)
                continue
            filtered_tokens = self.get_filtered_token_list(data)
            output_file_name = str(count) + ".txt"
            l.save_file(
                self.convert_list_2_str(filtered_tokens), output_file_name)

    def get_filtered_token_list(self, data):
        """Return the cleaned noun tokens found in ``data``, in text order.

        Args:
            data: Iterable of text chunks (e.g. the lines of a file).
                ``None`` or an empty iterable yields an empty list.

        Returns:
            A list of strings. Each kept token is tagged with a tag starting
            with ``self.tag_only``, is fully alphanumeric, has its non-letter
            characters removed, is lower-cased, is not a stop word and is
            finally lemmatized. Tokens left empty by the clean-up (for
            example digit-only tokens) are dropped.
        """
        if not data:
            return []

        filtered_tokens = []
        for each_para in data:
            for each_line in sent_tokenize(each_para):
                for word, tag in pos_tag(word_tokenize(each_line)):
                    if not tag.startswith(self.tag_only):
                        continue
                    # Tokens with punctuation in them (e.g. hyphenated words)
                    # are discarded as a whole, not repaired.
                    if not word.isalnum():
                        continue
                    word = self.pattern_2_save.sub('', word).lower()
                    # An empty result would become a blank output line.
                    if not word or word in self.stop_words:
                        continue
                    filtered_tokens.append(
                        self.wordnet_lemmatizer.lemmatize(word))
        return filtered_tokens


if __name__ == "__main__":
    # Run the pipeline over the hard-coded test folders: every file listed in
    # "input_test" is tokenized and its result is written to "output_test".
    l = LocalFilesSystem(input_dir="input_test", output_dir="output_test")
    p = PreProcessing()
    p.preprocessing_folder(l)