In [1]:
import glob
import os
from collections import Counter

import pandas as pd

import texthero as hero
from texthero import preprocessing

# root word
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
def func_count_keywords(pandas_series_of_strings, list_keywords):
    """Count how often each keyword occurs in a collection of texts.

    Every text is split into tokens with ``word_tokenize`` and each token is
    lemmatized as a verb (``pos="v"``). The keywords are lemmatized the same
    way before they are looked up, so a keyword is counted whenever a token
    has the same verb lemma as the keyword.

    Parameters
    ----------
    pandas_series_of_strings : iterable of str
        The texts to search, e.g. a pandas Series of strings. Entries that
        are not ``str`` (such as missing values) are skipped.
    list_keywords : list of str
        The keywords to count.

    Returns
    -------
    pd.Series
        ``float64`` Series indexed by the keywords exactly as passed in
        (not lemmatized), holding the number of matching tokens. The float
        dtype is kept so that existing consumers see unchanged values.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    # Tally every lemmatized token once; looking a keyword up afterwards is a
    # dictionary access instead of a full scan of the token list per keyword.
    lemma_counts = Counter(
        wordnet_lemmatizer.lemmatize(word, pos="v")
        for string in pandas_series_of_strings
        if isinstance(string, str)  # missing values cannot be tokenized
        for word in word_tokenize(string)
    )

    # Counter returns 0 for lemmas that never occurred.
    counts_by_keyword = {
        keyword: lemma_counts[wordnet_lemmatizer.lemmatize(keyword, pos="v")]
        for keyword in list_keywords
    }

    return pd.Series(counts_by_keyword, dtype="float64")

In [3]:
# Custom texthero cleaning pipeline: hand this list to the pipeline argument
# of hero.clean.
# preprocessing.stem is left out on purpose (presumably because words are
# lemmatized later when the keywords are counted -- confirm before enabling).
custom_pipeline = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
]

In [4]:
# Input directory with the cleaned .txt files; the trailing path separator is
# part of the string so that a file pattern can be appended directly.
directory_cleaned_txt = os.path.join("data", "cleaned", "TXT") + os.sep
# Output directory for the generated CSV files.
directory_output_csv = os.path.join("data", "output", "CSV")

In [5]:
# The input directory must already exist: it holds the business plans to analyse.
if not os.path.isdir(directory_cleaned_txt):
    raise ValueError("Directory " + directory_cleaned_txt + " does not exist. Please specify the correct data input directory holding the business plans.")

# Create the output directory (including missing parents) if it is not there yet.
# exist_ok=True makes this a single call instead of a separate check followed by a create.
os.makedirs(directory_output_csv, exist_ok=True)

In [6]:
# Collect the file paths of all business plans in .txt format.
# glob returns its matches in arbitrary order, so the list is sorted to keep
# the processing order (and with it the row order of the result) reproducible.
directory_cleaned_txt_list = sorted(glob.glob(os.path.join(directory_cleaned_txt, "*.txt")))

In [7]:
# Start from an empty DataFrame for the keyword counts.
df_library = pd.DataFrame()

# Name of the keyword library to use.
# !!! The name must NOT contain "_" !!!
library = "libraryOne"

# Keyword lists available per library name.
# NOTE(review): both libraries currently hold exactly the same keywords --
# confirm whether "libraryTwo" is meant to differ.
keywords_by_library = {
    "libraryOne": ['refuse', 'rethink', 'reduce', 'reuse', 'repair', 'refurbish', 'remanufacture', 'repurpose', 'recycle', 'recover'],
    "libraryTwo": ['refuse', 'rethink', 'reduce', 'reuse', 'repair', 'refurbish', 'remanufacture', 'repurpose', 'recycle', 'recover'],
}

# Fail early when an unknown library name was chosen.
if library not in keywords_by_library:
    raise ValueError('Library ' + library + ' is not defined!')

list_keywords_library = keywords_by_library[library]

In [8]:
# Count the keywords of every .txt file and collect one record per file.
list_records = []
for filepath in directory_cleaned_txt_list:

    # Row label: file name without directory and extension, followed by the
    # library name. basename/splitext strip only the leading directory and the
    # final extension, whereas str.replace would also delete ".txt" or the
    # directory string wherever else they occur in the path.
    name_to_save = os.path.splitext(os.path.basename(filepath))[0] + "_" + library

    # Read the business plan as a list of strings, one string per line of the
    # file (each line is treated as one page).
    # NOTE(review): no encoding is passed, so the platform default is used --
    # confirm the encoding of the input files.
    with open(filepath) as f:
        text_pages = f.readlines()

    # Clean the text pages using the customized pipeline.
    series_text_pages = pd.Series(text_pages, dtype="string")
    series_text_pages_cleaned = hero.clean(series_text_pages, custom_pipeline)

    # Count the frequency of each keyword in the cleaned text.
    series_count_keywords = func_count_keywords(series_text_pages_cleaned, list_keywords_library)

    # One record per file: the name first, then one entry per keyword.
    list_records.append({"Name": name_to_save, **series_count_keywords.to_dict()})

# Build the DataFrame once from all records instead of concatenating inside
# the loop; this avoids repeated copying and means that re-running this cell
# replaces the result rather than appending duplicate rows.
df_library = pd.DataFrame(list_records)

In [9]:
# Write the keyword counts of the selected library to a CSV file
# (";" as column separator, "," as decimal mark, without the index column).
output_csv_name = "circular_index_" + library + ".csv"
output_csv_path = os.path.join(directory_output_csv, output_csv_name)
df_library.to_csv(output_csv_path, sep=';', decimal=',', index=False)