In [None]:
import glob

In [None]:
# Collect every JSON file in the (machine-specific) newman export folder and list what was found.
file_list = glob.glob('C:/Users/smsta/Desktop/GIT_REPOSITORIES/zendesk_tag_cloud/newman/*.json') 
# NOTE: `filename` stays bound to the last listed path once this loop has finished.
for filename in file_list:
    print(filename)

In [None]:
import ijson


def parse_json(json_filename):
    """Print every parse event that ijson emits for one JSON file.

    The file is handed to ``ijson.parse`` and consumed event by event; each
    event is printed as a ``prefix``/``event``/``value`` triple. Intended as a
    quick way to discover the structure of an unfamiliar file.

    Args:
        json_filename: Path of the JSON file to inspect.
    """
    # Open the path given by the caller, not a notebook-level variable, so the
    # function inspects exactly the file it was asked to inspect.
    with open(json_filename, 'r', encoding="utf8") as file:
        for prefix, event, value in ijson.parse(file):
            print('prefix={}, event={}, value={}'.format(prefix, event, value))


if __name__ == '__main__':
    # Inspect the last file that glob found. Indexing file_list directly avoids
    # depending on a loop variable left behind by another cell, and the guard
    # skips the call when no file matched.
    if file_list:
        parse_json(file_list[-1])

In [None]:
import ijson
import re
from string import punctuation


def extract_ticket_text_generator(json_filename):
    """Yield the cleaned comment text of every ticket, one ticket at a time.

    Every file in the notebook-level ``file_list`` is read through ijson. For
    each ``run.executions.item.assertions.item.assertion`` value, the pieces of
    text found between a ``plain_body`` and a ``public`` marker are collected,
    stripped of punctuation and merged with single spaces.

    NOTE(review): ``json_filename`` is accepted but never read by the body; the
    files processed are always the ones listed in ``file_list``.

    Yields:
        A one-element list holding the merged comment text of one ticket (the
        string is empty when no marker pair is found).
    """
    # Translation table that deletes every character listed in string.punctuation.
    strip_punctuation = str.maketrans('', '', punctuation)
    for filename in file_list:
        with open(filename, 'r', encoding="utf8") as input_file:
            # Stream only the assertion values instead of loading the whole file.
            assertions = ijson.items(input_file, 'run.executions.item.assertions.item.assertion')
            for ticket in assertions:
                # Text captured between the two markers: one entry per comment.
                comments = re.findall('plain_body(.+?)public', ticket)
                cleaned_comments = [comment.translate(strip_punctuation) for comment in comments]
                # All comments of the ticket merged into a single string.
                yield [' '.join(cleaned_comments)]
                
                
                
if __name__ == '__main__':
    # Calling a generator function only creates the generator; nothing is read
    # until it is iterated. Pull the first ticket as a quick smoke test and
    # report its size (not its content, which may hold customer text).
    ticket_stream = extract_ticket_text_generator(file_list)
    try:
        first_ticket = next(ticket_stream, None)
    finally:
        # Closing the generator releases the file it may still hold open.
        ticket_stream.close()
    if first_ticket is None:
        print('No tickets found')
    else:
        print('First ticket: {} characters'.format(len(first_ticket[0])))

In [None]:
def create_txt_files(output_dir="zendesk_txt"):
    """Write every extracted ticket text to its own ``ticket_<n>.txt`` file.

    Files are numbered from 1 in the order the tickets are produced by
    ``extract_ticket_text_generator``.

    Args:
        output_dir: Folder that receives the files; it is created when missing.
            Defaults to ``zendesk_txt``, the folder the document-loading cell
            scans for ``.txt`` files, so the export lands where it is read.
    """
    import os  # local import keeps this cell independent of other cells' imports

    os.makedirs(output_dir, exist_ok=True)
    # Pass the list of files explicitly instead of a loop variable left over
    # from another cell.
    tickets = extract_ticket_text_generator(file_list)
    # Each yielded item is itself a list of strings; flatten lazily so the
    # whole corpus never has to sit in memory at once.
    texts = (text for ticket in tickets for text in ticket)
    for number, text in enumerate(texts, start=1):
        target = os.path.join(output_dir, "ticket_%d.txt" % number)
        with open(target, 'w', encoding="utf-8") as f:
            f.write(text)

In [None]:
# Run the export: one ticket_<n>.txt file is written per extracted ticket text.
create_txt_files()

In [None]:
from pathlib import Path

# Gather every .txt file below the zendesk_txt folder (searched recursively).
all_txt_files = list(Path("zendesk_txt").rglob("*.txt"))
# Report how many documents were found.
n_files = len(all_txt_files)
print(n_files)

In [None]:
# Read each ticket file into one string, keeping the order of all_txt_files
# so that document i always corresponds to file i.
all_docs = [txt_file.read_text(encoding="utf-8") for txt_file in all_txt_files]

In [None]:
#import the TfidfVectorizer from Scikit-Learn.  
from sklearn.feature_extraction.text import TfidfVectorizer
 
# Vectorizer settings gathered in one place so they are easy to review and tune.
tfidf_settings = {
    'analyzer': 'word',
    'max_df': .65,                       # ignore terms present in more than 65% of the documents
    'min_df': 10,                        # ignore terms present in fewer than 10 documents
    'stop_words': 'english',
    'lowercase': True,
    'token_pattern': '[a-zA-Z0-9]{3,}',  # a token is a run of 3+ ASCII letters/digits
    'use_idf': True,
    'norm': None,                        # keep the raw tf-idf scores, no vector normalisation
    'max_features': 50000,
}
vectorizer = TfidfVectorizer(**tfidf_settings)
# Learn the vocabulary from the tickets and build the document-term matrix.
transformed_documents = vectorizer.fit_transform(all_docs)

In [None]:
# Dimensions of the document-term matrix: (number of documents, number of vocabulary terms).
transformed_documents.shape

In [None]:
# Convert the tf-idf matrix to a dense numpy array so each row can be handled per document.
transformed_documents_as_array = transformed_documents.toarray()
# Sanity check: the number of rows should equal the number of files in the file list.
transformed_documents_as_array.shape[0]

In [None]:
import pandas as pd

# Ensure the folder for the per-document CSV files exists (no error when it is already there).
tf_idf_output_dir = Path("./tf_idf_output")
tf_idf_output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Build one output path per input text file: same relative name, ".csv" suffix,
# rooted in tf_idf_output. Path arithmetic is used instead of string replacement
# so the mapping also works when paths contain Windows back-slash separators.
input_root = Path("zendesk_txt")
output_root = Path("tf_idf_output")
output_filenames = [
    str(output_root / Path(txt_file).relative_to(input_root).with_suffix(".csv"))
    for txt_file in all_txt_files
]

# The vocabulary is identical for every document, so fetch it once.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Rows of the array are in the same order as all_txt_files, so row `counter`
# belongs to output_filenames[counter].
for counter, doc in enumerate(transformed_documents_as_array):
    # Pair each term with its score and rank the terms, highest score first.
    tf_idf_tuples = list(zip(feature_names, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # Write inside the loop so every document gets its own CSV, not only the
    # last one; create sub-folders first for files found in nested directories.
    Path(output_filenames[counter]).parent.mkdir(parents=True, exist_ok=True)
    one_doc_as_df.to_csv(output_filenames[counter])