In [2]:
# !pip install -U scikit-learn
# !pip install -U stanza
# !pip install nltk

In [1]:
import os
import json

In [2]:
# Directory and file names of the two corpus database dumps.
data_dir_name = "data"
database_dir_name = "corpus"
database_file_name_1 = "lifestyle_channels_full_db.json"
database_file_name_2 = "lifestyle_channels_full_db_2.json"

# Everything this notebook reads and writes lives under <data>/<corpus>.
working_dir = os.path.join(data_dir_name, database_dir_name)

# Full paths to the two database files inside the working directory.
database_file_1, database_file_2 = (
    os.path.join(working_dir, file_name)
    for file_name in (database_file_name_1, database_file_name_2)
)

In [None]:
# Placeholder text stored in place of a transcript when the media file could
# not be transcribed. The first spelling is the mis-decoded (mojibake) form
# this notebook has always filtered on; the second is the same sentence with a
# real em dash, which is what a correctly decoded UTF-8 file would contain.
missing_transcript_markers = (
    "File missing â€” could not transcribe.",
    "File missing \u2014 could not transcribe.",
)

# Read both database dumps. The encoding is stated explicitly so the result
# does not depend on the platform's locale default (export_to_json in this
# notebook writes UTF-8 as well).
with open(database_file_1, "r", encoding="utf-8") as file:
    lifestyle_database_1 = json.load(file)

with open(database_file_2, "r", encoding="utf-8") as file:
    lifestyle_database_2 = json.load(file)

# Both dumps are concatenated with `+` into one database.
lifestyle_database = lifestyle_database_1 + lifestyle_database_2

# Keep only entries that have a transcript and whose transcript is not the
# "could not transcribe" placeholder.
full_transcripts = [
    item["transcript"]
    for item in lifestyle_database
    if "transcript" in item
    and not any(marker in item["transcript"] for marker in missing_transcript_markers)
]

print(f"The database of video tanscript now has {len(lifestyle_database)} entries.")
print(f"The full transcrtipts/documents variable now contains {len(full_transcripts)} full raw transcripts.")

In [None]:
import stanza
# Download the English Stanza resources, then build the pipeline that every
# transcript is run through: tokenization, POS tagging and lemmatization.
stanza.download(lang="en")
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize, pos, lemma",
    verbose=False,
)

In [7]:
from tqdm import tqdm

def stanzafy_transcripts(transcrips):
    """
    Run every transcript through the module-level Stanza pipeline `nlp`.

    Args:
        transcrips (iterable): Raw transcript texts.

    Returns:
        list: One `nlp(...)` result per transcript, in input order.
    """
    progress = tqdm(
        transcrips,
        desc="Stanza-fying the Transcripts. It may be a minute..."
    )

    annotated_docs = []
    for transcript in progress:
        annotated_docs.append(nlp(transcript))
    return annotated_docs

In [None]:
lifestyle_documents = stanzafy_transcripts(full_transcripts)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import download

# Fetch the NLTK resources this notebook relies on.
for resource_name in ('punkt_tab', 'stopwords'):
    nltk.download(resource_name)

import sklearn
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Combined stop-word set: NLTK's English list plus scikit-learn's built-in one.
stop_words_nltk = stopwords.words('english')
stop_words = {*stop_words_nltk, *ENGLISH_STOP_WORDS}

In [14]:
def tokenize_single_text(document, remove_stopwords=True, pseudo_text=True, remove_punctuation=True):
    """
    Turn one annotated document into lower-cased surface tokens.

    Args:
        document: Object exposing `.sentences`, each with `.tokens` whose
            items carry a `.text` string.
        remove_stopwords (bool): Drop tokens found in the module-level
            `stop_words` set (checked after lower-casing).
        pseudo_text (bool): Return the tokens joined by single spaces instead
            of a list.
        remove_punctuation (bool): Keep only tokens whose first character is
            alphabetic.

    Returns:
        str | list[str]: Space-joined tokens when `pseudo_text` is true,
        otherwise the list of tokens.
    """
    surface_forms = (
        token.text
        for sentence in document.sentences
        for token in sentence.tokens
    )

    if remove_punctuation:
        # A token counts as a word when it starts with a letter.
        surface_forms = (form for form in surface_forms if form[0].isalpha())

    lowered = [form.lower() for form in surface_forms]

    if remove_stopwords:
        lowered = [form for form in lowered if form not in stop_words]

    if pseudo_text:
        return " ".join(lowered)
    return lowered

In [15]:
def batch_tokenize(documents, remove_stopwords=True, pseudo_text=True, remove_punctuation=True):
    """
    Apply `tokenize_single_text` to every document.

    Args:
        documents (iterable): Annotated documents to tokenize.
        remove_stopwords (bool): Forwarded to `tokenize_single_text`.
        pseudo_text (bool): Forwarded to `tokenize_single_text`.
        remove_punctuation (bool): Forwarded to `tokenize_single_text`.

    Returns:
        list: One tokenization result per document, in input order.
    """
    tokenized = []
    for document in documents:
        tokenized.append(
            tokenize_single_text(document, remove_stopwords, pseudo_text, remove_punctuation)
        )
    return tokenized

In [68]:
def lemmatize_single_text(document, remove_stopwords=True, pseudo_text=True):
    """
    Turn one annotated document into lower-cased lemmas.

    Only lemmas whose first character is alphabetic are kept. Words whose
    lemma is missing (`None`) or empty are skipped.

    Args:
        document: Object exposing `.sentences`, each with `.words` whose
            items carry a `.lemma` attribute.
        remove_stopwords (bool): Drop lemmas found in the module-level
            `stop_words` set (checked after lower-casing).
        pseudo_text (bool): Return the lemmas joined by single spaces instead
            of a list.

    Returns:
        str | list[str]: Space-joined lemmas when `pseudo_text` is true,
        otherwise the list of lemmas.
    """
    lemmas = [
        word.lemma.lower()
        for sentence in document.sentences
        for word in sentence.words
        # The truthiness check comes first: indexing a None or "" lemma would
        # raise and abort the whole batch.
        if word.lemma and word.lemma[0].isalpha()
    ]

    if remove_stopwords:
        lemmas = [lemma for lemma in lemmas if lemma not in stop_words]

    return " ".join(lemmas) if pseudo_text else lemmas

In [69]:
def batch_lemmatize(documents, remove_stopwords=True, pseudo_text=True):
    """
    Apply `lemmatize_single_text` to every document.

    Args:
        documents (iterable): Annotated documents to lemmatize.
        remove_stopwords (bool): Forwarded to `lemmatize_single_text`.
        pseudo_text (bool): Forwarded to `lemmatize_single_text`.

    Returns:
        list: One lemmatization result per document, in input order.
    """
    lemmatized = []
    for document in documents:
        lemmatized.append(lemmatize_single_text(document, remove_stopwords, pseudo_text))
    return lemmatized

In [20]:
def export_to_json(data, name_of_file):
    """
    Write a list to `<name_of_file>.json` as indented UTF-8 JSON.

    The export is best-effort: a non-list `data` or any exception raised
    while writing is reported with `print` and nothing is raised.

    Args:
        data (list): The data to export.
        name_of_file (str): File path or name (without .json extension).
    """
    try:
        # Reject anything that is not a list before touching the file system.
        if not isinstance(data, list):
            print(f"Invalid DB type: {type(data)}. Expected a list.")
            return

        json_file = f"{name_of_file}.json"
        with open(json_file, "w", encoding="utf-8") as file:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(data, file, indent=4, ensure_ascii=False)
        print(f"Exported to {json_file}")
    except Exception as e:
        print(f"JSON export failed for {name_of_file}: {str(e)}")

In [None]:
# Build each processed view of the corpus and write it to
# <working_dir>/<variable name>.json (export_to_json appends the extension).

# Lemmas, stop words removed, one space-joined string per document.
lemmatized_lifestyle_documents = batch_lemmatize(lifestyle_documents)
export_to_json(lemmatized_lifestyle_documents, os.path.join(working_dir, "lemmatized_lifestyle_documents"))

# Surface tokens, punctuation and stop words removed.
tokenized_lifestyle_documents = batch_tokenize(lifestyle_documents)
export_to_json(tokenized_lifestyle_documents, os.path.join(working_dir, "tokenized_lifestyle_documents"))

# Surface tokens, punctuation removed, stop words kept.
tokenized_lifestyle_documents_w_stopwords = batch_tokenize(lifestyle_documents, remove_stopwords=False)
export_to_json(tokenized_lifestyle_documents_w_stopwords, os.path.join(working_dir, "tokenized_lifestyle_documents_w_stopwords"))

# Surface tokens, punctuation kept, stop words removed.
# The target must be the file *name* string; passing the list itself to
# os.path.join raises TypeError and aborts the cell.
transcripts_no_stopwords = batch_tokenize(lifestyle_documents, remove_punctuation=False)
export_to_json(transcripts_no_stopwords, os.path.join(working_dir, "transcripts_no_stopwords"))

# Every token kept, returned as token lists rather than joined strings.
transcripts_as_complete_tokens = batch_tokenize(lifestyle_documents, remove_punctuation=False, remove_stopwords=False, pseudo_text=False)
export_to_json(transcripts_as_complete_tokens, os.path.join(working_dir, "transcripts_as_complete_tokens"))