# Cross-validation Workbook

### 1 - Load libraries and define functions for each stage

#### 1.1 Load libraries

In [1]:
from pathlib import Path
import spacy
import pandas as pd
import html

# Import and initialize tqdm for Pandas
from tqdm import tqdm
tqdm.pandas()

from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from transformers import BertTokenizer, BertModel
import torch

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np

from gensim.models import LdaModel, Nmf, CoherenceModel
import pyLDAvis

# Suppress DeprecationWarnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os.path
from sklearn.model_selection import KFold

from dvclive import Live
import dvc.api

import pickle

#### 1.2 `preprocess(input_path)`

In [2]:
def preprocess(input_path, pct=1, RANDOM_SEED=42):
    """Load a review dataset, clean it, and add lemmatised token columns.

    Args:
        input_path: ``pathlib.Path`` of a ``.csv``, ``.jsonl``, ``.parquet``
            or ``.feather`` file. The data must contain ``review`` and
            ``condition`` columns.
        pct: Fraction of rows to keep. Values below 1 trigger a random sample.
        RANDOM_SEED: Seed used for the random sample.

    Returns:
        A DataFrame with the cleaned rows plus two list-valued columns:
        ``lemma_w_stpwrd`` (lemmas, stop words kept) and ``lemma_wo_stpwrd``
        (lower-cased lemmas, stop words removed).

    Raises:
        ValueError: If the file suffix is not one of the supported types.
    """
    nlp = spacy.load('en_core_web_sm')

    if input_path.suffix == ".csv":
        df = pd.read_csv(input_path)
    elif input_path.suffix == '.jsonl':
        df = pd.read_json(input_path, lines=True)
    elif input_path.suffix == '.parquet':
        df = pd.read_parquet(input_path)
    elif input_path.suffix == '.feather':
        df = pd.read_feather(input_path)
    else:
        raise ValueError(f"Unknown file type: {input_path.suffix}")

    # Extract a subset of the data
    if pct < 1:
        df = df.sample(frac=pct, random_state=RANDOM_SEED)

    # Drop missing reviews *before* unescaping: html.unescape() raises a
    # TypeError on NaN, so the text clean-up must only ever see strings.
    df = df[df['review'].notna()].copy()

    # Decode HTML entities back to original characters and remove whitespaces
    df['review'] = (
        df['review']
        .apply(html.unescape)
        .str.replace(r'[\r\n\t]', '', regex=True)
        .str.strip()
    )

    # Blank out wrong condition values but keep the rows
    wrong_condition = df['condition'].str.contains('users found this comment helpful', na=False)
    df.loc[wrong_condition, 'condition'] = None

    # Remove rows with empty reviews
    df = df[(df['review'] != '"-"') & (df['review'] != '')]

    def lemma(row):
        # Skip if review is empty
        if pd.isnull(row['review']):
            return row

        # Parse once and derive both token lists from the same spaCy doc.
        tokens = [
            token for token in nlp(row['review'])
            if not (token.is_punct or token.is_space or token.lemma_.strip() == '')
        ]

        # lemma_w_stpwrd: with stop words, for word2vec and bert embeddings
        row['lemma_w_stpwrd'] = [token.lemma_ for token in tokens]
        # lemma_wo_stpwrd: lower without stop words, for BoW and TF-IDF embeddings
        row['lemma_wo_stpwrd'] = [token.lemma_.lower() for token in tokens if not token.is_stop]

        # For reviews with only stop words, use lemma_w_stpwrd
        if len(row['lemma_wo_stpwrd']) == 0:
            row['lemma_wo_stpwrd'] = row['lemma_w_stpwrd']
        return row

    procd_df = df.progress_apply(lemma, axis=1)

    return procd_df

#### 1.3 `feature_engineering(procd_data, ngram, bert, bert_pretrained_model, RANDOM_SEED)`

In [3]:
def feature_engineering(procd_data, ngram, bert, bert_pretrained_model=None, RANDOM_SEED=42):
    """Build BERT, bag-of-words and TF-IDF features for tokenised documents.

    Args:
        procd_data: pandas Series whose values are lists of tokens.
        ngram: ``'unigram'`` or ``'bigram'``. With ``'bigram'`` a gensim
            ``Phrases`` model is fitted and applied before building the
            dictionary and corpora.
        bert: When truthy, also compute BERT embeddings for every document.
        bert_pretrained_model: Name or path handed to ``from_pretrained``;
            only used when ``bert`` is truthy.
        RANDOM_SEED: Seed applied to torch when ``bert`` is truthy.

    Returns:
        A 6-tuple ``(num_docs, bert_embeddings, procd_data_bigram,
        dictionary, bow_corpus, tfidf_corpus)``. ``bert_embeddings`` is
        ``None`` when ``bert`` is falsy and ``procd_data_bigram`` is ``None``
        when ``ngram`` is ``'unigram'``.

    Raises:
        ValueError: If ``ngram`` is not a supported value.
    """
    # Fail fast instead of hitting an unbound `dictionary` further down.
    if ngram not in ('unigram', 'bigram'):
        raise ValueError(f'Unknown ngram: {ngram}')

    # BERT Embeddings
    bert_embeddings = None
    if bert:
        tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model)
        bert_model = BertModel.from_pretrained(bert_pretrained_model)
        # Move model to GPU if available
        if torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.manual_seed_all(RANDOM_SEED)
            torch.backends.cudnn.deterministic = True
        else:
            device = torch.device('cpu')
            torch.manual_seed(RANDOM_SEED)
            torch.backends.cudnn.deterministic = True

        bert_model = bert_model.to(device)
        print(f"Using device: {device}")

        def get_bert_embeddings(text):
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            # Move inputs to GPU if available
            if device.type != 'cpu':
                inputs = {key: val.to(device) for key, val in inputs.items()}

            # Inference only: skip autograd bookkeeping to save memory.
            with torch.no_grad():
                outputs = bert_model(**inputs)
            return outputs.last_hidden_state.mean(dim=1)

        bert_embeddings = [
            get_bert_embeddings(doc).cpu().detach().numpy()
            for doc in tqdm(procd_data, desc='Generating BERT Embeddings')
        ]

    # Extract BOW and TF-IDF features
    if ngram == 'bigram':
        # Add bigrams
        phrase_model = Phrases(procd_data, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
        procd_data_bigram = procd_data.progress_apply(lambda x: phrase_model[x])
        docs = procd_data_bigram
    else:
        procd_data_bigram = None
        docs = procd_data

    dictionary = Dictionary(docs)

    # BoW -- built from the same documents as the dictionary so that phrase
    # tokens created by the bigram model actually receive counts.
    bow_corpus = [dictionary.doc2bow(doc) for doc in tqdm(docs, desc='Generating BoW')]

    # TF-IDF
    tfidf_model = TfidfModel(bow_corpus)
    tfidf_corpus = [tfidf_model[doc] for doc in tqdm(bow_corpus, desc='Generating TF-IDF')]

    return len(procd_data), bert_embeddings, procd_data_bigram, dictionary, bow_corpus, tfidf_corpus
                

#### 1.4 `clustering(bert_embeddings, algorithm, num_clusters, RANDOM_SEED)`

In [4]:
def clustering(bert_embeddings, algorithm, num_clusters, RANDOM_SEED):
    """Cluster documents from their BERT embeddings.

    Args:
        bert_embeddings: Iterable with one array per document; each array is
            averaged over axis 0 to obtain a single vector per document.
        algorithm: ``'kmeans'`` or ``'hierarchical'``.
        num_clusters: Number of clusters to fit.
        RANDOM_SEED: Seed passed to KMeans (not used by the hierarchical
            model).

    Returns:
        ``(clustering_model, silhouette, davies_bouldin, calinski_harabasz)``.
        The three scores are ``np.nan`` when only one distinct label was
        produced, since they are undefined in that case.

    Raises:
        ValueError: If ``algorithm`` is not a supported value.
    """
    # Validate before doing any numerical work.
    if algorithm not in ('kmeans', 'hierarchical'):
        raise ValueError(f'Unknown clustering algorithm: {algorithm}')

    # Prepare the input data for clustering
    bert_embedding_avg = [np.mean(embedding, axis=0) for embedding in bert_embeddings]
    input_data = np.vstack(bert_embedding_avg)

    # Scale the input data
    scaler = StandardScaler()
    input_data = scaler.fit_transform(input_data)

    # Initialize clustering algorithm
    if algorithm == 'kmeans':
        clustering_model = KMeans(
            n_clusters=num_clusters,
            n_init='auto',
            random_state=RANDOM_SEED,
        )
    else:
        clustering_model = AgglomerativeClustering(
            n_clusters=num_clusters,
        )

    # Fit the clustering algorithm to the data and get the labels
    clustering_model.fit(input_data)
    labels = clustering_model.labels_

    # Calculate the metrics if possible
    if len(set(labels)) > 1:
        silhouette = silhouette_score(input_data, labels)
        davies_bouldin = davies_bouldin_score(input_data, labels)
        calinski_harabasz = calinski_harabasz_score(input_data, labels)
    else:
        # All three names must be bound here; they are returned below.
        silhouette = davies_bouldin = calinski_harabasz = np.nan

    return clustering_model, silhouette, davies_bouldin, calinski_harabasz

#### 1.5 `topic_modeling(procd_data, corpus, dictionary, algorithm, num_topics, RANDOM_SEED)`

In [5]:
def topic_modeling(procd_data, corpus, dictionary, algorithm, num_topics, RANDOM_SEED):
    """Train a topic model and score it with c_v coherence.

    Args:
        procd_data: Tokenised documents (pandas Series or any iterable of
            token lists) used as reference texts for the coherence score.
        corpus: Vectorised corpus the model is trained on.
        dictionary: gensim ``Dictionary`` matching ``corpus``.
        algorithm: ``'lda'`` or ``'nmf'``.
        num_topics: Number of topics to fit.
        RANDOM_SEED: Seed passed to the model for reproducibility.

    Returns:
        ``(topic_model, perplexity, coherence)``. ``perplexity`` is the value
        of ``LdaModel.log_perplexity(corpus)`` for LDA and ``None`` for NMF.

    Raises:
        ValueError: If ``algorithm`` is not a supported value.
    """
    # Fail fast instead of reaching the coherence step with no model bound.
    if algorithm not in ('lda', 'nmf'):
        raise ValueError(f'Unknown topic modeling algorithm: {algorithm}')

    # Set up topic model
    if algorithm == 'lda':
        # LDA Model
        topic_model = LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=RANDOM_SEED,
        )
        perplexity = topic_model.log_perplexity(corpus)
    else:
        # NMF Model
        topic_model = Nmf(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=RANDOM_SEED,
        )
        perplexity = None

    # Accept both pandas Series and plain iterables of token lists.
    texts = procd_data.tolist() if hasattr(procd_data, 'tolist') else list(procd_data)

    # Calculate Coherence score
    coherence_model = CoherenceModel(
        model=topic_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
        processes=-1
    )
    coherence = coherence_model.get_coherence()

    return topic_model, perplexity, coherence


def prepare_topic_model_viz(topic_model, dictionary, corpus):
    """Assemble the inputs for ``pyLDAvis.prepare`` from a trained model.

    Args:
        topic_model: Trained topic model exposing ``get_topics()``,
            ``num_topics`` and indexing by corpus.
        dictionary: gensim ``Dictionary`` used to train the model.
        corpus: Vectorised corpus; each document is a sequence of
            ``(term_id, weight)`` pairs.

    Returns:
        The object returned by ``pyLDAvis.prepare``.
    """
    # Extract the topic-term matrix
    topic_term_matrix = topic_model.get_topics()

    # Extract the document-topic matrix (topics missing from a document's
    # result are filled with 0.0)
    num_topics = topic_model.num_topics
    doc_topic_rows = []
    for doc in tqdm(topic_model[corpus]):
        doc_topics = dict(doc)
        doc_topic_rows.append([doc_topics.get(i, 0.0) for i in range(num_topics)])
    doc_topic_matrix = np.array(doc_topic_rows, dtype=float).reshape(-1, num_topics)

    # A document with no reported topics would normalise to 0/0 = NaN.
    # Give such rows equal weights so they become a uniform distribution.
    empty_rows = doc_topic_matrix.sum(axis=1) == 0
    doc_topic_matrix[empty_rows] = 1.0

    # Normalize topic_term_matrix and doc_topic_matrix so each row sums to 1
    topic_term_matrix = topic_term_matrix / np.sum(topic_term_matrix, axis=1, keepdims=True)
    doc_topic_matrix = doc_topic_matrix / np.sum(doc_topic_matrix, axis=1, keepdims=True)

    # Vocabulary and term frequencies
    vocab = [dictionary[i] for i in range(len(dictionary))]

    term_freq = np.zeros(len(vocab))
    for doc in corpus:
        for idx, freq in doc:
            term_freq[idx] += freq

    # Prepare the data in pyLDAvis format
    vis_data = pyLDAvis.prepare(
        doc_lengths=np.array([sum(dict(doc).values()) for doc in corpus]),
        vocab=vocab,
        term_frequency=term_freq,
        topic_term_dists=topic_term_matrix,
        doc_topic_dists=doc_topic_matrix
    )

    # Return the visualization data
    return vis_data

#### 1.6 `pipeline()`

In [6]:
def pipeline(procd_df, procd_text, ngram, bert_pretrained_model, clustering, clustering_algorithms=None, num_clusters=None, feature='lda', topic_modeling_algorithm='lda', num_topics=8, RANDOM_SEED=42):
    """Run feature engineering, optional clustering and topic modeling.

    Args:
        procd_df: DataFrame whose ``procd_text`` column holds token lists
            serialised as Python list literals (strings).
        procd_text: Name of the column to use.
        ngram: ``'unigram'`` or ``'bigram'``; forwarded to
            ``feature_engineering``.
        bert_pretrained_model: BERT model name; only used when ``clustering``
            is truthy.
        clustering: When truthy, cluster documents on BERT embeddings and
            train one topic model per cluster.
        clustering_algorithms: Algorithm name forwarded to the clustering
            function.
        num_clusters: Number of clusters.
        feature: ``'bow'`` or ``'tfidf'``. The default ``'lda'`` is not a
            valid feature, so callers must pass this explicitly.
        topic_modeling_algorithm: Forwarded to ``topic_modeling``.
        num_topics: Number of topics per model.
        RANDOM_SEED: Seed forwarded to every stage.

    Returns:
        ``(num_docs, clustering_model, silhouette, calinski_harabasz,
        davies_bouldin, topic_models, coherence, perplexity)``. The clustering
        entries are ``None`` without clustering. ``topic_models`` is a dict
        keyed by cluster label with clustering, otherwise a single model. With
        clustering, ``coherence`` and ``perplexity`` are means over clusters.

    Raises:
        ValueError: If ``feature`` is not ``'bow'`` or ``'tfidf'``.
        RuntimeError: If clustering is requested but the module-level
            ``clustering`` function is no longer callable.
    """
    # ast.literal_eval parses list literals without executing arbitrary
    # code, unlike eval(), which must not be run on file contents.
    from ast import literal_eval

    # Validate cheap arguments before any expensive work.
    if feature not in ('bow', 'tfidf'):
        raise ValueError(f'Unknown feature: {feature}')

    # The boolean parameter `clustering` shadows the module-level function of
    # the same name, so the function has to be looked up explicitly.
    run_clustering = None
    if clustering:
        run_clustering = globals().get('clustering')
        if not callable(run_clustering):
            raise RuntimeError(
                "The module-level `clustering` function is not callable; "
                "it may have been rebound to another value."
            )

    # Load preprocessed text data
    procd_data = procd_df[procd_text].progress_apply(literal_eval)
    procd_data = procd_data[procd_data.progress_apply(lambda row: len(row) > 0)]

    # Feature engineering (BERT embeddings are only needed for clustering)
    bert = clustering
    num_docs, bert_embeddings, procd_data_bigram, dictionary, bow_corpus, tfidf_corpus = feature_engineering(
        procd_data,
        ngram,
        bert,
        bert_pretrained_model,
        RANDOM_SEED
    )

    # Reference texts for coherence: bigrammed docs when available,
    # otherwise the unigram docs (procd_data_bigram is None for 'unigram').
    texts = procd_data_bigram if procd_data_bigram is not None else procd_data

    corpus = bow_corpus if feature == 'bow' else tfidf_corpus

    clustering_model = silhouette = davies_bouldin = calinski_harabasz = None

    if clustering:
        # Clustering
        clustering_model, silhouette, davies_bouldin, calinski_harabasz = run_clustering(
            bert_embeddings,
            clustering_algorithms,
            num_clusters,
            RANDOM_SEED
        )

        print(f"Topic modeling with clustering via Bert {clustering_algorithms}...")
        labels = clustering_model.labels_

        # Use a positional index so that group.index can address `corpus`,
        # which is a plain list ordered like `texts`.
        grouped_procd_data = pd.DataFrame({
            'cluster_label': labels,
            'procd_text': texts.reset_index(drop=True),
        }).groupby('cluster_label')

        # Train one topic model per cluster
        topic_models = {}
        coherence_scores = {}
        perplexity_scores = {}

        for label, group in tqdm(grouped_procd_data, desc='Training topic models for each cluster'):
            # Extract the clustered corpus and texts
            clustered_corpus = [corpus[i] for i in group.index]
            clustered_texts = group['procd_text']

            # Train the topic model for this cluster
            topic_model, perplexity, coherence = topic_modeling(
                clustered_texts, clustered_corpus, dictionary, topic_modeling_algorithm, num_topics, RANDOM_SEED
            )

            topic_models[label] = topic_model
            coherence_scores[label] = coherence
            if perplexity is not None:
                perplexity_scores[label] = perplexity

        coherence = np.mean(list(coherence_scores.values()))
        perplexity = np.mean(list(perplexity_scores.values())) if perplexity_scores else None
        result_models = topic_models

    else:
        print('Topic modeling without clustering...')
        topic_model, perplexity, coherence = topic_modeling(
            texts, corpus, dictionary, topic_modeling_algorithm, num_topics, RANDOM_SEED
        )
        result_models = topic_model

    return num_docs, clustering_model, silhouette, calinski_harabasz, davies_bouldin, result_models, coherence, perplexity

### 2 - Cross-Validation

#### 2.1 Set up path and settings

In [7]:
# Reproducibility seed and location of the raw training data
RANDOM_SEED = 42
train_data_path = '../data/raw/lewtun-drug-reviews/train.jsonl'

# Feature Engineering: token columns to compare, n-gram mode, vectorisation
procd_texts = ['lemma_wo_stpwrd', 'lemma_w_stpwrd']
ngram, feature = 'bigram', 'tfidf'
#bert_pretrained_model = 'bert-base-uncased'

# Clustering
# NOTE(review): this assignment rebinds the module-level name `clustering`,
# which is also the name of the clustering function defined in section 1.4.
clustering = False
#clustering_algorithms = ['kmeans', 'hierarchical']
#num_clusters = 2

# Topic Modeling: algorithm and the topic counts to sweep over
topic_modeling_algorithm = 'lda'
nums_topics = [6, 10, 14, 16, 18]

# Model Settings: five topic counts with stop words, one without
models = [
    {'procd_text': 'lemma_w_stpwrd', 'num_topics': topic_count}
    for topic_count in (18, 16, 14, 10, 6)
] + [
    {'procd_text': 'lemma_wo_stpwrd', 'num_topics': 16},
    #{'procd_text': 'lemma_w_stpwrd', 'clustering': True, 'clustering_algorithms': 'kmeans', 'num_clusters': 2, 'topic_modeling_algorithm': 'lda', 'num_topics': 8},
    #{'procd_text': 'lemma_wo_stpwrd', 'clustering': True, 'clustering_algorithms': 'hierarchical', 'num_clusters': 3, 'topic_modeling_algorithm': 'lda', 'num_topics': 12}
]

#### 2.2 Preprocess train data

In [10]:
procd_df_path = '../cross_validation/procd_train.csv'

# Build the CSV cache on the first run only. The frame is always read back
# from disk afterwards, so list columns arrive as strings in both cases.
if not os.path.isfile(procd_df_path):
    preprocess(Path(train_data_path), pct=1, RANDOM_SEED=RANDOM_SEED).to_csv(procd_df_path, index=False)
procd_df = pd.read_csv(procd_df_path)

#### 2.3 Cross-Validation

In [12]:
# ast.literal_eval parses the list literals stored in the CSV cache without
# executing arbitrary code, unlike eval().
from ast import literal_eval

kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# The feature choice does not depend on fold or topic count: validate it once.
if feature not in ('bow', 'tfidf'):
    raise ValueError(f'Unknown feature: {feature}')

with Live(dir='../cross_validation', report='html') as live:
    for fold, (train, test) in tqdm(enumerate(kf.split(procd_df)), total=kf.get_n_splits(), desc='Folds:'):
        print(f'Fold {fold}')
        train_procd_df = procd_df.iloc[train]
        test_procd_df = procd_df.iloc[test]

        for procd_text in tqdm(procd_texts, desc='Preprocessed Text:'):
            print(procd_text)
            # Load preprocessed text data and drop documents with no tokens
            train_procd_data = train_procd_df[procd_text].progress_apply(literal_eval)
            train_procd_data = train_procd_data[train_procd_data.progress_apply(lambda row: len(row) > 0)]

            test_procd_data = test_procd_df[procd_text].progress_apply(literal_eval)
            test_procd_data = test_procd_data[test_procd_data.progress_apply(lambda row: len(row) > 0)]

            # Extract features from the training data
            train_num_docs, _, procd_data_bigram, dictionary, train_bow_corpus, train_tfidf_corpus = feature_engineering(
                procd_data = train_procd_data,
                ngram = ngram,
                bert = False,
                bert_pretrained_model = None,
                RANDOM_SEED = RANDOM_SEED
            )

            # feature_engineering returns None for the bigram docs when
            # ngram == 'unigram'; fall back to the unigram docs in that case.
            train_texts = procd_data_bigram if procd_data_bigram is not None else train_procd_data

            train_corpus = train_bow_corpus if feature == 'bow' else train_tfidf_corpus

            for num_topics in tqdm(nums_topics, desc='Training topic models:'):
                # Small topic counts are only evaluated with stop words kept.
                if (num_topics <= 12) and (procd_text == 'lemma_wo_stpwrd'): continue
                print(f'{num_topics} topics')

                # Train the topic model
                topic_model, perplexity, coherence = topic_modeling(
                    train_texts, train_corpus, dictionary, topic_modeling_algorithm, num_topics, RANDOM_SEED
                )

                live.log_param('Preprocessed Text', procd_text)
                live.log_param('ngram', ngram)
                live.log_param('Feature', feature)
                live.log_param('Topic Modeling Algorithm', topic_modeling_algorithm)
                live.log_param('Num of Topics', num_topics)

                live.log_metric('Fold', fold)
                live.log_metric('Num of Docs (Train)', train_num_docs)
                live.log_metric('Coherence (Train)', coherence)
                if perplexity is not None: live.log_metric('Perplexity (Train)', perplexity)

                # Evaluate the model: coherence of the trained topics on the
                # held-out texts.
                # NOTE(review): the test texts are the raw token lists; the
                # phrase model is not applied to them because
                # feature_engineering does not return it -- confirm this is
                # acceptable when ngram == 'bigram'.
                coherence_model = CoherenceModel(
                    model=topic_model,
                    texts=test_procd_data.tolist(),
                    dictionary=dictionary,
                    coherence='c_v',
                    processes=-1
                )
                coherence = coherence_model.get_coherence()
                live.log_metric('Coherence (Test)', coherence)

                live.next_step()

Folds:: 0it [00:00, ?it/s]

Fold 0



Preprocessed Text::   0%|          | 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0%|          | 0/129036 [00:00<?, ?it/s]

  1%|          | 1510/129036 [00:00<00:08, 14982.17it/s]

  2%|▏         | 3127/129036 [00:00<00:08, 15631.93it/s]

  4%|▍         | 5008/129036 [00:00<00:07, 17038.39it/s]

  5%|▌         | 6756/129036 [00:00<00:07, 17177.49it/s]

  7%|▋         | 8717/129036 [00:00<00:06, 18018.27it/s]

  8%|▊         | 10808/129036 [00:00<00:06, 18982.80it/s]

 10%|▉         | 12723/129036 [00:00<00:06, 19002.08it/s]

 12%|█▏        | 15020/129036 [00:00<00:05, 20236.03it/s]

 13%|█▎        | 17044/129036 [00:01<00:18, 6097.88it/s] 

 15%|█▌        | 19399/129036 [00:01<00:13, 8155.28it/s]

 17%|█▋        | 21818/129036 [00:01<00:10, 10461.94it/s]

 19%|█▊        | 24155/129036 [00:01<00:08, 12647.57it/s]

 21%|██        | 26567/129036 [00:02<00:06, 14891.52it/s]

 22%|██▏       | 28831/129036 [00:02<00:06, 16572.88it/s]

 24%|██▍       | 31021/129036 [00:02<00:05, 17837.32it/s]

 26%|██▌       | 33211/129036 [00:02<00:05, 18578.47it/s]

 27%|██▋       |

14 topics




Training topic models::  60%|██████    | 3/5 [01:27<00:58, 29.26s/it]

16 topics




Training topic models::  80%|████████  | 4/5 [03:03<00:50, 50.55s/it]

18 topics




Training topic models:: 100%|██████████| 5/5 [04:43<00:00, 56.71s/it]

Preprocessed Text::  50%|█████     | 1/2 [05:11<05:11, 311.55s/it]

lemma_w_stpwrd




  0%|          | 0/129036 [00:00<?, ?it/s]

  1%|          | 922/129036 [00:00<00:13, 9201.21it/s]

  1%|▏         | 1891/129036 [00:00<00:13, 9445.56it/s]

  2%|▏         | 2900/129036 [00:00<00:12, 9704.29it/s]

  3%|▎         | 3971/129036 [00:00<00:12, 10083.76it/s]

  4%|▍         | 5001/129036 [00:00<00:12, 10161.14it/s]

  5%|▍         | 6021/129036 [00:00<00:12, 10161.27it/s]

  5%|▌         | 7038/129036 [00:00<00:12, 10161.29it/s]

  6%|▋         | 8101/129036 [00:00<00:11, 10296.15it/s]

  7%|▋         | 9169/129036 [00:00<00:11, 10411.08it/s]

  8%|▊         | 10307/129036 [00:01<00:11, 10645.42it/s]

  9%|▉         | 11383/129036 [00:01<00:11, 10657.31it/s]

 10%|▉         | 12449/129036 [00:01<00:11, 10540.67it/s]

 10%|█         | 13504/129036 [00:01<00:11, 10362.70it/s]

 11%|█▏        | 14558/129036 [00:01<00:11, 10389.99it/s]

 12%|█▏        | 15598/129036 [00:01<00:10, 10319.41it/s]

 13%|█▎        | 16637/129036 [00:01<00:10, 10324.40it/s]

 14%|█▎        | 17674/

6 topics




Training topic models::  20%|██        | 1/5 [01:15<05:01, 75.49s/it]

10 topics




Training topic models::  40%|████      | 2/5 [02:40<04:03, 81.17s/it]

14 topics




Training topic models::  60%|██████    | 3/5 [04:17<02:56, 88.15s/it]

16 topics


: 