# Cross-validation Workbook

### 1 - Load libraries and define functions for each stage

#### 1.1 Load libraries

In [1]:
from pathlib import Path
import spacy
import pandas as pd
import html

# Import and initialize tqdm for Pandas
from tqdm import tqdm
tqdm.pandas()

from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from transformers import BertTokenizer, BertModel
import torch

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import numpy as np

from gensim.models import LdaModel, Nmf, CoherenceModel
import pyLDAvis

# Suppress DeprecationWarnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os.path
from sklearn.model_selection import KFold

from dvclive import Live
import dvc.api

import pickle

#### 1.2 `preprocess(input_path, pct, RANDOM_SEED)`

In [2]:
def preprocess(input_path, pct=1, RANDOM_SEED=42):
    """Load a review file, clean the review text and lemmatize every review.

    Args:
        input_path: Path (or path string) of the input file. The suffix
            selects the pandas reader: .csv, .jsonl, .parquet or .feather.
        pct: Fraction of rows to keep. Values below 1 draw a random sample.
        RANDOM_SEED: Seed used for the row sampling.

    Returns:
        A DataFrame holding the rows with a non-empty review, extended with
        two list-valued columns:
          * ``lemma_w_stpwrd``  - lemmas with stop words kept.
          * ``lemma_wo_stpwrd`` - lower-cased lemmas without stop words
            (falls back to ``lemma_w_stpwrd`` when nothing else is left).

    Raises:
        ValueError: If the file suffix is not one of the supported types.
    """
    input_path = Path(input_path)

    # Resolve the reader before doing anything expensive so that an
    # unsupported file type fails immediately.
    readers = {
        '.csv': pd.read_csv,
        '.jsonl': lambda path: pd.read_json(path, lines=True),
        '.parquet': pd.read_parquet,
        '.feather': pd.read_feather,
    }
    reader = readers.get(input_path.suffix)
    if reader is None:
        raise ValueError(f"Unknown file type: {input_path.suffix}")

    df = reader(input_path)

    # Extract a subset of the data
    if pct < 1:
        df = df.sample(frac=pct, random_state=RANDOM_SEED)

    # Drop missing reviews first: html.unescape() raises TypeError on NaN.
    # The copy avoids writing into a view of the original frame below.
    df = df[df['review'].notna()].copy()

    # Decode HTML entities back to original characters, remove line breaks
    # and tabs, and trim surrounding whitespace
    df['review'] = df['review'].apply(html.unescape).str.replace(r'[\r\n\t]', '', regex=True).str.strip()

    # Blank out wrong condition values but keep the rows
    wrong_condition = df['condition'].str.contains('users found this comment helpful', na=False)
    df.loc[wrong_condition, 'condition'] = None

    # Remove rows with empty reviews
    df = df[df['review'].notna() & (df['review'] != '"-"') & (df['review'] != '')]

    nlp = spacy.load('en_core_web_sm')

    def lemma(row):
        # Skip if review is empty
        if pd.isnull(row['review']):
            return row

        # Parse the review once and derive both token lists from that parse.
        kept = [
            token for token in nlp(row['review'])
            if not (token.is_punct or token.is_space or token.lemma_.strip() == '')
        ]

        # lemma_w_stpwrd: with stop words, for word2vec and bert embeddings
        row['lemma_w_stpwrd'] = [token.lemma_ for token in kept]
        # lemma_wo_stpwrd: lower without stop words, for BoW and TF-IDF embeddings
        row['lemma_wo_stpwrd'] = [token.lemma_.lower() for token in kept if not token.is_stop]

        # For reviews with only stop words, use lemma_w_stpwrd
        if len(row['lemma_wo_stpwrd']) == 0:
            row['lemma_wo_stpwrd'] = row['lemma_w_stpwrd']
        return row

    procd_df = df.progress_apply(lemma, axis=1)

    return procd_df

#### 1.3 `feature_engineering(procd_data, ngram, bert, bert_pretrained_model, tfidf, RANDOM_SEED)`

In [3]:
def feature_engineering(procd_data, ngram, bert, bert_pretrained_model=None, tfidf=True, RANDOM_SEED=42):
    """Build BERT embeddings and/or BoW and TF-IDF features for tokenized documents.

    Args:
        procd_data: Tokenized documents. The bigram path calls
            ``progress_apply`` on it, so a pandas Series is expected there.
        ngram: 'unigram' or 'bigram'. Only consulted when ``tfidf`` is true.
        bert: When true, compute one BERT embedding array per document.
        bert_pretrained_model: Name or path handed to ``from_pretrained``.
        tfidf: When true, build the dictionary, BoW corpus and TF-IDF corpus.
        RANDOM_SEED: Seed applied to torch before running BERT.

    Returns:
        Tuple ``(num_docs, bert_embeddings, procd_data_bigram, dictionary,
        bow_corpus, tfidf_corpus)``. Every element that was not computed is
        ``None``.

    Raises:
        ValueError: If ``tfidf`` is true and ``ngram`` is not a known value.
    """
    # Default every optional output so the return never hits an unbound name.
    bert_embeddings = None
    procd_data_bigram = None
    dictionary = None
    bow_corpus = None
    tfidf_corpus = None

    # BERT Embeddings
    if bert:
        tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model)
        bert_model = BertModel.from_pretrained(bert_pretrained_model)
        # Move model to GPU if available
        if torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.manual_seed_all(RANDOM_SEED)
        else:
            device = torch.device('cpu')
            torch.manual_seed(RANDOM_SEED)
        torch.backends.cudnn.deterministic = True

        bert_model = bert_model.to(device)
        print(f"Using device: {device}")

        def get_bert_embeddings(text):
            # NOTE(review): `text` is passed to the tokenizer unchanged; when
            # a document is a list of tokens it is presumably treated as a
            # batch of separate strings - confirm this is intended.
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            # Move inputs to GPU if available
            if device.type != 'cpu':
                inputs = {key: val.to(device) for key, val in inputs.items()}

            # Inference only: skip autograd bookkeeping to save memory.
            with torch.no_grad():
                outputs = bert_model(**inputs)
            return outputs.last_hidden_state.mean(dim=1)

        bert_embeddings = [
            get_bert_embeddings(doc).cpu().detach().numpy() for doc in tqdm(procd_data, desc='Generating BERT Embeddings')
        ]

    # Extract BOW and TF-IDF features
    if tfidf:
        if ngram == 'bigram':
            # Add bigrams
            phrase_model = Phrases(procd_data, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
            procd_data_bigram = procd_data.progress_apply(lambda x: phrase_model[x])
            docs = procd_data_bigram
        elif ngram == 'unigram':
            docs = procd_data
        else:
            raise ValueError(f'Unknown ngram: {ngram}')

        dictionary = Dictionary(docs)

        # BoW - built from the same documents as the dictionary so that
        # bigram tokens are actually counted.
        bow_corpus = [dictionary.doc2bow(doc) for doc in tqdm(docs, desc='Generating BoW')]

        # TF-IDF
        tfidf_model = TfidfModel(bow_corpus)
        tfidf_corpus = [tfidf_model[doc] for doc in tqdm(bow_corpus, desc='Generating TF-IDF')]

    return len(procd_data), bert_embeddings, procd_data_bigram, dictionary, bow_corpus, tfidf_corpus
                

#### 1.4 `clustering(bert_embeddings, algorithm, num_clusters, RANDOM_SEED)`

In [4]:
def clustering(bert_embeddings, algorithm, num_clusters, RANDOM_SEED):
    """
    Cluster the input data using the specified algorithm and number of clusters.

    Args:
        bert_embeddings: Iterable with one array per document; each array is
            averaged over axis 0 to give a single vector per document.
        algorithm: 'kmeans' or 'hierarchical'.
        num_clusters: Number of clusters to fit.
        RANDOM_SEED: Random state for KMeans (not passed to the
            hierarchical model).

    Returns:
        Tuple ``(clustering_model, silhouette, davies_bouldin,
        calinski_harabasz)``. The three scores are ``np.nan`` when the fit
        produced a single label.

    Raises:
        ValueError: If ``algorithm`` is not a known clustering algorithm.
    """
    # Prepare the input data for clustering
    bert_embedding_avg = [np.mean(embedding, axis=0) for embedding in bert_embeddings]
    input_data = np.vstack(bert_embedding_avg)

    # Scale the input data
    scaler = StandardScaler()
    input_data = scaler.fit_transform(input_data)

    # Initialize clustering algorithm
    if algorithm == 'kmeans':
        clustering_model = KMeans(
            n_clusters=num_clusters,
            n_init='auto',
            random_state=RANDOM_SEED,
        )
    elif algorithm == 'hierarchical':
        clustering_model = AgglomerativeClustering(
            n_clusters=num_clusters,
        )
    else:
        raise ValueError(f'Unknown clustering algorithm: {algorithm}')

    # Fit the clustering algorithm to the data and get the labels
    clustering_model.fit(input_data)
    labels = clustering_model.labels_

    # Calculate the metrics if possible
    if len(set(labels)) > 1:
        silhouette = silhouette_score(input_data, labels)
        davies_bouldin = davies_bouldin_score(input_data, labels)
        calinski_harabasz = calinski_harabasz_score(input_data, labels)
    else:
        # Bind the same names that are returned below.
        silhouette = davies_bouldin = calinski_harabasz = np.nan

    return clustering_model, silhouette, davies_bouldin, calinski_harabasz

#### 1.5 `topic_modeling(procd_data, corpus, dictionary, algorithm, num_topics, RANDOM_SEED)`

In [5]:
def topic_modeling(procd_data, corpus, dictionary, algorithm, num_topics, RANDOM_SEED):
    """Train an LDA or NMF topic model and score it with c_v coherence.

    Args:
        procd_data: Tokenized documents used as the coherence texts. A
            pandas Series or any other iterable of token lists.
        corpus: Vectorized corpus the model is trained on.
        dictionary: Dictionary passed as ``id2word`` and to the coherence model.
        algorithm: 'lda' or 'nmf'.
        num_topics: Number of topics to fit.
        RANDOM_SEED: Random state for the topic model.

    Returns:
        Tuple ``(topic_model, perplexity, coherence)``. ``perplexity`` is the
        value of ``LdaModel.log_perplexity(corpus)`` for LDA and ``None``
        for NMF.

    Raises:
        ValueError: If ``algorithm`` is not 'lda' or 'nmf'.
    """
    # Reject unknown algorithms before any training starts.
    if algorithm not in ('lda', 'nmf'):
        raise ValueError(f'Unknown topic modeling algorithm: {algorithm}')

    # Set up topic model
    if algorithm == 'lda':
        # LDA Model
        topic_model = LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=RANDOM_SEED,
        )
        perplexity = topic_model.log_perplexity(corpus)
    else:
        # NMF Model
        topic_model = Nmf(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=RANDOM_SEED,
        )
        perplexity = None

    # The coherence model needs a plain list of token lists.
    texts = procd_data.tolist() if hasattr(procd_data, 'tolist') else list(procd_data)

    # Calculate Coherence score
    coherence_model = CoherenceModel(
        model=topic_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
        processes=-1
    )
    coherence = coherence_model.get_coherence()

    return topic_model, perplexity, coherence


def prepare_topic_model_viz(topic_model, dictionary, corpus):
    """Assemble the inputs pyLDAvis needs and return the prepared visualization data.

    Args:
        topic_model: Trained model exposing ``get_topics()``, ``num_topics``
            and ``topic_model[corpus]``.
        dictionary: Mapping from term id to term, indexable by ``0..len-1``.
        corpus: Iterable of documents, each a sequence of ``(term_id, weight)``.

    Returns:
        The object returned by ``pyLDAvis.prepare``.
    """
    # Topic-term weights, one row per topic
    term_weights = topic_model.get_topics()

    # Document-topic weights, one row per document; topics missing from a
    # document's result are filled with 0.0
    n_topics = topic_model.num_topics
    doc_rows = []
    for doc_topics in tqdm(topic_model[corpus]):
        weight_by_topic = dict(doc_topics)
        doc_rows.append([weight_by_topic.get(topic_id, 0.0) for topic_id in range(n_topics)])

    # Rescale every row of both matrices so that it sums to one
    term_weights = term_weights / np.sum(term_weights, axis=1, keepdims=True)
    doc_weights = np.array(doc_rows)
    doc_weights = doc_weights / np.sum(doc_weights, axis=1, keepdims=True)

    # Vocabulary in term-id order
    vocab = [dictionary[term_id] for term_id in range(len(dictionary))]

    # Total weight of each term over the whole corpus
    term_totals = np.zeros(len(vocab))
    for document in corpus:
        for term_id, weight in document:
            term_totals[term_id] += weight

    # Per-document length: the summed weights of its terms
    doc_lengths = np.array([sum(dict(document).values()) for document in corpus])

    # Hand everything to pyLDAvis and return its result
    return pyLDAvis.prepare(
        doc_lengths=doc_lengths,
        vocab=vocab,
        term_frequency=term_totals,
        topic_term_dists=term_weights,
        doc_topic_dists=doc_weights
    )

#### 1.6 `pipeline()`

In [6]:
def pipeline(procd_df, procd_text, ngram, bert_pretrained_model, clustering, clustering_algorithms=None, num_clusters=None, feature='lda', topic_modeling_algorithm='lda', num_topics=8, RANDOM_SEED=42):
    """Run feature engineering, optional clustering and topic modeling end to end.

    Args:
        procd_df: DataFrame whose ``procd_text`` column holds string
            representations of token lists.
        procd_text: Name of the column to use.
        ngram: 'unigram' or 'bigram', forwarded to ``feature_engineering``.
        bert_pretrained_model: BERT model name, used only when clustering.
        clustering: Truthy to cluster documents on BERT embeddings and train
            one topic model per cluster.
        clustering_algorithms: Algorithm name forwarded to ``clustering()``.
        num_clusters: Number of clusters forwarded to ``clustering()``.
        feature: 'bow' or 'tfidf'. NOTE: the default 'lda' is not a valid
            value, so callers have to pass this argument explicitly.
        topic_modeling_algorithm: Forwarded to ``topic_modeling``.
        num_topics: Number of topics per topic model.
        RANDOM_SEED: Seed forwarded to every stage.

    Returns:
        Tuple ``(num_docs, clustering_model, silhouette, calinski_harabasz,
        davies_bouldin, topic_models, coherence, perplexity)``. The four
        clustering entries are ``None`` without clustering. ``topic_models``
        is a dict keyed by cluster label with clustering, otherwise a single
        model. With clustering, coherence and perplexity are means over the
        clusters.

    Raises:
        ValueError: If ``feature`` is neither 'bow' nor 'tfidf'.
        RuntimeError: If clustering is requested but the module-level
            ``clustering`` function is no longer callable.
    """
    # Validate before the expensive feature engineering starts.
    if feature not in ('bow', 'tfidf'):
        raise ValueError(f'Unknown feature: {feature}')

    use_clustering = bool(clustering)
    cluster_fn = None
    if use_clustering:
        # The `clustering` parameter shadows the module-level clustering()
        # function, so the function is looked up through globals().
        cluster_fn = globals().get('clustering')
        if not callable(cluster_fn):
            raise RuntimeError(
                "The module-level name 'clustering' is not callable; "
                "it may have been rebound to a settings value."
            )

    # Load preprocessed text data
    # NOTE(security): eval() executes whatever expression is stored in the
    # column. Only use this on files produced by preprocess(); consider
    # ast.literal_eval for data from any other source.
    procd_data = procd_df[procd_text].progress_apply(lambda x: eval(x))
    procd_data = procd_data[procd_data.progress_apply(lambda row: len(row) > 0)]
    # The corpora built below are plain lists, so give the texts a 0..n-1
    # index that lines up with list positions.
    procd_data = procd_data.reset_index(drop=True)

    # Feature engineering (keywords keep RANDOM_SEED out of the tfidf slot)
    num_docs, bert_embeddings, procd_data_bigram, dictionary, bow_corpus, tfidf_corpus = feature_engineering(
        procd_data,
        ngram,
        use_clustering,
        bert_pretrained_model,
        tfidf=True,
        RANDOM_SEED=RANDOM_SEED
    )

    # With unigrams there is no bigram series; use the plain tokens instead.
    texts = procd_data_bigram if procd_data_bigram is not None else procd_data
    corpus = bow_corpus if feature == 'bow' else tfidf_corpus

    clustering_model = silhouette = davies_bouldin = calinski_harabasz = None

    if use_clustering:
        # Clustering
        clustering_model, silhouette, davies_bouldin, calinski_harabasz = cluster_fn(
            bert_embeddings,
            clustering_algorithms,
            num_clusters,
            RANDOM_SEED
        )

        print(f"Topic modeling with clustering via Bert {clustering_algorithms}...")
        # Get the all the cluster labels
        labels = clustering_model.labels_

        # Add cluster labels to the preprocessed text data
        grouped_procd_data = pd.DataFrame({'cluster_label': labels, 'procd_text': texts}).groupby('cluster_label')

        # Train one topic model per cluster
        topic_models = {}
        coherence_scores = {}
        perplexity_scores = {}

        for label, group in tqdm(grouped_procd_data, desc='Training topic models for each cluster'):
            # Extract the clustered corpus and texts
            clustered_corpus = [corpus[i] for i in group.index]
            clustered_texts = group['procd_text']

            # Train the topic model for this cluster
            topic_model, perplexity, coherence = topic_modeling(
                clustered_texts, clustered_corpus, dictionary, topic_modeling_algorithm, num_topics, RANDOM_SEED
            )

            # Save the topic model and its scores
            topic_models[label] = topic_model
            coherence_scores[label] = coherence
            if perplexity is not None:
                perplexity_scores[label] = perplexity

        coherence = np.mean(list(coherence_scores.values()))
        perplexity = np.mean(list(perplexity_scores.values())) if perplexity_scores else None
        fitted = topic_models

    else:
        print('Topic modeling without clustering...')
        topic_model, perplexity, coherence = topic_modeling(texts, corpus, dictionary, topic_modeling_algorithm, num_topics, RANDOM_SEED)
        fitted = topic_model

    return num_docs, clustering_model, silhouette, calinski_harabasz, davies_bouldin, fitted, coherence, perplexity

### 2 - Topic Modeling Cross-Validation

#### 2.1 Set up path and settings

In [7]:
# Reproducibility seed and location of the raw training data
RANDOM_SEED = 42
train_data_path = '../data/raw/lewtun-drug-reviews/train.jsonl'

# Feature engineering: token columns to compare, n-gram mode, feature type
procd_texts = ['lemma_wo_stpwrd', 'lemma_w_stpwrd']
ngram = 'bigram'
feature = 'tfidf'
#bert_pretrained_model = 'bert-base-uncased'

# Clustering is switched off for the topic-modeling cross-validation.
# NOTE(review): this rebinds the module-level name `clustering`, which is
# also the name of the clustering() function defined in section 1.4. After
# this cell runs, that function can no longer be called by name in the same
# session.
clustering = False
#clustering_algorithms = ['kmeans', 'hierarchical']
#num_clusters = 2

# Topic modeling: algorithm and the topic counts to sweep
topic_modeling_algorithm = 'lda'
nums_topics = [6, 10, 14, 16, 18]

# Candidate model settings: every swept size with stop words kept, followed
# by one configuration without stop words
models = [
    {'procd_text': 'lemma_w_stpwrd', 'num_topics': size} for size in (18, 16, 14, 10, 6)
] + [
    {'procd_text': 'lemma_wo_stpwrd', 'num_topics': 16},
    #{'procd_text': 'lemma_w_stpwrd', 'clustering': True, 'clustering_algorithms': 'kmeans', 'num_clusters': 2, 'topic_modeling_algorithm': 'lda', 'num_topics': 8},
    #{'procd_text': 'lemma_wo_stpwrd', 'clustering': True, 'clustering_algorithms': 'hierarchical', 'num_clusters': 3, 'topic_modeling_algorithm': 'lda', 'num_topics': 12}
]

#### 2.2 Preprocess train data

In [8]:
# Cached output of preprocess() for the training data
procd_df_path = '../cross_validation/procd_train.csv'

# Build the cache only when it does not exist yet
if not os.path.isfile(procd_df_path):
    preprocess(Path(train_data_path), pct=1, RANDOM_SEED=RANDOM_SEED).to_csv(procd_df_path, index=False)

# The frame is read back from the CSV in both cases, whether or not the
# cache was just rebuilt
procd_df = pd.read_csv(procd_df_path)

#### 2.3 Topic modeling cross-validation

In [9]:
# 5-fold cross-validation of the topic models: for every fold and every token
# column, features are built on the training split, one topic model is
# trained per topic count, and train/test coherence is logged through dvclive.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# The feature type is fixed for the whole run, so validate it once up front
# instead of inside the innermost loop.
if feature not in ('bow', 'tfidf'):
    raise ValueError(f'Unknown feature: {feature}')

with Live(dir='../cross_validation', report='html') as live:
    # enumerate() hides the length from tqdm, so pass the fold count explicitly.
    for fold, (train, test) in tqdm(enumerate(kf.split(procd_df)), total=kf.get_n_splits(), desc='Folds'):
        print(f'Fold {fold}')
        train_procd_df = procd_df.iloc[train]
        test_procd_df = procd_df.iloc[test]

        for procd_text in tqdm(procd_texts, desc='Preprocessed Text'):
            print(procd_text)
            # Load preprocessed text data
            # NOTE(security): eval() executes whatever expression is stored in
            # the column. Only use it on files produced by preprocess();
            # consider ast.literal_eval for data from any other source.
            train_procd_data = train_procd_df[procd_text].progress_apply(lambda x: eval(x))
            train_procd_data = train_procd_data[train_procd_data.progress_apply(lambda row: len(row) > 0)]

            test_procd_data = test_procd_df[procd_text].progress_apply(lambda x: eval(x))
            test_procd_data = test_procd_data[test_procd_data.progress_apply(lambda row: len(row) > 0)]

            # Extract features from the training data
            train_num_docs, _, procd_data_bigram, dictionary, train_bow_corpus, train_tfidf_corpus = feature_engineering(
                procd_data = train_procd_data,
                ngram = ngram,
                bert = False,
                bert_pretrained_model = None,
                tfidf = True,
                RANDOM_SEED = RANDOM_SEED
            )

            # Loop-invariant choices for this token column
            train_corpus = train_bow_corpus if feature == 'bow' else train_tfidf_corpus
            # feature_engineering returns no bigram series for unigrams; fall
            # back to the plain training tokens in that case.
            train_texts = procd_data_bigram if procd_data_bigram is not None else train_procd_data

            # Train the topic models
            for num_topics in tqdm(nums_topics, desc='Training topic models'):
                # Small topic counts are skipped for the stop-word-free column
                if (num_topics <= 12) and (procd_text == 'lemma_wo_stpwrd'): continue
                print(f'{num_topics} topics')

                topic_model, perplexity, coherence = topic_modeling(
                    train_texts, train_corpus, dictionary, topic_modeling_algorithm, num_topics, RANDOM_SEED
                )

                live.log_param('Preprocessed Text', procd_text)
                live.log_param('ngram', ngram)
                live.log_param('Feature', feature)
                live.log_param('Topic Modeling Algorithm', topic_modeling_algorithm)
                live.log_param('Num of Topics', num_topics)

                live.log_metric('Fold', fold)
                live.log_metric(f'Num of Docs - {procd_text} - {num_topics} - Train', train_num_docs)
                live.log_metric(f'Coherence - {procd_text} - {num_topics} - Train', coherence)
                if perplexity is not None: live.log_metric(f'Perplexity - {procd_text} - {num_topics} - Train', perplexity)

                # Evaluate the model: coherence of the trained topics on the
                # held-out texts.
                # NOTE(review): the test texts are the raw token lists and are
                # not passed through the phrase model, while the dictionary
                # may contain bigram tokens - confirm this is intended.
                coherence_model = CoherenceModel(
                    model=topic_model,
                    texts=test_procd_data.tolist(),
                    dictionary=dictionary,
                    coherence='c_v',
                    processes=-1
                )
                coherence = coherence_model.get_coherence()
                live.log_metric(f'Coherence - {procd_text} - {num_topics} - Test', coherence)

        # One dvclive step per fold
        live.next_step()

Folds: 0it [00:00, ?it/s]

Fold 0



Preprocessed Text:   0% 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  2% 2013/129036 [00:00<00:06, 20120.52it/s]

  3% 4088/129036 [00:00<00:06, 20488.27it/s]

  5% 6137/129036 [00:00<00:06, 20308.91it/s]

  6% 8172/129036 [00:00<00:05, 20319.25it/s]

  8% 10221/129036 [00:00<00:05, 20377.76it/s]

 10% 12289/129036 [00:00<00:05, 20476.71it/s]

 11% 14337/129036 [00:00<00:05, 20435.31it/s]

 13% 16381/129036 [00:00<00:05, 20324.92it/s]

 14% 18414/129036 [00:00<00:05, 20203.00it/s]

 16% 20453/129036 [00:01<00:05, 20259.52it/s]

 17% 22483/129036 [00:01<00:05, 20267.40it/s]

 19% 24510/129036 [00:01<00:05, 20235.89it/s]

 21% 26534/129036 [00:01<00:05, 20196.92it/s]

 22% 28555/129036 [00:01<00:04, 20200.79it/s]

 24% 30594/129036 [00:01<00:04, 20257.21it/s]

 25% 32645/129036 [00:01<00:04, 20330.05it/s]

 27% 34679/129036 [00:01<00:04, 19786.73it/s]

 28% 36752/129036 [00:01<00:04, 20060.82it/s]

 30% 38873/129036 [00:01<00:04, 20400.63it/s]

 32% 40929/129036 [00:02<00:04, 20446.01it/s]

 33% 42976/129036 [00:02<00:04

14 topics




Training topic models:  60% 3/5 [00:49<00:32, 16.35s/it]

16 topics




Training topic models:  80% 4/5 [02:46<00:48, 48.95s/it]

18 topics




Training topic models: 100% 5/5 [04:36<00:00, 55.37s/it]

Preprocessed Text:  50% 1/2 [05:05<05:05, 305.71s/it]

lemma_w_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  1% 997/129036 [00:00<00:12, 9954.80it/s]

  2% 1993/129036 [00:00<00:12, 9897.21it/s]

  2% 2983/129036 [00:00<00:12, 9879.52it/s]

  3% 3989/129036 [00:00<00:12, 9944.77it/s]

  4% 4984/129036 [00:00<00:12, 9810.19it/s]

  5% 5966/129036 [00:00<00:12, 9673.31it/s]

  5% 6934/129036 [00:00<00:12, 9653.95it/s]

  6% 7900/129036 [00:00<00:12, 9655.64it/s]

  7% 8889/129036 [00:00<00:12, 9724.85it/s]

  8% 9887/129036 [00:01<00:12, 9801.51it/s]

  8% 10898/129036 [00:01<00:11, 9893.40it/s]

  9% 11888/129036 [00:01<00:11, 9789.12it/s]

 10% 12868/129036 [00:01<00:11, 9756.33it/s]

 11% 13847/129036 [00:01<00:11, 9766.07it/s]

 11% 14824/129036 [00:01<00:11, 9708.55it/s]

 12% 15796/129036 [00:01<00:11, 9600.35it/s]

 13% 16783/129036 [00:01<00:11, 9679.04it/s]

 14% 17764/129036 [00:01<00:11, 9716.28it/s]

 15% 18751/129036 [00:01<00:11, 9759.82it/s]

 15% 19740/129036 [00:02<00:11, 9797.34it/s]

 16% 20720/129036 [00:02<00:11, 9770.31it/s]

 17% 21698/

6 topics




Training topic models:  20% 1/5 [00:42<02:49, 42.44s/it]

10 topics




Training topic models:  40% 2/5 [01:31<02:19, 46.38s/it]

14 topics




Training topic models:  60% 3/5 [03:43<02:50, 85.26s/it]

16 topics




Training topic models:  80% 4/5 [06:19<01:53, 113.52s/it]

18 topics




Training topic models: 100% 5/5 [07:26<00:00, 89.34s/it] 

Preprocessed Text: 100% 2/2 [13:32<00:00, 406.11s/it]
Folds: 1it [13:32, 812.29s/it]

Fold 1



Preprocessed Text:   0% 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  2% 2073/129036 [00:00<00:06, 20728.04it/s]

  3% 4176/129036 [00:00<00:05, 20903.41it/s]

  5% 6267/129036 [00:00<00:05, 20733.99it/s]

  6% 8341/129036 [00:00<00:05, 20627.12it/s]

  8% 10449/129036 [00:00<00:05, 20788.19it/s]

 10% 12541/129036 [00:00<00:05, 20831.89it/s]

 11% 14641/129036 [00:00<00:05, 20877.62it/s]

 13% 16729/129036 [00:00<00:05, 20675.84it/s]

 15% 18843/129036 [00:00<00:05, 20817.34it/s]

 16% 20946/129036 [00:01<00:05, 20880.25it/s]

 18% 23069/129036 [00:01<00:05, 20983.99it/s]

 20% 25168/129036 [00:01<00:04, 20789.11it/s]

 21% 27248/129036 [00:01<00:04, 20764.64it/s]

 23% 29337/129036 [00:01<00:04, 20800.25it/s]

 24% 31436/129036 [00:01<00:04, 20855.01it/s]

 26% 33522/129036 [00:01<00:04, 20614.57it/s]

 28% 35602/129036 [00:01<00:04, 20669.01it/s]

 29% 37685/129036 [00:01<00:04, 20714.17it/s]

 31% 39763/129036 [00:01<00:04, 20731.98it/s]

 32% 41837/129036 [00:02<00:04, 20511.88it/s]

 34% 43931/129036 [00:02<00:04

14 topics




Training topic models:  60% 3/5 [00:50<00:33, 16.70s/it]

16 topics




Training topic models:  80% 4/5 [02:40<00:46, 46.86s/it]

18 topics




Training topic models: 100% 5/5 [03:35<00:00, 43.12s/it]

Preprocessed Text:  50% 1/2 [04:04<04:04, 244.63s/it]

lemma_w_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  1% 1359/129036 [00:00<00:09, 13588.72it/s]

  2% 2718/129036 [00:00<00:09, 13464.12it/s]

  3% 4065/129036 [00:00<00:09, 13399.55it/s]

  4% 5417/129036 [00:00<00:09, 13442.61it/s]

  5% 6762/129036 [00:00<00:09, 13369.78it/s]

  6% 8100/129036 [00:00<00:09, 13323.53it/s]

  7% 9433/129036 [00:00<00:09, 13281.14it/s]

  8% 10780/129036 [00:00<00:08, 13337.96it/s]

  9% 12114/129036 [00:00<00:08, 13169.57it/s]

 10% 13447/129036 [00:01<00:08, 13214.96it/s]

 11% 14769/129036 [00:01<00:08, 13183.48it/s]

 12% 16099/129036 [00:01<00:08, 13218.15it/s]

 14% 17422/129036 [00:01<00:08, 13217.62it/s]

 15% 18769/129036 [00:01<00:08, 13291.53it/s]

 16% 20099/129036 [00:01<00:08, 13213.54it/s]

 17% 21421/129036 [00:01<00:08, 13197.50it/s]

 18% 22778/129036 [00:01<00:07, 13307.95it/s]

 19% 24109/129036 [00:01<00:07, 13177.09it/s]

 20% 25438/129036 [00:01<00:07, 13207.90it/s]

 21% 26762/129036 [00:02<00:07, 13214.71it/s]

 22% 28103/129036 [00:02<00:07, 1

6 topics




Training topic models:  20% 1/5 [00:42<02:51, 42.98s/it]

10 topics




Training topic models:  40% 2/5 [03:07<05:07, 102.60s/it]

14 topics




Training topic models:  60% 3/5 [03:58<02:38, 79.07s/it] 

16 topics




Training topic models:  80% 4/5 [06:26<01:46, 106.52s/it]

18 topics




Training topic models: 100% 5/5 [07:36<00:00, 91.20s/it] 

Preprocessed Text: 100% 2/2 [12:30<00:00, 375.23s/it]
Folds: 2it [26:02, 775.98s/it]

Fold 2



Preprocessed Text:   0% 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  2% 2916/129036 [00:00<00:04, 29156.62it/s]

  5% 5832/129036 [00:00<00:04, 28649.77it/s]

  7% 8709/129036 [00:00<00:04, 28703.41it/s]

  9% 11675/129036 [00:00<00:04, 29072.64it/s]

 11% 14583/129036 [00:00<00:03, 29033.77it/s]

 14% 17487/129036 [00:00<00:03, 28989.03it/s]

 16% 20418/129036 [00:00<00:03, 29091.53it/s]

 18% 23328/129036 [00:00<00:03, 28988.67it/s]

 20% 26228/129036 [00:00<00:03, 28967.57it/s]

 23% 29125/129036 [00:01<00:03, 28818.24it/s]

 25% 32027/129036 [00:01<00:03, 28879.06it/s]

 27% 34916/129036 [00:01<00:03, 28810.24it/s]

 29% 37798/129036 [00:01<00:03, 28761.03it/s]

 32% 40698/129036 [00:01<00:03, 28832.54it/s]

 34% 43613/129036 [00:01<00:02, 28927.67it/s]

 36% 46513/129036 [00:01<00:02, 28948.55it/s]

 38% 49414/129036 [00:01<00:02, 28963.11it/s]

 41% 52350/129036 [00:01<00:02, 29080.11it/s]

 43% 55262/129036 [00:01<00:02, 29091.51it/s]

 45% 58188/129036 [00:02<00:02, 29140.93it/s]

 47% 61107/129036 [00:02<00:0

14 topics




Training topic models:  60% 3/5 [00:53<00:35, 17.81s/it]

16 topics




Training topic models:  80% 4/5 [01:47<00:29, 29.37s/it]

18 topics




Training topic models: 100% 5/5 [02:44<00:00, 32.87s/it]

Preprocessed Text:  50% 1/2 [03:11<03:11, 191.66s/it]

lemma_w_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  1% 1339/129036 [00:00<00:09, 13379.04it/s]

  2% 2688/129036 [00:00<00:09, 13443.17it/s]

  3% 4044/129036 [00:00<00:09, 13494.57it/s]

  4% 5394/129036 [00:00<00:09, 13463.25it/s]

  5% 6741/129036 [00:00<00:09, 13257.48it/s]

  6% 8068/129036 [00:00<00:09, 13247.68it/s]

  7% 9421/129036 [00:00<00:08, 13338.68it/s]

  8% 10799/129036 [00:00<00:08, 13477.43it/s]

  9% 12148/129036 [00:00<00:08, 13353.85it/s]

 10% 13520/129036 [00:01<00:08, 13463.69it/s]

 12% 14880/129036 [00:01<00:08, 13502.93it/s]

 13% 16231/129036 [00:01<00:08, 13472.82it/s]

 14% 17579/129036 [00:01<00:08, 13463.12it/s]

 15% 18926/129036 [00:01<00:08, 13421.32it/s]

 16% 20269/129036 [00:01<00:08, 13227.96it/s]

 17% 21594/129036 [00:01<00:08, 13232.53it/s]

 18% 22960/129036 [00:01<00:07, 13358.09it/s]

 19% 24297/129036 [00:01<00:07, 13275.44it/s]

 20% 25646/129036 [00:01<00:07, 13337.89it/s]

 21% 26997/129036 [00:02<00:07, 13385.32it/s]

 22% 28336/129036 [00:02<00:07, 1

6 topics




Training topic models:  20% 1/5 [00:44<02:58, 44.74s/it]

10 topics




Training topic models:  40% 2/5 [01:46<02:44, 54.71s/it]

14 topics




Training topic models:  60% 3/5 [04:06<03:06, 93.48s/it]

16 topics




Training topic models:  80% 4/5 [06:33<01:54, 114.63s/it]

18 topics




Training topic models: 100% 5/5 [09:06<00:00, 109.37s/it]

Preprocessed Text: 100% 2/2 [13:08<00:00, 394.04s/it]
Folds: 3it [39:11, 781.54s/it]

Fold 3



Preprocessed Text:   0% 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  2% 2910/129036 [00:00<00:04, 29098.29it/s]

  5% 5820/129036 [00:00<00:04, 29042.74it/s]

  7% 8725/129036 [00:00<00:04, 28754.84it/s]

  9% 11675/129036 [00:00<00:04, 29044.77it/s]

 11% 14580/129036 [00:00<00:03, 28985.00it/s]

 14% 17479/129036 [00:00<00:03, 28793.57it/s]

 16% 20407/129036 [00:00<00:03, 28951.21it/s]

 18% 23303/129036 [00:00<00:03, 28947.49it/s]

 20% 26198/129036 [00:00<00:03, 28639.94it/s]

 23% 29086/129036 [00:01<00:03, 28711.68it/s]

 25% 31993/129036 [00:01<00:03, 28820.42it/s]

 27% 34876/129036 [00:01<00:03, 28709.05it/s]

 29% 37814/129036 [00:01<00:03, 28907.89it/s]

 32% 40706/129036 [00:01<00:03, 28765.00it/s]

 34% 43589/129036 [00:01<00:02, 28783.84it/s]

 36% 46522/129036 [00:01<00:02, 28946.22it/s]

 38% 49417/129036 [00:01<00:02, 28787.54it/s]

 41% 52297/129036 [00:02<00:06, 11009.86it/s]

 43% 55233/129036 [00:02<00:05, 13583.46it/s]

 45% 58142/129036 [00:02<00:04, 16171.08it/s]

 47% 61112/129036 [00:02<00:0

14 topics




Training topic models:  60% 3/5 [00:54<00:36, 18.06s/it]

16 topics




Training topic models:  80% 4/5 [01:46<00:29, 29.25s/it]

18 topics




Training topic models: 100% 5/5 [02:43<00:00, 32.78s/it]

Preprocessed Text:  50% 1/2 [03:09<03:09, 189.39s/it]

lemma_w_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  1% 1334/129036 [00:00<00:09, 13335.50it/s]

  2% 2680/129036 [00:00<00:09, 13402.15it/s]

  3% 4065/129036 [00:00<00:09, 13604.44it/s]

  4% 5426/129036 [00:00<00:09, 13452.12it/s]

  5% 6772/129036 [00:00<00:09, 13312.46it/s]

  6% 8104/129036 [00:00<00:09, 13181.13it/s]

  7% 9442/129036 [00:00<00:09, 13243.76it/s]

  8% 10812/129036 [00:00<00:08, 13386.38it/s]

  9% 12154/129036 [00:00<00:08, 13393.64it/s]

 10% 13494/129036 [00:01<00:08, 13280.99it/s]

 11% 14825/129036 [00:01<00:08, 13288.31it/s]

 13% 16155/129036 [00:01<00:08, 13248.08it/s]

 14% 17489/129036 [00:01<00:08, 13272.44it/s]

 15% 18834/129036 [00:01<00:08, 13325.51it/s]

 16% 20193/129036 [00:01<00:08, 13404.45it/s]

 17% 21534/129036 [00:01<00:08, 13256.29it/s]

 18% 22894/129036 [00:01<00:07, 13357.65it/s]

 19% 24231/129036 [00:01<00:07, 13248.80it/s]

 20% 25571/129036 [00:01<00:07, 13291.24it/s]

 21% 26901/129036 [00:02<00:07, 13287.19it/s]

 22% 28234/129036 [00:02<00:07, 1

6 topics




Training topic models:  20% 1/5 [01:00<04:01, 60.47s/it]

10 topics




Training topic models:  40% 2/5 [01:49<02:42, 54.03s/it]

14 topics




Training topic models:  60% 3/5 [04:09<03:06, 93.14s/it]

16 topics




Training topic models:  80% 4/5 [06:33<01:52, 112.99s/it]

18 topics




Training topic models: 100% 5/5 [07:45<00:00, 93.01s/it] 

Preprocessed Text: 100% 2/2 [11:44<00:00, 352.09s/it]
Folds: 4it [50:55, 751.04s/it]

Fold 4



Preprocessed Text:   0% 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  2% 2918/129036 [00:00<00:04, 29168.97it/s]

  5% 5835/129036 [00:00<00:04, 28954.33it/s]

  7% 8731/129036 [00:00<00:04, 28598.97it/s]

  9% 11612/129036 [00:00<00:04, 28678.52it/s]

 11% 14508/129036 [00:00<00:03, 28777.44it/s]

 13% 17387/129036 [00:00<00:03, 28662.07it/s]

 16% 20287/129036 [00:00<00:03, 28768.83it/s]

 18% 23185/129036 [00:00<00:03, 28835.71it/s]

 20% 26069/129036 [00:00<00:03, 28753.70it/s]

 22% 28957/129036 [00:01<00:03, 28791.96it/s]

 25% 31843/129036 [00:01<00:03, 28812.53it/s]

 27% 34725/129036 [00:01<00:03, 28603.82it/s]

 29% 37629/129036 [00:01<00:03, 28732.91it/s]

 31% 40503/129036 [00:01<00:03, 28702.72it/s]

 34% 43374/129036 [00:01<00:03, 28391.92it/s]

 36% 46215/129036 [00:01<00:02, 28390.70it/s]

 38% 49055/129036 [00:01<00:02, 28325.92it/s]

 40% 51888/129036 [00:01<00:02, 28096.74it/s]

 42% 54757/129036 [00:01<00:02, 28272.97it/s]

 45% 57585/129036 [00:02<00:02, 28190.79it/s]

 47% 60405/129036 [00:02<00:0

14 topics




Training topic models:  60% 3/5 [00:52<00:35, 17.59s/it]

16 topics




Training topic models:  80% 4/5 [01:46<00:29, 29.11s/it]

18 topics




Training topic models: 100% 5/5 [03:46<00:00, 45.36s/it]

Preprocessed Text:  50% 1/2 [04:12<04:12, 252.94s/it]

lemma_w_stpwrd




  0% 0/129036 [00:00<?, ?it/s]

  1% 1346/129036 [00:00<00:09, 13447.64it/s]

  2% 2691/129036 [00:00<00:09, 13326.25it/s]

  3% 4034/129036 [00:00<00:09, 13371.32it/s]

  4% 5372/129036 [00:00<00:09, 13271.74it/s]

  5% 6700/129036 [00:00<00:09, 13131.84it/s]

  6% 8014/129036 [00:00<00:09, 13091.58it/s]

  7% 9326/129036 [00:00<00:09, 13098.59it/s]

  8% 10706/129036 [00:00<00:08, 13319.50it/s]

  9% 12039/129036 [00:00<00:08, 13214.74it/s]

 10% 13383/129036 [00:01<00:08, 13282.96it/s]

 11% 14715/129036 [00:01<00:08, 13292.75it/s]

 12% 16045/129036 [00:01<00:08, 13115.82it/s]

 13% 17366/129036 [00:01<00:08, 13143.01it/s]

 14% 18699/129036 [00:01<00:08, 13194.80it/s]

 16% 20064/129036 [00:01<00:08, 13330.78it/s]

 17% 21398/129036 [00:01<00:08, 13311.65it/s]

 18% 22777/129036 [00:01<00:07, 13452.40it/s]

 19% 24123/129036 [00:01<00:07, 13270.57it/s]

 20% 25476/129036 [00:01<00:07, 13345.54it/s]

 21% 26812/129036 [00:02<00:07, 13338.62it/s]

 22% 28152/129036 [00:02<00:07, 1

6 topics




Training topic models:  20% 1/5 [00:52<03:28, 52.16s/it]

10 topics




Training topic models:  40% 2/5 [02:33<04:03, 81.29s/it]

14 topics




Training topic models:  60% 3/5 [03:41<02:30, 75.14s/it]

16 topics




Training topic models:  80% 4/5 [04:49<01:12, 72.08s/it]

18 topics




Training topic models: 100% 5/5 [06:02<00:00, 72.59s/it]

Preprocessed Text: 100% 2/2 [11:18<00:00, 339.12s/it]
Folds: 5it [1:02:13, 746.73s/it]
    You can remove it from Git, then add to DVC.
        To stop tracking from Git:
            git rm -r --cached '../data/preprocessed/procd_sample_train.csv'
            git commit -m "stop tracking ../data/preprocessed/procd_sample_train.csv" 


### 3 - Clustering Cross-Validation

#### 3.1 Set up paths and settings

In [7]:
# Seed passed to the fold splitter, preprocessing, feature engineering and clustering below.
RANDOM_SEED = 42
# Raw training split (JSON Lines), given relative to the notebook's location.
train_data_path = "../data/raw/lewtun-drug-reviews/train.jsonl"

# --- Feature engineering ---
# Columns of the preprocessed frame to compare; each one is evaluated in turn.
# (presumably lemmas without / with stop words -- confirm against `preprocess`)
procd_texts = [
    "lemma_wo_stpwrd",
    "lemma_w_stpwrd",
]
# Name of the pretrained BERT checkpoint handed to the feature-engineering step.
bert_pretrained_model = "bert-base-uncased"

# --- Clustering ---
# Every algorithm is run with every cluster count (2 through 5).
clustering_algorithms = ["kmeans", "hierarchical"]
nums_clusters = list(range(2, 6))

#### 3.2 Preprocess train data

In [12]:
procd_df_path = '../cross_validation/procd_train.csv'

# Build the preprocessed CSV only when it is not cached on disk yet.
if not os.path.isfile(procd_df_path):
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the cache folder exists before writing (first run on a fresh clone).
    Path(procd_df_path).parent.mkdir(parents=True, exist_ok=True)
    preprocess(Path(train_data_path), pct=1, RANDOM_SEED=RANDOM_SEED).to_csv(procd_df_path, index=False)

# Always load from the CSV, even right after writing it, so a fresh run and a
# cached run hand the later cells the same CSV-serialised column values.
procd_df = pd.read_csv(procd_df_path)

#### 3.3 Clustering cross-validation

In [13]:
def _load_token_lists(frame, column):
    """Parse the stringified token lists in `column` and drop empty documents.

    Parameters:
        frame: DataFrame slice for one fold.
        column: name of the preprocessed-text column to load.

    Returns:
        A Series holding the parsed values, restricted to rows whose parsed
        value has a non-zero length.
    """
    # NOTE(review): eval() executes whatever expression is stored in the CSV.
    # That is acceptable only while the file is produced by this project; if
    # it can ever come from elsewhere, switch to ast.literal_eval.
    parsed = frame[column].progress_apply(lambda x: eval(x))
    return parsed[parsed.progress_apply(lambda row: len(row) > 0)]


def _log_cluster_metrics(live, split, procd_text, clustering_algorithm, num_clusters,
                         num_docs, silhouette, davies_bouldin, calinski_harabasz):
    """Log the document count and the three cluster-quality scores for one split.

    `split` is the label appended to each metric name ('Train' or 'Test').
    """
    combo = f'{procd_text} - {clustering_algorithm} - {num_clusters} - {split}'
    live.log_metric(f'Num of Docs - {split}', num_docs)
    live.log_metric(f'Silhouette - {combo}', silhouette)
    live.log_metric(f'Davies_Bouldin - {combo}', davies_bouldin)
    live.log_metric(f'Calinski_Harabasz - {combo}', calinski_harabasz)


kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

with Live(dir='../cross_validation/cluster', report='html') as live:
    # enumerate() hides the iterator length from tqdm, so pass the fold count
    # explicitly; otherwise the bar only ever shows "0it".
    for fold, (train, test) in tqdm(enumerate(kf.split(procd_df)), total=kf.get_n_splits(), desc='Folds'):
        print(f'Fold {fold}')
        train_procd_df = procd_df.iloc[train]
        test_procd_df = procd_df.iloc[test]

        for procd_text in tqdm(procd_texts, desc='Preprocessed Text'):
            print(procd_text)
            # Load preprocessed text data
            train_procd_data = _load_token_lists(train_procd_df, procd_text)
            test_procd_data = _load_token_lists(test_procd_df, procd_text)

            # Extract bert embeddings from the training data
            train_num_docs, train_bert_embeddings, _, _, _, _ = feature_engineering(
                procd_data=train_procd_data,
                ngram=None,
                bert=True,
                bert_pretrained_model=bert_pretrained_model,
                tfidf=False,
                RANDOM_SEED=RANDOM_SEED
            )

            # Extract bert embeddings from the test data
            test_num_docs, test_bert_embeddings, _, _, _, _ = feature_engineering(
                procd_data=test_procd_data,
                ngram=None,
                bert=True,
                bert_pretrained_model=bert_pretrained_model,
                tfidf=False,
                RANDOM_SEED=RANDOM_SEED
            )
            # Prepare the input test data for clustering: one mean vector per document
            test_bert_embedding_avg = [np.mean(embedding, axis=0) for embedding in test_bert_embeddings]
            test_data = np.vstack(test_bert_embedding_avg)
            # Scale the test data
            # NOTE(review): the scaler is fit on the test fold itself. Whether that
            # matches how `clustering` scales the training embeddings cannot be
            # seen from this cell -- confirm before comparing kmeans train/test scores.
            scaler = StandardScaler()
            test_data = scaler.fit_transform(test_data)

            # Train the clustering models
            # NOTE(review): the recorded run of this cell stopped with a MemoryError
            # during the 'hierarchical' step on the full training fold.
            for clustering_algorithm in tqdm(clustering_algorithms, desc='Training clustering models'):
                print(clustering_algorithm)
                for num_clusters in tqdm(nums_clusters):
                    print(f'{num_clusters} clusters')

                    clustering_model, silhouette, davies_bouldin, calinski_harabasz = clustering(
                        bert_embeddings=train_bert_embeddings,
                        algorithm=clustering_algorithm,
                        num_clusters=num_clusters,
                        RANDOM_SEED=RANDOM_SEED
                    )

                    live.log_param('Preprocessed Text', procd_text)
                    live.log_param('Clustering Algorithm', clustering_algorithm)
                    live.log_param('Num of Clusters', num_clusters)

                    live.log_metric('Fold', fold)
                    _log_cluster_metrics(
                        live, 'Train', procd_text, clustering_algorithm, num_clusters,
                        train_num_docs, silhouette, davies_bouldin, calinski_harabasz
                    )

                    # Evaluate the model
                    # Predict the clustering labels for the test data
                    if clustering_algorithm == 'kmeans':
                        labels = clustering_model.predict(test_data)
                    elif clustering_algorithm == 'hierarchical':
                        # No predict() is used here: the model is re-fit on the test
                        # fold, so these scores describe a fresh fit, not a transfer
                        # of the training clusters.
                        labels = clustering_model.fit_predict(test_data)
                    else:
                        raise ValueError(f'Unknown clustering algorithm: {clustering_algorithm}')

                    # Calculate the metrics if possible. The test scores live in
                    # their own variables so a degenerate single-cluster result is
                    # logged as NaN instead of silently reusing the train scores.
                    if len(set(labels)) > 1:
                        test_silhouette = silhouette_score(test_data, labels)
                        test_davies_bouldin = davies_bouldin_score(test_data, labels)
                        test_calinski_harabasz = calinski_harabasz_score(test_data, labels)
                    else:
                        test_silhouette = test_davies_bouldin = test_calinski_harabasz = np.nan

                    _log_cluster_metrics(
                        live, 'Test', procd_text, clustering_algorithm, num_clusters,
                        test_num_docs, test_silhouette, test_davies_bouldin, test_calinski_harabasz
                    )

        # One DVCLive step per fold
        live.next_step()

Folds: 0it [00:00, ?it/s]

Fold 0



Preprocessed Text:   0%|          | 0/2 [00:00<?, ?it/s]

lemma_wo_stpwrd




  0%|          | 0/129036 [00:00<?, ?it/s]

  2%|▏         | 1993/129036 [00:00<00:06, 19824.66it/s]

  3%|▎         | 3976/129036 [00:00<00:06, 18892.81it/s]

  5%|▍         | 5982/129036 [00:00<00:06, 19330.10it/s]

  6%|▌         | 7918/129036 [00:00<00:06, 18585.12it/s]

  8%|▊         | 9811/129036 [00:00<00:06, 18683.48it/s]

  9%|▉         | 11980/129036 [00:00<00:05, 19649.69it/s]

 11%|█         | 13950/129036 [00:00<00:06, 19166.38it/s]

 12%|█▏        | 15872/129036 [00:00<00:06, 18812.28it/s]

 14%|█▍        | 17977/129036 [00:00<00:05, 19457.62it/s]

 16%|█▌        | 20159/129036 [00:01<00:05, 20136.89it/s]

 17%|█▋        | 22178/129036 [00:01<00:05, 19996.77it/s]

 19%|█▊        | 24183/129036 [00:01<00:05, 19988.49it/s]

 20%|██        | 26389/129036 [00:01<00:04, 20576.46it/s]

 22%|██▏       | 28764/129036 [00:01<00:04, 21488.22it/s]

 24%|██▍       | 30970/129036 [00:01<00:04, 21600.22it/s]

 26%|██▌       | 33219/129036 [00:01<00:04, 21858.68it/s]

 28%|██▊       

Using device: cuda




Generating BERT Embeddings:   0%|          | 0/129036 [00:00<?, ?it/s]

Generating BERT Embeddings:   0%|          | 6/129036 [00:00<37:05, 57.99it/s]

Generating BERT Embeddings:   0%|          | 15/129036 [00:00<29:35, 72.67it/s]

Generating BERT Embeddings:   0%|          | 25/129036 [00:00<26:24, 81.42it/s]

Generating BERT Embeddings:   0%|          | 34/129036 [00:00<25:33, 84.15it/s]

Generating BERT Embeddings:   0%|          | 43/129036 [00:00<26:02, 82.57it/s]

Generating BERT Embeddings:   0%|          | 52/129036 [00:00<28:35, 75.17it/s]

Generating BERT Embeddings:   0%|          | 62/129036 [00:00<26:55, 79.86it/s]

Generating BERT Embeddings:   0%|          | 72/129036 [00:00<25:27, 84.41it/s]

Generating BERT Embeddings:   0%|          | 81/129036 [00:00<25:01, 85.89it/s]

Generating BERT Embeddings:   0%|          | 90/129036 [00:01<25:51, 83.10it/s]

Generating BERT Embeddings:   0%|          | 99/129036 [00:01<26:29, 81.12it/s]

Generating BERT Embeddings:   0%|   

Using device: cuda




Generating BERT Embeddings:   0%|          | 0/32259 [00:00<?, ?it/s]

Generating BERT Embeddings:   0%|          | 6/32259 [00:00<09:00, 59.63it/s]

Generating BERT Embeddings:   0%|          | 13/32259 [00:00<08:18, 64.75it/s]

Generating BERT Embeddings:   0%|          | 21/32259 [00:00<07:35, 70.81it/s]

Generating BERT Embeddings:   0%|          | 30/32259 [00:00<07:03, 76.13it/s]

Generating BERT Embeddings:   0%|          | 41/32259 [00:00<06:18, 85.15it/s]

Generating BERT Embeddings:   0%|          | 51/32259 [00:00<06:07, 87.55it/s]

Generating BERT Embeddings:   0%|          | 60/32259 [00:00<06:09, 87.06it/s]

Generating BERT Embeddings:   0%|          | 69/32259 [00:00<06:14, 85.97it/s]

Generating BERT Embeddings:   0%|          | 79/32259 [00:00<06:00, 89.22it/s]

Generating BERT Embeddings:   0%|          | 89/32259 [00:01<05:58, 89.86it/s]

Generating BERT Embeddings:   0%|          | 98/32259 [00:01<05:58, 89.74it/s]

Generating BERT Embeddings:   0%|          | 107

kmeans





  0%|          | 0/4 [00:00<?, ?it/s]

2 clusters





 25%|██▌       | 1/4 [04:44<14:13, 284.35s/it]

3 clusters





 50%|█████     | 2/4 [09:16<09:14, 277.07s/it]

4 clusters





 75%|███████▌  | 3/4 [13:51<04:36, 276.31s/it]

5 clusters





100%|██████████| 4/4 [18:25<00:00, 276.33s/it]


Training clustering models:  50%|█████     | 1/2 [18:25<18:25, 1105.34s/it]

hierarchical





  0%|          | 0/4 [00:00<?, ?it/s]

2 clusters


  0%|          | 0/4 [47:12<?, ?it/s]
Training clustering models:  50%|█████     | 1/2 [1:05:37<1:05:37, 3937.82s/it]
Preprocessed Text:   0%|          | 0/2 [1:34:19<?, ?it/s]
Folds: 0it [1:34:19, ?it/s]
    You can remove it from Git, then add to DVC.
        To stop tracking from Git:
            git rm -r --cached '..\data\preprocessed\procd_sample_train.csv'
            git commit -m "stop tracking ..\data\preprocessed\procd_sample_train.csv" 


MemoryError: unable to allocate array data.