In [1]:
import ast
import pickle

import pandas as pd
from scipy.stats import spearmanr

Load in all the required files

In [2]:
CM_df = pd.read_csv('example_data/example_CM_inverter_records_with_embeddings.csv')
# The embeddings are stored in the CSV as the text of a Python list of tuples
# (e.g. "[(0, 1), (3, 2)]"), so each one has to be parsed back into a real list of tuples.
# ast.literal_eval only evaluates literals (it never runs code) and, unlike manual
# string splitting, also handles an empty embedding ("[]") and variations in spacing.
CM_df['FullDescEmbeds'] = CM_df['FullDescEmbeds'].apply(ast.literal_eval)

# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# dictionary/model files that come from a trusted source.
with open('fitted_models/corpus_dictionary.pkl', 'rb') as file:
    corpus_dictionary = pickle.load(file)

# one fitted LDA model per number of topics, keyed by that number
lda_models = {}
for k in [5, 10, 15, 20, 25, 30]:
    with open(f'fitted_models/stm_lda_model_k{k}.pkl', 'rb') as file:
        lda_models[k] = pickle.load(file)

Below, for $k=30$, we can pull out a `dict` containing the terms associated with each topic. For each topic, the terms are sorted from the highest probability to the lowest. The `dict` has format: `topic_terms[topic_id] = [(term, probability), ...]`, where probabilities at or below `1e-2` are recorded as `0`.

In [3]:
k = 30
vocab_size = len(corpus_dictionary)

# For every topic, list all vocabulary terms as (term, probability) pairs in the
# order the model returns them; probabilities that do not exceed 1e-2 become 0.
topic_terms = {}
for topic_id in range(k):
    ranked_terms = lda_models[k].get_topic_terms(topic_id, topn=vocab_size)
    topic_terms[topic_id] = [
        (corpus_dictionary[term_id], weight if weight > 1e-2 else 0)
        for term_id, weight in ranked_terms
    ]

# print the top 5 terms for topic_id 0
print(topic_terms[0][:5])

[('inverter', 0.056787804), ('fault', 0.037120067), ('ground', 0.03252461), ('offline', 0.022443272), ('cb', 0.018650837)]


#### Investigating correlations between topics

One thing we can look at is how correlated different topics are. We can do this by getting the topic probabilities associated with each document and finding the Spearman (rank) correlation coefficient for a given pair of topics.
<br><br>
LDA assigns a nonzero probability for every topic to each document. Therefore, in the code below, probabilities at or below a cutoff value (`1e-2`) are considered to be zero.

In [24]:
# for each document, get the probability of each topic; probabilities at or
# below the cutoff (1e-2) are treated as zero
document_topic_probs = []
for doc in CM_df['FullDescEmbeds']:
    raw_probs = dict(lda_models[k].get_document_topics(doc, minimum_probability=0))

    # look every topic up with a default of 0, so each topic id has an entry even
    # if the model leaves a vanishingly unlikely topic out of its result
    kept_probs = {}
    for topic_id in range(k):
        prob = raw_probs.get(topic_id, 0)
        kept_probs[topic_id] = prob if prob > 1e-2 else 0

    # scale up so the probabilities add to one (the sum is computed once per
    # document; a document with nothing above the cutoff is left as all zeros)
    total = sum(kept_probs.values())
    document_topic_probs.append(
        {topic_id: (prob / total if total else 0) for topic_id, prob in kept_probs.items()}
    )

# for each topic, list out the probabilities for each document
topic_probs = {topic_id : [doc[topic_id] for doc in document_topic_probs] for topic_id in range(k)}

# get all the pairwise correlations between topics
topic_correlations = {}
for topic_1 in range(k):
    for topic_2 in range(topic_1 + 1, k):
        correlation = spearmanr(topic_probs[topic_1], topic_probs[topic_2]).statistic
        # the spearman coef is nan when one of the arrays is all constants; skip those
        # pairs. Testing for nan explicitly (rather than `> -1`) keeps a perfect
        # negative correlation of exactly -1 in the results.
        if pd.notna(correlation):
            topic_correlations[topic_1, topic_2] = correlation

  correlation = spearmanr(topic_probs[topic_1], topic_probs[topic_2]).statistic


We can then put the results into a `DataFrame` and look at the most highly correlated topics. The code below finds the 5 most correlated pairs of topics and prints the top 10 words for each topic, as well as their correlation.

In [25]:
# Flatten {(topic_1, topic_2): correlation} into one row per topic pair. Building the
# frame from records keeps the topic ids as integers and still works (yielding an
# empty frame) when no pair produced a valid correlation.
corr_df = pd.DataFrame(
    [(rho, topic_1, topic_2) for (topic_1, topic_2), rho in topic_correlations.items()],
    columns=['spearman', 'topic 1', 'topic 2'],
)

corr_df = corr_df.sort_values('spearman', ascending=False)

# itertuples keeps each column's own dtype, so the topic ids stay integers and can
# be used directly as keys into topic_terms (iterrows would upcast them to float)
for rho, topic_1, topic_2 in corr_df.head(5).itertuples(index=False, name=None):
    print(f'Topic {topic_1} top words:', [term[0] for term in topic_terms[topic_1][:10]])
    print(f'Topic {topic_2} top words:', [term[0] for term in topic_terms[topic_2][:10]])
    print('Correlation:', round(rho, 3))
    print()

Topic 21 top words: ['inverter', 'production', 'position', 'outage', 'found', 'offline', 'techdispatched', 'comms', 'inverters', 'closed']
Topic 22 top words: ['inverter', 'inverters', 'site', 'communicating', 'set', 'techdispatched', 'communication', 'data', 'system', 'pf']
Correlation: 0.232

Topic 14 top words: ['inverter', 'outage', 'production', 'w', 'techdispatched', 'communicating', 'c', 'stopped', 'system', 'hmi']
Topic 22 top words: ['inverter', 'inverters', 'site', 'communicating', 'set', 'techdispatched', 'communication', 'data', 'system', 'pf']
Correlation: 0.173

Topic 1 top words: ['inverters', 'faults', 'inverter', 'techdispatched', 'created', 'wo', 'outage', 'central', 'pf', 'b']
Topic 24 top words: ['inverter', 'outage', 'b', 'technician', 'site', 'reset', 'back', 'e', 'communicating', 'c']
Correlation: 0.156

Topic 7 top words: ['inverter', 'hardware', 'replaced', 'offline', 'dc', 'replacement', 'due', 'communicating', 'inverters', 'stopped']
Topic 17 top words: ['inv

In this case, the results should be taken with a grain of salt, since most documents are only assigned a single topic. However, this example illustrates the process that could be used with a larger number of documents to get more refined topics and hence better results.

#### Survival analysis

In [None]:
!pip install scikit-survival

In [49]:
from sksurv.nonparametric import kaplan_meier_estimator

In [58]:
# count how many records there are for each distinct value in the 'Asset' column
CM_df['Asset'].value_counts()

Asset
Inverter                    142
Central Inverter             17
Inverter/String Inverter      2
String Inverter               1
Inverter module               1
Name: count, dtype: int64

In [51]:
# print the signature and docstring of kaplan_meier_estimator for reference
help(kaplan_meier_estimator)

Help on function kaplan_meier_estimator in module sksurv.nonparametric:

kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None, reverse=False, conf_level=0.95, conf_type=None)
    Kaplan-Meier estimator of survival function.
    
    See [1]_ for further description.
    
    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        Contains binary event indicators.
    
    time_exit : array-like, shape = (n_samples,)
        Contains event/censoring times.
    
    time_enter : array-like, shape = (n_samples,), optional
        Contains time when each individual entered the study for
        left truncated survival data.
    
    time_min : float, optional
        Compute estimator conditional on survival at least up to
        the specified time.
    
    reverse : bool, optional, default: False
        Whether to estimate the censoring distribution.
        When there are ties between times at which events are observed,
        then events com