<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/EmbeddingsAnalysis_TopicModelling_TTDefault.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling

## Semantic Signal Separation

In [None]:
!pip install turftopic

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# import model and turftopic libraries
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from turftopic import SemanticSignalSeparation
import plotly.express as px

In [10]:
# Read and load dataset
dataset = pd.read_csv('gdb_abstract.csv')

# Show the datasets
### Abstract Embeddings Sample Dataset
print('Node Content:', dataset.shape)
print(dataset)

Node Content: (1000, 4)
     Unnamed: 0                                              title  \
0             0  Phenotypic variability of Niemann-Pick disease...   
1             1  Recurrent hypoglycemia secondary to metformin ...   
2             2  Adaptation of the Ambulatory and Home Care Rec...   
3             3  Multidimensional family therapy in adolescents...   
4             4  Balanced crystalloids versus isotonic saline i...   
..          ...                                                ...   
995         995  Molecular Sex Identification in Dioecious  Hip...   
996         996  Antimicrobial Peptides: Powerful Biorecognitio...   
997         997  Analysis of Metabolites in White Flowers of  M...   
998         998  Improved Cold Tolerance of Mango Fruit with En...   
999         999  Characterization of the Complete Chloroplast G...   

                                              abstract  year  
0    Background Niemann-Pick disease type C (NPC) i...  2018  
1    Back

In [11]:
# Extract only the 'abstract' column and drop others
abstracts = dataset['abstract'].dropna().reset_index(drop=True)

# Display a few samples to verify
print(abstracts)

0      Background Niemann-Pick disease type C (NPC) i...
1      Background Metformin toxicity is well known to...
2      Background Measuring service use and costs is ...
3      Background Substance use and delinquency are c...
4      Objectives Intravenous fluids are one of the m...
                             ...                        
995    The dioecious property of the sea buckthorn ( ...
996    Bacterial infections represent a serious threa...
997    A total of seven phenolics and 44 metabolites ...
998    Red fruits were suggested to be tolerant to co...
999    Buddleja colvilei Hook.f. & Thomson (Scrophula...
Name: abstract, Length: 1000, dtype: object


In [None]:
encoder = SentenceTransformer("paraphrase-MiniLM-L12-v2")
embeddings = encoder.encode(abstracts, show_progress_bar=True)

In [12]:
embeddings

array([[-0.18085335, -0.02877346, -0.01457151, ...,  0.25280115,
         0.00423292, -0.11268158],
       [-0.09924358, -0.01678822, -0.02370374, ...,  0.08323223,
         0.04386293, -0.17668416],
       [ 0.07698026, -0.10633833, -0.03089516, ...,  0.23365116,
        -0.14438123, -0.07243016],
       ...,
       [-0.05083593,  0.02510994, -0.15768604, ...,  0.1525963 ,
         0.23367059, -0.03094891],
       [-0.03681076,  0.01801717, -0.19822934, ..., -0.04061324,
        -0.11218578,  0.22541872],
       [-0.15221222,  0.26953474, -0.1986737 , ...,  0.03437556,
         0.07468171,  0.06973902]], dtype=float32)

In [None]:
model = SemanticSignalSeparation(4, encoder=encoder, random_state=42)
doc_topic_matrix = model.fit_transform(abstracts, embeddings=embeddings)

In [13]:
model.print_topics(top_k=10)

In [14]:
model.plot_concept_compass(0, 1)

In [15]:
model.rename_topics({
    0: "Topic0",
    1: "Topic1",
    2: "Topic2",
    3: "Topic4",
})

In [16]:
model.print_topic_distribution("I am a socialist and I am concerned with the growing inequality in our societies. I'd like to see governments do more to prevent the exploitation of workers.")

In [17]:
df = pd.DataFrame(doc_topic_matrix, columns=model.topic_names)

fig = px.scatter_matrix(df, dimensions=model.topic_names, color="Topic0", template="plotly_white")
fig = fig.update_traces(diagonal_visible=False, showupperhalf=False, marker=dict(opacity=0.6))
fig.show()