<a href="https://colab.research.google.com/github/raz0208/Natural-Language-Processing-Practices/blob/main/TopicModelling/NLP_TopicsModellingPractices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modelling Practices in NLP


1. Semantic Signal Separation
2. KeyNMF
3. ClusteringTopicModel



### 1. Semantic Signal Separation

In [None]:
!pip install -q transformers datasets huggingface_hub

In [None]:
from huggingface_hub import login



In [None]:
# Install turftopic libraries
!pip install turftopic
!pip install datasets

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from turftopic import SemanticSignalSeparation
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

In [2]:
# Load the training CSV and keep its `statement` column as the corpus.
ds = pd.read_csv("political_ideologies_train.csv")
texts = ds.loc[:, "statement"]

# Preview the corpus (the printed Series is truncated by pandas).
print(texts)

0       Climate change, and the escalating environment...
1       I believe in the foundational importance of th...
2       I firmly believe that the principle of separat...
3       I firmly believe in the separation of church a...
4       I firmly believe in the power of free markets ...
                              ...                        
2555    I believe in the power of free markets to driv...
2556    I firmly believe in the traditional family str...
2557    Every individual, regardless of their gender, ...
2558    I firmly believe in the significance of religi...
2559    I firmly believe in the principle of individua...
Name: statement, Length: 2560, dtype: object


In [3]:
# Embed every statement once; the topic model below receives these embeddings.
encoder = SentenceTransformer("paraphrase-MiniLM-L12-v2")
# Pass a plain list: `texts` is a pandas Series, and integer lookups on a Series
# go through its index labels, which only coincide with positions while the
# index is the default RangeIndex.
embeddings = encoder.encode(list(texts), show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/80 [00:00<?, ?it/s]

In [4]:
# Inspect the embedding matrix; the printed output elides the middle rows and columns.
print(embeddings)

[[-0.20092632  0.33961082  0.02055947 ... -0.14055775 -0.09066175
  -0.05947793]
 [-0.05251725  0.2111203  -0.2350792  ... -0.19393641  0.06882618
  -0.02158776]
 [-0.06461112  0.07165432 -0.08635489 ... -0.05988945  0.02278206
   0.0677686 ]
 ...
 [ 0.2636646   0.07064892 -0.10288888 ...  0.00702172  0.0945406
  -0.02747649]
 [ 0.2114343   0.13071813 -0.26949868 ... -0.00532549 -0.03988217
  -0.08325328]
 [-0.3067356   0.26985478 -0.18770279 ... -0.16361783  0.10027055
  -0.09515021]]


In [5]:
# Fit a 4-component Semantic Signal Separation model on the statements,
# handing it the embeddings that were computed with the same encoder.
model = SemanticSignalSeparation(
    n_components=4,
    encoder=encoder,
    random_state=42,
)
doc_topic_matrix = model.fit_transform(texts, embeddings=embeddings)

Output()

In [6]:
# Print the fitted topics with top_k=10 (presumably the number of terms shown per topic).
model.print_topics(top_k=10)

In [7]:
# Draw the concept compass for topic indices 0 and 1; as the cell's last
# expression, the returned object is displayed by the notebook.
model.plot_concept_compass(0, 1)

In [8]:
# Attach human-readable labels to topic IDs 0-3; list order defines the ID.
model.rename_topics(
    dict(
        enumerate(
            [
                "Religiosity",
                "Economic vs Social",
                "Environmentalism",
                "Personality",
            ]
        )
    )
)

In [9]:
# Print the topic distribution for a single unseen statement.
model.print_topic_distribution(
    "I am a socialist and I am concerned with the growing inequality in our societies. "
    "I'd like to see governments do more to prevent the exploitation of workers."
)

In [10]:
import plotly.express as px

# One row per document, one column per (renamed) topic, plus a `party` column
# derived from the dataset label: 1 -> "Liberal", anything else -> "Conservative".
df = pd.DataFrame(doc_topic_matrix, columns=model.topic_names).assign(
    party=np.where(ds["label"] == 1, "Liberal", "Conservative")
)

# Pairwise scatter plots of the topic dimensions, coloured by party; the
# diagonal and upper triangle are hidden and markers are made translucent.
fig = px.scatter_matrix(
    df,
    dimensions=model.topic_names,
    color="party",
    template="plotly_white",
    width=1000,
    height=1100,
).update_traces(
    diagonal_visible=False,
    showupperhalf=False,
    marker=dict(opacity=0.6),
)
fig.show()

### 2. Clustering Analysis
#### Building a Taxonomy of Machine Learning Papers

In [None]:
!pip install -U datasets

In [None]:
!pip install turftopic[umap-learn]
!pip install turftopic[datamapplot]

In [None]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
#from turftopic import SemanticSignalSeparation
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

In [None]:
from datasets import load_dataset

# Pull the train split and keep a seeded subsample: with `test_size=10_000`,
# the "test" part of the split is the subsample that is kept.
ds = load_dataset("CShorten/ML-ArXiv-Papers", split="train").train_test_split(
    seed=42, test_size=10_000
)["test"]
abstracts = ds["abstract"]
# NOTE(review): a later cell rebinds `ds` via `pd.read_csv(...)`, and the Top2Vec
# cell consumes `abstract`, not `abstracts` — confirm this cell is still needed.

In [None]:
# Embed the subsampled abstracts.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Materialise the column as a plain list of strings, which is the input type
# `encode` documents, instead of relying on whatever container
# `ds["abstract"]` happens to return.
embeddings = encoder.encode(list(abstracts), show_progress_bar=True)
# NOTE(review): `embeddings` is assigned again by a later encode cell — confirm
# which of the two results the Top2Vec cell is meant to use.

In [None]:
# Read the ML-ArXiv-Papers CSV through the `hf://` path into a DataFrame.
# NOTE(review): this rebinds `ds`, replacing whatever an earlier cell stored under that name.
ds = pd.read_csv('hf://datasets/CShorten/ML-ArXiv-Papers/ML-Arxiv-Papers.csv')

In [None]:
# Keep only the first 1000 rows, then display the resulting frame.
ds = ds.head(1000)
ds

In [None]:
# Select the `abstract` column as the corpus and display it.
# Note the singular name: an earlier cell defines a separate `abstracts` variable.
abstract = ds["abstract"]
abstract

In [None]:
# Embed the selected abstracts; the Top2Vec cell receives these embeddings.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Pass a plain list: `abstract` is a pandas Series, and integer lookups on a
# Series go through its index labels, which only coincide with positions while
# the index is the default RangeIndex.
embeddings = encoder.encode(list(abstract), show_progress_bar=True)

In [None]:
from turftopic import Top2Vec

# Build a Top2Vec model around the encoder with a fixed seed, then prepare topic
# data from the abstracts, passing the embeddings already held in `embeddings`.
# This rebinds `model`, which an earlier cell used for a different model.
model = Top2Vec(encoder=encoder, random_state=42)
topic_data = model.prepare_topic_data(abstract, embeddings=embeddings)

In [None]:
# Print the topics of the Top2Vec model using the method's default settings.
model.print_topics()

In [None]:
# Reduce the model to 25 topics, then print the result of `hierarchy.cut(3)`.
# NOTE(review): presumably `cut(3)` limits the hierarchy to 3 levels — confirm against the Turftopic docs.
model.reduce_topics(n_reduce_to=25)
print(model.hierarchy.cut(3))

In [None]:
# Iterate over `model.hierarchy` and print each item it yields.
for topic_id in model.hierarchy:
    print(topic_id)  # debugging alternative: print(topic_id, type(topic_id))

In [None]:
# Build the tree figure for the topic hierarchy and show it.
fig = model.hierarchy.plot_tree()
fig.show()

In [None]:
# We will reset the hierarchy, so that we can see all topics at once.
model.reset_topics()
# Plot the clusters with the `title` column as hover text. `ds` must be the same
# frame the abstracts were taken from so that titles line up with documents.
fig = model.plot_clusters_datamapplot(hover_text=ds["title"])
fig.show()