# Open Sustainable Technology Topic Modeling

This is a prototype notebook for experimenting with the Open Sustainable Technology dataset in combination with natural language processing. The first goal is to identify and cluster topics within open source projects in the field of environmental sustainability.

TODO:
* Create Topic Blacklist
* Hierarchical Topic Modeling: https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html

Tips and tricks from the BERTopic maintainer, Maarten:
* Split README into sentences --> Topic Modeling on every sentence.
* Just use the first two paragraphs of the README, which is the actual description of the project.
* Remove special characters / HTML / badges during preprocessing
* Representation Model to have "better" words
* Example to use: https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#keybertinspired --> 10 words per topic -->
* Parse -->GPT4



In [None]:
!pip install bertopic keybert pandas
!wget https://raw.githubusercontent.com/protontypes/AwesomeCure/main/csv/projects_with_readmes.csv

## Reconfigure CSV Reading

In [None]:
import pandas as pd
import sys
import csv

# Raise the csv module's field size limit as far as csv.field_size_limit
# accepts: start at sys.maxsize and retry with a tenfold smaller value each
# time the call raises OverflowError.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
    else:
        break

# Load the project table; the Python parser engine is requested explicitly.
df = pd.read_csv("projects_with_readmes.csv", engine="python")

In [None]:
from torch.utils.data import dataloader
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')

# Split every README into sentences and record, for each sentence, the
# position of the README it came from.
# Missing READMEs are replaced by empty strings first; otherwise str(NaN)
# would feed the literal sentence "nan" into the topic model.
readmes = [str(doc) for doc in df.readme_content.fillna("").tolist()]

sentences_cleaned = []
sentence_rows = []  # (sentence, README position) pairs; one DataFrame is built from them below
for index, readme in enumerate(readmes):
    # Strip HTML markup, then drop literal backslash-n sequences.
    # NOTE(review): r"\n" is the two characters backslash + "n", not a real
    # newline - confirm that this is how line breaks are stored in the CSV.
    text = BeautifulSoup(readme, "lxml").text.replace(r"\n", "")
    # Collapse runs of spaces before tokenizing into sentences.
    sents = sent_tokenize(re.sub(' +', ' ', text))
    sentences_cleaned.extend(sents)
    sentence_rows.extend((sentence, index) for sentence in sents)

# Build the frame in a single step: growing a DataFrame one row at a time via
# .loc gets slower with every row and dominates the runtime on large inputs.
# Rows are in the same order as `sentences_cleaned`.
# NOTE: despite its name, the "readme_content" column holds the README's
# position in `readmes`, not the README text.
data = pd.DataFrame(sentence_rows, columns=["Sentence", "readme_content"])

In [None]:
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic

# Representation model that BERTopic applies on top of its default pipeline
representation_model = KeyBERTInspired()

# Build the model first, then fit it on the cleaned sentences
topic_model = BERTopic(
    representation_model=representation_model,
    verbose=True,
)
topic_model.fit(sentences_cleaned)

# Store the fitted model's topic assignments next to the sentences in `data`
data["Topic"] = topic_model.topics_

In [None]:
# Display the first 20 rows of the fitted model's topic info table.
topic_model.get_topic_info().head(20)

In [None]:
# # Calculate the topic distributions on a token-level
# topic_distr, topic_token_distr = topic_model.approximate_distribution(docs[:2], calculate_tokens=True)

# # Visualize the token-level distributions
# df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])
# df


In [None]:
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Stand-alone demo on the 20 Newsgroups corpus.
# NOTE: this cell rebinds the notebook-level names `docs` and `topic_model`,
# so the model fitted on the README sentences is no longer reachable through
# `topic_model` once it has run.

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic on the precomputed embeddings
topic_model = BERTopic().fit(docs, embeddings)

# Run the visualization with the original embeddings.
# A notebook only renders the value of a cell's last expression, so this
# figure has to be shown explicitly or it is computed and then discarded.
fig_full = topic_model.visualize_documents(docs, embeddings=embeddings)
fig_full.show()

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
fig_reduced = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig_reduced.show()


## Preprocessing and Cleanup of dataset

In [None]:
import numpy as np

# Positive infinity is mapped to NaN so the fillna calls below catch it too
df = df.replace(np.inf, np.nan)

# Missing values become empty strings in the README and the topics columns
df_readme = df["readme"].fillna("")
df_topics = df["topics"].fillna("")

# Plain Python lists for the downstream models
docs = df_readme.tolist()
supervised_topics = df_topics.tolist()

# Hook for working on a subset to speed up processing; at the moment this is
# the full document list (same list object, no slicing applied)
docs_subset = docs

In [None]:
# Flatten the comma-separated, user-defined topic strings into one list of
# individual topics; empty strings left over from the split are skipped.
supervised_topics_flat = [
    topic
    for topic_string in supervised_topics
    for topic in topic_string.split(",")
    if topic
]

In [None]:
# Display the flattened list of user-defined topics for inspection.
supervised_topics_flat

In [None]:
# Blacklisted topics: any entry of the flat topic list that exactly matches
# one of these strings is dropped.
topic_black_list = [
    "install",
    "data",
    "python",
    "using",
    "use",
]
# Set copy of the blacklist, used for the membership test in the filter below
topic_black_list_set = set(topic_black_list)
supervised_topics_flat_cleaned = [
    topic for topic in supervised_topics_flat if topic not in topic_black_list_set
]

In [None]:
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer


# Extract keywords from all documents with KeyBERT
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(docs)

# Collect the first element of every extracted item (the keyword itself),
# de-duplicate through a set, then make sure every entry is a str.
unique_keywords = {k[0] for keyword in keywords for k in keyword}
vocabulary = [str(entry) for entry in unique_keywords]

# Count vectorizer restricted to that vocabulary, with English stop words.
# NOTE(review): the BERTopic(...) call visible later in this notebook is not
# given vectorizer_model - confirm whether it was meant to be passed there.
vectorizer_model = CountVectorizer(
    vocabulary=vocabulary,
    stop_words="english",
)


In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

# c-TF-IDF transformer with BM25 weighting switched on
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

# Topic model using that transformer and a minimum topic size of 10
topic_model = BERTopic(
    min_topic_size=10,
    ctfidf_model=ctfidf_model,
)

# Fit on the README documents; both return values of fit_transform are kept
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Display the topic info table of the model fitted on the README documents.
topic_model.get_topic_info()

In [None]:
# Ask the fitted model for its document info on `docs_subset` and keep the result.
subset_training = topic_model.get_document_info(docs_subset)

In [None]:
# Display the document info collected in the previous cell.
subset_training