In [None]:
!wget https://raw.githubusercontent.com/protontypes/AwesomeCure/main/csv/projects_with_readme.csv
!pip install nltk pandas yake multi_rake keybert

In [None]:
import pandas as pd
# Show up to 200 columns, but truncate long cell text to 50 characters when displaying frames.
pd.set_option('display.max_columns',200)
pd.set_option('display.max_colwidth', 50)

import nltk
# Download the NLTK resources used by this notebook.
# 'punkt_tab' is the tokenizer data that word_tokenize loads in recent NLTK releases;
# 'punkt' is kept for older releases, so both are requested.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stopwords as a set for fast membership tests in the cleaning steps.
stpwords = set(stopwords.words('english'))

In [None]:
# Load the CSV fetched by the first cell, report its (rows, columns) shape and preview it.
csv_path = 'projects_with_readme.csv'
raw = pd.read_csv(csv_path)
print(raw.shape)
raw.head()

# 1. Goal: Reduce the list of topics below into subtopics


In [None]:
# Number of rows for each distinct value of the `rubric` column, most frequent first.
raw['rubric'].value_counts()

In [None]:
# Narrow the raw table down to the identifying and text columns used in the rest of the notebook.
text_columns = [
    'project_name',
    'oneliner',
    'rubric',
    'topics',
    'git_namespace',
    'readme_content',
]
df = raw[text_columns]
df.head()

# 2. Sample Row

In [None]:
# Print the rubric, one-liner and topics of the row labelled 0 to see what the text looks like.
for column in ('rubric', 'oneliner', 'topics'):
    print(df[column][0])

In [None]:
# Keep the text fields of the row labelled 0 in their own variables for the experiments below.
oneliner = df.loc[0, 'oneliner']
topics = df.loc[0, 'topics']
readme = df.loc[0, 'readme_content']

In [None]:
# Join the one-liner, topics and README with single spaces so that keyword extraction
# sees as much of the project's text as possible.
sample = ' '.join([oneliner, topics, readme])
sample

In [None]:
import re

# Text cleaning, applied in this order:
#   1. replace the two-character sequence backslash + "n" with a space
#   2. replace anything that looks like an HTML tag (<...>) with a space
#   3. replace every character that is not a letter, digit or whitespace with a space
#   4. lowercase the result
clean_sample1 = sample.replace('\\n', ' ')
clean_sample2 = re.sub(r'<[^>]+>', ' ', clean_sample1)
clean_sample3 = re.sub(r'[^a-zA-Z0-9\s]', ' ', clean_sample2).lower()


In [None]:
# Words that carry no topical meaning for this sample and should be dropped.
words_black_list = [
    'python', 'pvlib', 'projects', 'affiliated', 'http', 'readthedocs', 'benchmarks',
    'license', 'matlab', 'user', 'guide', 'html', 'https', 'open', 'source', 'journal',
    'latest', 'release', 'build', 'publications', 'conda', 'installed', 'google',
    'documentation', 'please',
]

# Successive filters; each intermediate list is kept so every stage can be inspected.
words_list = clean_sample3.split()                                            # tokens
words_list2 = [word for word in words_list if len(word) > 3]                  # drop short words
words_list3 = [word for word in words_list2 if word.isalpha()]                # drop tokens containing digits
words_list4 = [word for word in words_list3 if word not in words_black_list]  # drop blacklisted words
words_list5 = [word for word in words_list4 if word not in stpwords]          # drop stopwords
words_string = ' '.join(words_list5)                                          # back to one string
words_string


## Yake for keyword Extraction
* Using our sample text, we will apply Yake's keyword extraction algorithm

In [None]:
import yake

# `lan` selects YAKE's built-in stopword list for a language. The `stopwords` argument
# expects a collection of words, so passing the string 'en' there would be read as the
# two "stopwords" 'e' and 'n' instead of choosing English.
kw_extractor = yake.KeywordExtractor(lan='en', top=20)
# Each result is a (keyphrase, score) pair; print the 20 returned for the cleaned sample.
keywords = kw_extractor.extract_keywords(words_string)
for kw, v in keywords:
  print("Keyphrase: ",kw, ": score", v)


## Rake for keyword Extraction

In [None]:
from multi_rake import Rake

# Run RAKE with its default settings on the same cleaned sample and show the first ten results.
# Note: this rebinds `keywords`, replacing the YAKE results from the cell above.
rake = Rake()
keywords = rake.apply(words_string)
top_rake_keywords = keywords[:10]
print(top_rake_keywords)

## KeyBERT for keyword extraction
* Using our sample text, we will apply KeyBERT's keyword extraction algorithm

In [None]:
pip install keybert

In [None]:
from keybert import KeyBERT

In [None]:
# Load KeyBERT with the 'distilbert-base-nli-mean-tokens' embedding model and ask for
# 20 three-word keyphrases from the cleaned sample, with MMR enabled at diversity 0.9.
kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
bert_keywords = kw_model.extract_keywords(
    words_string,
    keyphrase_ngram_range=(3, 3),
    stop_words='english',
    use_mmr=True,
    diversity=0.9,
    top_n=20,
)

In [None]:
# Display the keyphrases returned by KeyBERT for the sample.
bert_keywords

# 3. Cleaning Full Dataset

In [None]:
# Fill null values with the string 'None' so the text columns can be concatenated
# (concatenating a string with a missing value would otherwise give a missing result).
# Note: rows that had missing values now carry the literal word "None" in their text.
df = df.fillna('None')

In [None]:
# Combine text columns: one-liner, topics and README joined by single spaces, row by row
combined = df['oneliner'] + ' '+ df['topics'] + ' '+ df['readme_content']

In [None]:
# Store the concatenated text as a new `combined` column and preview the frame.
df['combined'] = combined
df.head()

In [None]:
# Words to remove from column. We will keep adding to this list the more words we find that are irrelevant.
# Every entry needs its own trailing comma: two adjacent string literals are silently
# concatenated by Python ('using' 'google' -> 'usinggoogle'), which would leave both words unfiltered.
words_black_list = ['python','pvlib','projects','affiliated','http','readthedocs','benchmarks',
                    'license','matlab','user','guide','html','https','open','source','journal',
                    'latest','release','build','publications','conda','installed','users','using',
                    'google','documentation','please','github','data','model','install','code',
                    'package','badge','project']

In [None]:
def clean_text(text, blacklist=frozenset(), stop_words=frozenset()):
    """Reduce one document to a space-separated string of candidate keywords.

    Steps, in order:
      1. replace the two-character sequence backslash + "n" with a space
      2. lowercase every whitespace-separated token
      3. replace anything that looks like an HTML tag (<...>) with a space
      4. replace every character that is not a letter, digit or whitespace with a space
      5. keep only tokens that are longer than 3 characters, purely alphabetic,
         and found in neither `blacklist` nor `stop_words`

    Parameters
    ----------
    text : object
        The document; converted with ``str()`` before processing.
    blacklist, stop_words : container of str
        Tokens to drop. Sets give the fastest lookups.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single spaces.
    """
    text = str(text).replace("\\n", " ")
    text = ' '.join(token.lower() for token in text.split())
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    kept = [
        token for token in text.split()
        if len(token) > 3
        and token.isalpha()
        and token not in blacklist
        and token not in stop_words
    ]
    return ' '.join(kept)


# Create new column called "cleaned_text" by running every cleaning step in a single pass.
# The blacklist is turned into a set once so each token lookup is O(1) rather than a list scan.
blacklist_lookup = set(words_black_list)
df['cleaned_text'] = df['combined'].apply(lambda text: clean_text(text, blacklist_lookup, stpwords))



In [None]:
# Inspect the cleaned text column.
df['cleaned_text']

# 4. Unique Words

* Let's get a look at the most common words in our cleaned_text column

In [None]:
from collections import Counter

# Tokenize text
tokenized_data = [nltk.word_tokenize(text) for text in df['cleaned_text']]

# count the occurrence of each token, one Counter per row
token_counts = [Counter(tokens) for tokens in tokenized_data]

# combine the counts from all rows.
# Updating one Counter in place gives the same totals as sum(token_counts, Counter())
# without building a brand-new Counter of the whole vocabulary for every row.
combined_counts = Counter()
for row_counts in token_counts:
    combined_counts.update(row_counts)

In [None]:
# Show the 30 most common tokens across all rows as (token, count) pairs
combined_counts.most_common(30)

In [None]:
# More words to remove, extending the earlier blacklist with the frequent but
# uninformative tokens found in the counts above.
# 'scholar' is listed next to the original 'scholor' spelling so the correctly spelled
# token is filtered as well; the second 'google' entry was a duplicate and is dropped.
words_black_list = ['python','pvlib','projects','affiliated','http','readthedocs','benchmarks',
                    'license','matlab','user','guide','html','https','open','source','journal',
                    'latest','release','build','publications','conda','installed','users','using',
                    'google','documentation','please','github','data','model','install','code',
                    'package','badge','project','version','file','view','system','master','used','also',
                    'repository','example','docs','files','information','models','software','available',
                    'zenodo','command','modis','pull','request','pctl','theoj','brodiepearson','blob','getting',
                    'started','machine','learning','make','sure','pypi','ipcc','main','scholor','scholar','colorado','codecov',
                    'none','jupyter','notebook','united','states','docker','anaconda','forge','datasets','false','would',
                    'like','gustavoirgang','collab','sciencedirect','yaml','downloads','actions','workflows',
                    'media','icon','joss','papers','legend','description','jobs','download','input','output','latitude','longitude',
                    'unit','tests','cran','nbsp','colab','check','plot','ncss','contributing','installation', 'instructions',
                    'wiki','wikipedia','feel','free','name','list','issues','start','examples','index','dataset','branch','create',
                    'library','following','test','running','import','database','access','packages','directory','need','change','tools']


# Second blacklist pass over the already-cleaned text.
# The list is converted to a set once so every token lookup is O(1) instead of a scan
# through the whole blacklist; the surviving tokens are identical.
blacklist_lookup = set(words_black_list)
df['cleaned_text'] = df['cleaned_text'].apply(
    lambda text: ' '.join(word for word in str(text).split() if word not in blacklist_lookup)
) # Remove words from blacklist


# 5. Unique Bigrams and Trigrams

* We should also look at the most common bigrams and trigrams

In [None]:
# Preview each row's cleaned text split into a list of tokens.
df['cleaned_text'].str.split()

In [None]:
# Create list of words in the order in which they originally appear.
# The per-row token lists are flattened directly; this yields the same list as
# .apply(pd.Series).stack() without first materialising a frame that is
# (number of rows) x (length of the longest document) wide.
words = [word for tokens in df['cleaned_text'].str.split().dropna() for word in tokens]

In [None]:
# Total number of tokens across all rows.
len(words)

In [None]:
# Most frequent bigrams.
# Bigrams are generated row by row so that the last word of one project's text is never
# paired with the first word of the next project's text (which a single flattened word
# list would do, producing pairs that occur in no document).
bigram_list = [
    bigram
    for tokens in df['cleaned_text'].str.split().dropna()
    for bigram in nltk.ngrams(tokens, 2)
]
bigrams = pd.Series(bigram_list, dtype=object).value_counts().to_frame('count')

In [None]:
# Show the 30 most frequent bigrams with their counts.
bigrams.head(30)

In [None]:
# Most frequent trigrams.
# Trigrams are generated row by row so that no trigram spans the boundary between two
# projects' texts (a single flattened word list would create such cross-document triples).
trigram_list = [
    trigram
    for tokens in df['cleaned_text'].str.split().dropna()
    for trigram in nltk.ngrams(tokens, 3)
]
pd.Series(trigram_list, dtype=object).value_counts().head(30)

# 6. Further Work

* Further reduce the number of unique keywords by removing irrelevant words
* Create a list of topics/subtopics that we want
* Define features (words, bigrams, trigrams)
* Use cosine similarity to find similar projects
