## Import Library

In [21]:
import pandas as pd
!pip install rake-nltk
!pip install sentence-transformers
import nltk
nltk.download('stopwords')
from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Processing

In [2]:
paragraphs = pd.read_json("/content/drive/MyDrive/vip_project_spring_2023/data/textbook_df.json")

In [3]:
length = len(paragraphs)

In [4]:
sections = {i for i in paragraphs["Chapter.Section"]}
sections = [s for s in sections]
sections.sort()

In [5]:
def to_string(section):
  section_text = [paragraphs["Textbook_Data"][i] for i in range(length) if paragraphs["Chapter.Section"][i] == section]
  section_string = ''
  for i in section_text:
    section_string += ' ' + str(i)
  return section_string

Try get keywords for chapter 5

In [6]:
chapter_num = 5
para_list = []
for i in sections:
  if i >=5 and i < 6:
    para_list.append(i)
para_list

[5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9]

In [7]:
chapter = ''
for i in para_list:
  chapter += to_string(i) 

## Extract Keywords using Sentence Transformer

In [8]:
from rake_nltk import Rake
rake_nltk_var = Rake()

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([chapter])
candidates = count.get_feature_names_out()
# candidates

In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([chapter])
candidate_embeddings = model.encode(candidates)

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Top 5 candidates for chapter 5

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [(candidates[index], distances[0][index]) for index in distances.argsort()[0][-top_n:]]
keywords
# distances[0][distances.argsort()[0][-top_n:]]

[('equations', 0.3540721),
 ('algorithm', 0.36547846),
 ('filtering', 0.3665013),
 ('polynomials', 0.4075476),
 ('algorithms', 0.41442192)]

In [12]:
def create_sec_list(chapter_n):
  section_list = []
  for i in sections:
    if i >=chapter_n and i < chapter_n+1:
      section_list.append(i)
  return section_list

In [13]:
def create_chapter(chapter_n):
  chapter = ''
  sec_list = create_sec_list(chapter_n)
  for i in sec_list:
    chapter += to_string(i) 
  return chapter

In [14]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # Get top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [16]:
print(max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=10, nr_candidates=20))

['mathematical', 'multipliers', 'algebraic', 'microprocessors', 'multiplications', 'reconsider', 'graph', 'sampling', 'polynomial', 'filters']


## Obtain the top 10 keywords for each sub_chapters

In [17]:
stop_words = "english"
top_n = 10
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
def keyword_extraction(chapter):
  n_gram_range = (1, 1)
  # Extract candidate words/phrases
  count_v = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([chapter])
  ch_candidates = count_v.get_feature_names_out()
  docs_embedding = model.encode([chapter])
  candidate_embedding = model.encode(ch_candidates)
  # distances = cosine_similarity(doc_embedding, candidate_embeddings)
  keywords = max_sum_sim(docs_embedding, candidate_embedding, ch_candidates, top_n=10, nr_candidates=20)
  # keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
  return keywords

In [22]:
keywords_sub_chap = {}
for i in sections:
  chapter = to_string(i)
  keywords_sub_chap[str(i)] = keyword_extraction(chapter)
keywords_sub_chap 

{'-2.7': ['lab',
  'economists',
  'con2dis',
  'polynomials',
  'microprocessor',
  'classes',
  'algebraic',
  'integers',
  'microprocessors',
  'homework'],
 '-2.6': ['sinusoids',
  'exponential',
  'commutative',
  'mathematical',
  'sinusoidal',
  'hundreds',
  'polynomial',
  'arithmetic',
  'multiplications',
  'integers'],
 '-2.5': ['designing',
  'tools',
  'computing',
  'equations',
  'polynomial',
  'compilers',
  'integers',
  'algebraic',
  'arithmetic',
  'mathematical'],
 '-2.4': ['optimized',
  'pairwise',
  'homework',
  'practical',
  'tools',
  'polynomials',
  'multiplications',
  'polynomial',
  'integers',
  'mathematical'],
 '-2.3': ['economists',
  'computing',
  'transform',
  'superimpose',
  'sinusoidal',
  'integers',
  'arithmetic',
  'exponential',
  'polynomial',
  'mathematical'],
 '-2.2': ['complex',
  'sinusoidal',
  'shading',
  'equations',
  'tools',
  'algorithms',
  'polynomial',
  'arithmetic',
  'integers',
  'mathematical'],
 '-2.1': ['econom

In [23]:
from google.colab import files
k_s = [(k, v) for k, v in keywords_sub_chap.items()]
df = pd.DataFrame(k_s, columns=["sub_chapter",'keywords'])
df.to_csv('keywords_para.csv')
files.download('keywords_para.csv')