In [1]:
from pypdf import PdfReader

In [2]:
# Open the source PDF; the path is relative to the notebook's working directory.
pdf_reader = PdfReader('./Civics.pdf')

In [3]:
# Collect one record per PDF page: its zero-based index and its extracted text.
data = [
    {'page': page_idx, 'content': pdf_page.extract_text()}
    for page_idx, pdf_page in enumerate(pdf_reader.pages)
]

In [4]:
# Discard the first six page records; everything from index 6 onward is kept.
# NOTE(review): presumably front matter (cover, contents) — confirm the offset.
data = data[6:]

In [5]:
import pandas as pd

In [6]:
# One row per retained page record, with columns 'page' and 'content'.
df = pd.DataFrame(data)

In [7]:
import nltk
import string
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [8]:
# Add a lower-cased copy of the page text as the 'text_lower' column.
df = df.assign(text_lower=df['content'].str.lower())

In [44]:
PUNCT_TO_REMOVE = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``PUNCT_TO_REMOVE`` deleted.

    ``PUNCT_TO_REMOVE`` is ``string.punctuation``, so only those characters are
    stripped; anything else (letters, digits, whitespace, other symbols) is
    passed through unchanged.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

# Strip punctuation from the lower-cased text; the function is passed directly, no wrapper needed.
df['text_wo_punct'] = df['text_lower'].apply(remove_punctuation)


In [None]:
from nltk.corpus import stopwords

# The stop-word corpus is a separate NLTK data package; on an environment that
# has never fetched it, reading the corpus raises LookupError. Fetch it on
# demand so the notebook runs from a clean setup without re-downloading when
# the data is already present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Show the English stop-word list as one comma-separated string.
", ".join(stopwords.words('english'))

In [None]:
# Stored as a set so the per-word membership test in remove_stopwords is a hash lookup.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text, stop_words=None):
    """Drop stop words from ``text`` and re-join the survivors with single spaces.

    Parameters
    ----------
    text : object
        Input text; it is coerced with ``str()`` and split on whitespace.
    stop_words : collection of str, optional
        Words to remove. Defaults to the module-level ``STOPWORDS`` set, which
        preserves the original one-argument behaviour.

    Returns
    -------
    str
        The remaining words joined by a single space.
    """
    # Resolve the default at call time so the global can be (re)defined after this cell runs.
    excluded = STOPWORDS if stop_words is None else stop_words
    return " ".join(word for word in str(text).split() if word not in excluded)

In [None]:
# Filter stop words out of the punctuation-free text, then peek at the last rows.
df['text_wo_stop'] = df['text_wo_punct'].apply(remove_stopwords)
df.tail()

In [None]:
from collections import Counter

# Tally every whitespace-separated token across all stop-word-free pages.
cnt = Counter(
    token
    for page_text in df['text_wo_stop'].values
    for token in page_text.split()
)
cnt.most_common(10)

In [None]:
# The ten most frequent tokens, kept as a set for fast membership tests.
FREQ_WORDS = {word for word, _count in cnt.most_common(10)}

def remove_freqwords(text, freq_words=None):
    """Drop the most frequent words from ``text`` and re-join the rest with single spaces.

    Parameters
    ----------
    text : object
        Input text; it is coerced with ``str()`` and split on whitespace.
    freq_words : collection of str, optional
        Words to remove. Defaults to the module-level ``FREQ_WORDS`` set, which
        preserves the original one-argument behaviour.

    Returns
    -------
    str
        The remaining words joined by a single space.
    """
    # Resolve the default at call time so the global can be (re)defined after this cell runs.
    excluded = FREQ_WORDS if freq_words is None else freq_words
    return ' '.join(word for word in str(text).split() if word not in excluded)
df['text_wo_stopfreq'] = df['text_wo_stop'].apply(remove_freqwords)
# Preview the row labelled 1, minus its final character.
df.loc[1, 'text_wo_stopfreq'][:-1]

In [None]:
# Remove the intermediate stop-word column. 'text_wo_punct' is deliberately kept:
# the later cell that builds `documents` / `metadatas` reads df['text_wo_punct'],
# so dropping it here makes a top-to-bottom run fail with KeyError.
# Reassigning (no inplace) with errors='ignore' also keeps this cell safe to re-run.
df = df.drop(columns=['text_wo_stop'], errors='ignore')


In [None]:
n_rare_words = 10
# Walk the frequency ranking backwards from its end to pick the least common tokens.
RARE_WORDS = {word for word, _count in cnt.most_common()[:-n_rare_words - 1:-1]}
RARE_WORDS

In [None]:
def remove_rarewords(text, rare_words=None):
    """Drop the rarest words from ``text`` and re-join the rest with single spaces.

    Parameters
    ----------
    text : object
        Input text; it is coerced with ``str()`` and split on whitespace.
    rare_words : collection of str, optional
        Words to remove. Defaults to the module-level ``RARE_WORDS`` set, which
        preserves the original one-argument behaviour.

    Returns
    -------
    str
        The remaining words joined by a single space.
    """
    # Resolve the default at call time so the global can be (re)defined after this cell runs.
    excluded = RARE_WORDS if rare_words is None else rare_words
    return ' '.join(word for word in str(text).split() if word not in excluded)

In [None]:
# Apply the rare-word filter to the frequency-filtered text and preview the first rows.
df['text_wo_stopfreqrare'] = df['text_wo_stopfreq'].apply(remove_rarewords)
df.head()

In [None]:
from nltk.stem.porter import PorterStemmer
# Show the cleaned text of the row labelled 3 in full.
df.loc[3, 'text_wo_stopfreqrare']

In [None]:
# Show the cleaned text of the row labelled 1, skipping its first 20 characters.
df.loc[1, 'text_wo_stopfreqrare'][20:]

In [None]:
# Display the fully cleaned text column.
df['text_wo_stopfreqrare']

In [10]:
# Unit numbers 1 through 11 (the upper bound of range is exclusive).
units = list(range(1, 12))
units

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [11]:
# Chapter titles in unit order; position k holds the title for unit k + 1.
chapter_titles = [
    'Building a democratic system',
    'Rule of Law',
    'Equality',
    'Justice',
    'Patriotism',
    'Responsibility',
    'Industriousness',
    'Self Reliance',
    'Saving',
    'Active community participation',
    'The pursuit of wisdom',
]

In [12]:
# Key each chapter title by its unit number rendered as a string.
chapters = dict(zip(map(str, units), chapter_titles))
chapters

{'1': 'Building a democratic system',
 '2': 'Rule of Law',
 '3': 'Equality',
 '4': 'Justice',
 '5': 'Patriotism',
 '6': 'Responsibility',
 '7': 'Industriousness',
 '8': 'Self Reliance',
 '9': 'Saving',
 '10': 'Active community participation',
 '11': 'The pursuit of wisdom'}

In [13]:
import chromadb
import sentence_transformers

In [14]:
from sentence_transformers import SentenceTransformer, util

In [15]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
# NOTE(review): `model` is not referenced by any later cell visible here — confirm it is still needed.
model = SentenceTransformer('all-MiniLM-L6-v2')

2023-09-19 11:54:24.692070: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [45]:
import re

# Each page's text is expected to open with its unit label ("unit 4", "unit 10", ...).
# The label's length depends on the unit number, so match it instead of assuming
# six characters: a fixed [:6] split turns "unit 10" into 'unit 1' and leaks the
# trailing digit into the document text.
UNIT_LABEL_RE = re.compile(r'unit \d+')

documents = []
metadatas = []
ids = []

for i, page_text in enumerate(df['text_wo_punct']):
    label_match = UNIT_LABEL_RE.match(page_text)
    # Fall back to the original six-character split when the page does not start with a unit label.
    split_at = label_match.end() if label_match else 6
    documents.append(page_text[split_at:])
    # NOTE(review): 'page' (i + 1) and the id (i + 3) use different offsets, and neither
    # equals the PDF index stored in df['page'] — confirm which numbering is intended.
    metadatas.append({'unit': page_text[:split_at], 'page': i + 1})
    ids.append(str(i + 3))


In [46]:
# NOTE(review): this echoes the entire list of page texts; a slice such as documents[:1] would keep the output short.
documents

[' — building a democratic system lesson 1  \n civics and ethical education — grade 10 student textbook square6 3by the end of this lesson you should be able to\n circle6 d escri b e  th e  m a i n  pri nc i p l es  a n d  va l u es  o f  a  \ndemocratic system\n circle6 participate in the democratic process of your \ncountry within the limit of your capacity\ndid you see any examples of democracy in action during your vacation tell the class about \nthem\nin grade 9 you learnt about ancient and modern \nforms of democracy the first form of democracy was direct democracy while the modern form is indirect democracy in this lesson you will learn about the essence of democracy and the main principles and values of a democratic system this will help you to participate in and contribute to the democratic process in your country today the most widely practised form of democracy throughout the world is indirect democracy when you reach the age to enjoy the right to vote you will elect represe

In [25]:
# Create a Chroma client with default settings.
# NOTE(review): presumably in-memory / non-persistent, so collections vanish with the kernel —
# confirm for the installed chromadb version.
client = chromadb.Client()

In [49]:
# get_or_create_collection + upsert make this cell safe to re-run against the same
# client: create_collection fails once the name exists, and upsert overwrites
# entries whose ids are already stored instead of inserting them a second time.
# No embedding function is passed here, so the collection falls back to Chroma's
# default embedder rather than the `model` loaded in an earlier cell.
vector_coll = client.get_or_create_collection('civics-latest')

vector_coll.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [52]:
results = vector_coll.query(
    query_texts=['justice'],
    n_results=5
)
# `results` is a dict of parallel lists with one inner list per query text.
# Iterate over the hits of the first (and only) query; len(results) would count
# the dict's keys, not the number of hits returned.
hits = zip(results['ids'][0], results['metadatas'][0], results['documents'][0])
for hit_id, hit_metadata, hit_document in hits:
    print('Ids:' , hit_id)
    print('Metadatas:', hit_metadata)
    print('Description:', hit_document)
    print('\n')

Ids: 67
Metadatas: {'page': 65, 'unit': 'unit 4'}
Description:  — justice lesson 3
categories of justice
civics and ethical education — grade 10 student textbook square6 67
the federal supreme court is one of
the instruments of justice
 another type of justice is corrective justice 
as the name implies corrective justice deals with correcting unjust acts correcting wrongdoings i n c l u d e s  t w o  o p t i o n s   f i r s t  o f  a l l   i t  i s  i m p o r t a n t  to take back the situation to what it was before secondly the wronged person or group should be compensated for the wrong done  this type of justice is important to remove the 
feeling of vengeance from the wronged person in the absence of corrective justice people who have been wronged feel bad and start thinking of ways to correct it themselves some people may become violent taking matters into their own hands this is not a good way of correcting wrongdoings  democracy as a system suffers when people take justice into t