In [None]:
from pypdf import PdfReader

In [None]:
# Location of the source PDF, given relative to the current working directory.
PDF_PATH = './Civics.pdf'
pdf_reader = PdfReader(PDF_PATH)

In [None]:
# One record per PDF page: its zero-based position in the reader plus the
# text extracted from that page.
data = [
    {'page': page_index, 'content': pdf_page.extract_text()}
    for page_index, pdf_page in enumerate(pdf_reader.pages)
]

In [None]:
# Number of leading PDF pages to leave out of the analysis.
# NOTE(review): presumably these are front-matter pages - confirm against the PDF.
N_SKIPPED_PAGES = 6

# Filter on the stored page index rather than slicing by position: a positional
# slice removes another N_SKIPPED_PAGES records every time the cell is re-run,
# while this filter yields the same list no matter how often it is executed.
data = [record for record in data if record['page'] >= N_SKIPPED_PAGES]

In [None]:
import pandas as pd

In [None]:
# Tabular view of the page records: one row per dict in `data`,
# one column per dict key ('page', 'content').
df = pd.DataFrame.from_records(data)

In [None]:
import nltk
import string
# Turn off pandas' chained-assignment check for the whole session.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Lower-case copy of the page text, stored next to the original content.
lowercase_content = df['content'].str.lower()
df['text_lower'] = lowercase_content

In [None]:
PUNCT_TO_REMOVE = string.punctuation

# Deletion table for str.translate, built once here instead of on every call
# (the function is applied to each row of the frame).
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return ``text`` with every character in PUNCT_TO_REMOVE deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string with the punctuation characters removed; all other
        characters, including whitespace, are kept unchanged.
    """
    return text.translate(_PUNCT_TABLE)

# Punctuation-free version of the lower-cased text, one string per row.
df['text_wo_punct'] = df['text_lower'].apply(remove_punctuation)


In [None]:
from nltk.corpus import stopwords

# The stopword corpus is a separate download from the nltk package itself.
# Fetch it only when it is missing so the notebook runs top to bottom on a
# fresh environment without re-downloading on every run.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Preview the English stopword list as one comma-separated string.
", ".join(stopwords.words('english'))

In [None]:
# English stopwords held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in STOPWORDS.

    The input is converted with ``str()`` first, and the surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in STOPWORDS, str(text).split())
    return " ".join(kept_tokens)

In [None]:
# Strip stopwords from the punctuation-free text, then show the last rows.
df['text_wo_stop'] = df['text_wo_punct'].apply(remove_stopwords)
df.tail()

In [None]:
from collections import Counter
# Corpus-wide token frequencies over the stopword-free text.
cnt = Counter(
    token
    for page_text in df['text_wo_stop'].values
    for token in page_text.split()
)
cnt.most_common(10)

In [None]:
# The ten most frequent tokens in the corpus.
FREQ_WORDS = {token for token, _count in cnt.most_common(10)}


def remove_freqwords(text):
    """Return ``text`` without the whitespace-separated tokens found in FREQ_WORDS.

    The input is converted with ``str()`` first, and the surviving tokens are
    re-joined with single spaces.
    """
    tokens = str(text).split()
    return ' '.join(token for token in tokens if token not in FREQ_WORDS)


df['text_wo_stopfreq'] = df['text_wo_stop'].apply(remove_freqwords)

# Spot-check the row labelled 1, leaving off its final character.
df.loc[1, 'text_wo_stopfreq'][:-1]

In [None]:
# Drop only the stopword-stripped intermediate column. 'text_wo_punct' is kept
# because the cell that builds the Chroma documents and metadata reads it;
# dropping it here makes that cell fail with a KeyError.
# Reassigning with errors='ignore' (rather than inplace=True) lets this cell be
# re-run without raising once the column is already gone.
df = df.drop(columns=['text_wo_stop'], errors='ignore')


In [None]:
# How many of the least frequent tokens to treat as "rare".
n_rare_words = 10

# most_common() is ordered from most to least frequent, so reversing it puts
# the rarest tokens first.
least_common_first = cnt.most_common()[::-1]
RARE_WORDS = {token for token, _count in least_common_first[:n_rare_words]}
RARE_WORDS

In [None]:
def remove_rarewords(text):
    """Return ``text`` without the whitespace-separated tokens found in RARE_WORDS.

    The input is converted with ``str()`` first, and the surviving tokens are
    re-joined with single spaces.
    """
    tokens = str(text).split()
    kept_tokens = (token for token in tokens if token not in RARE_WORDS)
    return ' '.join(kept_tokens)

In [None]:
# Final cleaning stage: remove the rare tokens from the text that has already
# lost its stopwords and most frequent tokens, then show the first rows.
df['text_wo_stopfreqrare'] = df['text_wo_stopfreq'].apply(remove_rarewords)
df.head()

In [None]:
from nltk.stem.porter import PorterStemmer
# Show the complete cleaned text of the row labelled 3.
df.loc[3, 'text_wo_stopfreqrare']

In [None]:
# Show the cleaned text of the row labelled 1, skipping its first 20 characters.
df.loc[1, 'text_wo_stopfreqrare'][20:]

In [None]:
# Display the fully cleaned text column.
df.loc[:, 'text_wo_stopfreqrare']

In [None]:
# Unit numbers 1 through 11, in order.
units = list(range(1, 12))
units

In [None]:
# Chapter titles in order, one per line for easier diffing.
chapter_titles = [
    'Building a democratic system',
    'Rule of Law',
    'Equality',
    'Justice',
    'Patriotism',
    'Responsibility',
    'Industriousness',
    'Self Reliance',
    'Saving',
    'Active community participation',
    'The pursuit of wisdom',
]

In [None]:
# Map each unit number (as a string key) to the chapter title at the same position.
chapters = dict(zip(map(str, units), chapter_titles))
chapters

In [None]:
import chromadb
import sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Name of the sentence-transformers checkpoint to load.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# NOTE(review): no later cell visible here references `model`; the Chroma
# collection below is created without an explicit embedding function - confirm
# whether this model is meant to be wired into it.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Build the three parallel lists passed to the vector collection.
# Reads df['text_wo_punct'], so that column must still be present in df here.
punct_free_pages = list(df['text_wo_punct'])

# The first 6 characters of each page go into the metadata as 'unit'; the rest
# of the string is the document body.
# NOTE(review): presumably each page starts with a 6-character unit label - confirm.
documents = [page_text[6:] for page_text in punct_free_pages]

# 'page' is the 1-based row position in df, not the value in df['page'].
# NOTE(review): confirm this is the intended page numbering.
metadatas = [
    {'unit': page_text[:6], 'page': row + 1}
    for row, page_text in enumerate(punct_free_pages)
]

# String ids start at '3' (row position + 3).
ids = [str(row + 3) for row in range(len(punct_free_pages))]


In [None]:
# Echo the full list of document strings assembled in the previous cell.
documents

In [None]:
# Chroma client created with default settings; the collection cell below uses it.
client = chromadb.Client()

In [None]:
# get_or_create_collection returns the existing collection when this cell is
# re-run in the same kernel, where create_collection would raise because
# 'civics-latest' already exists.
vector_coll = client.get_or_create_collection('civics-latest')

# upsert inserts new ids and overwrites existing ones, so re-running the cell
# leaves exactly one entry per id instead of failing or duplicating.
vector_coll.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
)

In [None]:
results = vector_coll.query(
    query_texts=['justice'],
    n_results=5
)

# `results` is a dict of parallel lists with one inner list per query text.
# Only one query was sent, so the hits live at index 0 of each field.
# Iterate over the hits themselves: len(results) is the number of keys in the
# dict, not the number of matches, and would print the wrong number of rows or
# raise IndexError.
hits = zip(results['ids'][0], results['metadatas'][0], results['documents'][0])
for hit_id, hit_metadata, hit_document in hits:
    print('Ids:', hit_id)
    print('Metadatas:', hit_metadata)
    print('Description:', hit_document)
    print('\n')