# Note: This notebook extracts data from textbooks for uploading to TigerGraph studio.
## This data is already generated and available at Data/csv/11_python.csv 
### PS: The above CSV contains data from all 6 textbooks (6th-10th science and 11th python), not just 11th python as the name suggests.

In [81]:
# install dependencies
# you'll have to install general data science libraries like numpy and pandas if they are not available on your system

# !pip install numpy
# !pip install pandas
!pip install pdfplumber
!pip install clean-text
!pip install nltk
!pip install keybert
!pip install transformers
!pip install sentence-transformers
!pip install tqdm

In [82]:
import nltk

# download english stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sankepally/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Import statements

In [1]:
import numpy as np
import pandas as pd
import pdfplumber
from cleantext import clean
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
from nltk.corpus import stopwords
from keybert import KeyBERT
from cleantext import clean

stop_words = stopwords.words('english')

kw_model = KeyBERT()

def extract_keywords(sentence):
    """Return up to three cleaned single-word keywords for ``sentence``.

    The sentence is handed to the module-level KeyBERT model (``kw_model``),
    restricted to unigrams and using the module-level ``stop_words`` list.
    The first element of each result is stripped and passed through ``clean``
    (numbers replaced with an empty string, punctuation removed); anything
    that ends up empty is dropped.
    """
    candidates = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),
        stop_words=stop_words,
        top_n=3,
    )
    scrubbed = (
        clean(candidate[0].strip(), no_numbers=True, replace_with_number="", no_punct=True)
        for candidate in candidates
    )
    return [keyword for keyword in scrubbed if keyword != '']

In [3]:
extract_keywords('war is bad')

['war', 'bad']

In [4]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_embedding(sentence):
    """Encode ``sentence`` with the module-level ``model`` and return it as text.

    Returns:
        str: the vector components formatted as ``'[v0, v1, ...]'``.
    """
    vector = model.encode(sentence)
    # Format every component with str() instead of relying on str(list(...)):
    # list formatting uses repr() on its elements, and NumPy >= 2 renders
    # scalar reprs as "np.float32(0.0028)", which would corrupt the
    # serialized embedding.
    return '[' + ', '.join(str(component) for component in vector) + ']'

In [5]:
extract_embedding('war is never good')

'[0.002829365, 0.024987437, 0.015249953, -0.021388775, -0.0522007, 0.09914375, 0.0010025097, -0.11118701, -0.00068331807, 0.03139175, -0.013233021, 0.06263115, 0.034528036, 0.009428903, -0.08118215, 0.0462052, -0.04227168, -0.043866996, -0.026571063, -0.01029072, -0.06500262, 0.055710282, 0.10370304, 0.029750353, -0.04489703, 0.0071981503, -0.007854432, 0.009255177, -0.05181875, 0.059535883, 0.03223194, 0.03462218, -0.07084622, 0.03612974, 0.03379692, -0.008233604, -0.0014060126, 0.050732393, 0.015356616, -0.023598855, -0.028280608, -0.014991093, -0.013565133, -0.029381888, 0.020428535, 0.040843867, 0.024555419, -0.017937401, 0.07358307, 0.0103214905, -0.013437979, -0.029363407, -0.01496702, -0.01783271, 0.07996846, -0.0063506006, 0.028404886, 0.043091964, 0.061533324, -0.016756872, -0.01935311, 0.008117353, -0.01038007, -0.09968768, 0.080171615, -0.0006195122, 0.03700022, 0.09160838, -0.13171138, 0.06295779, -0.04772893, 0.030385371, 0.011729667, -0.029647289, -0.043181516, 0.04575905

In [None]:
## tensor version
# def extract_embedding(sentence):
#     return model.encode(sentence, convert_to_tensor=True)

In [6]:
# stemmer

from nltk.stem.snowball import SnowballStemmer

englishStemmer=SnowballStemmer("english")

def stem(word):
    """Reduce ``word`` to its stem using the module-level English Snowball stemmer."""
    stemmed_word = englishStemmer.stem(word)
    return stemmed_word

In [7]:
stem('having')

'have'

In [8]:
from tqdm import tqdm
import re

def get_words(file, grade, subject, skip_first=10, skip_last=5):
    """Extract stemmed keywords, with their source sentences, from a textbook PDF.

    Each processed page is cleaned with ``clean(..., no_line_breaks=True)`` and
    split on '.' into sentences. For every keyword that ``extract_keywords``
    returns for a sentence, one row is produced.

    Args:
        file: path of the PDF to read (opened with ``pdfplumber.open``).
        grade: grade of the book; stored in ``grade`` and used in ``book``.
        subject: subject of the book; stored in ``subject`` and used in ``book``.
        skip_first: number of leading pages to ignore (default 10).
        skip_last: number of trailing pages to ignore (default 5).

    Returns:
        pandas.DataFrame with columns ``word`` (stemmed keyword), ``sentence``
        (the sentence with '|', '"' and '#' removed), ``page`` (1-based page
        number), ``book`` (``'<grade>_<subject>'``), ``subject``, ``grade`` and
        ``concat`` (always ``'#'``). The frame is empty when no page falls
        inside the processed range.
    """
    columns = ['word', 'sentence', 'page', 'book', 'subject', 'grade', 'concat']
    book = str(grade) + '_' + subject

    # Raw-string pattern: the same three characters as before, without the
    # invalid "\|" / "\#" string escapes. '|' and '"' are the separator and
    # quote character used when this notebook writes the frame to CSV.
    strip_chars = re.compile(r'[|"#]')

    # Collect plain dicts and build the frame once at the end: per-row
    # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
    records = []

    # The context manager guarantees the PDF handle is closed, even on error.
    with pdfplumber.open(file) as pdf:
        # ignore starting and ending pages
        for i in tqdm(range(skip_first, len(pdf.pages) - skip_last)):
            # Pages without a text layer may yield no text at all.
            page_text = pdf.pages[i].extract_text() or ''

            # for each sentence of the page, extract words
            for each in clean(page_text, no_line_breaks=True).split('.'):
                if not each.strip():
                    # Nothing to extract from a blank fragment.
                    continue

                sentence = strip_chars.sub('', each)
                for word in extract_keywords(each):
                    records.append({
                        'word': stem(word),
                        'sentence': sentence,
                        'page': i + 1,
                        'book': book,
                        'subject': subject,
                        'grade': grade,
                        'concat': '#',
                    })

    df = pd.DataFrame(records, columns=columns)
    # Same column types the original declared on its empty frame.
    return df.astype({
        'word': 'str',
        'sentence': 'str',
        'page': 'int',
        'book': 'str',
        'subject': 'str',
        'grade': 'int',
        'concat': 'str',
    })

# Book: 11th class python

In [250]:
new_res_df = get_words('Data/ncert-science/Computer-Science-Python-Book-Class-XI.pdf',11,'python')

100%|██████████| 257/257 [02:47<00:00,  1.53it/s]


In [251]:
new_res_df.to_csv('Data/csv/11_python.csv',index=False, sep='|',quotechar='"')

# Book: 8th class science

In [257]:
science8_df = get_words('Data/ncert-science/Science-Class-8.pdf', 8, 'science')

100%|██████████| 249/249 [05:46<00:00,  1.39s/it]


In [259]:
science8_df.sample(10)

Unnamed: 0,word,sentence,page,book,subject,grade,concat
2390,mug,what is the material of the buckets or mugs y...,49,8_science,science,8,#
15409,rain,wonder whether acid rain you know that co is ...,254,8_science,science,8,#
4015,magnesium,we learnt that magnesium burns to form magnes...,76,8_science,science,8,#
962,soil,sow some seeds in the soil and arrange to wat...,27,8_science,science,8,#
7560,gland,milk secreting glands or mammary glands devel...,129,8_science,science,8,#
15489,warm,majority of people living in regions which ar...,255,8_science,science,8,#
14961,planet,in a planetarium you can see the motion of th...,248,8_science,science,8,#
2122,clamp,nylon take an iron stand with a clamp,46,8_science,science,8,#
7216,find,find out if the twins are identical or non-id...,124,8_science,science,8,#
5247,live,many tribals still live in the area,94,8_science,science,8,#


In [261]:
# header=False: appending with the default header=True would write the column
# names again as a data line in the middle of the existing CSV.
science8_df.to_csv('Data/csv/11_python.csv', mode='a', header=False, index=False, sep='|', quotechar='"')

# Book: 9th class science

In [9]:
science9_df = get_words('Data/ncert-science/Science-Class-9.pdf',9,'science')

100%|██████████| 213/213 [13:14<00:00,  3.73s/it]


In [10]:
science9_df.sample(10)

Unnamed: 0,word,sentence,page,book,subject,grade,concat
13541,helicobact,list any three reasons why you that a bacteri...,190,9_science,science,9,#
2264,carbon,we may not be able to see - sodium acetate + ...,43,9_science,science,9,#
552,liquid,this heat gets used up in changing the liquid...,17,9_science,science,9,#
4722,life,"in this process, each cell called mother cell...",75,9_science,science,9,#
9600,kg,5) as n m2 kg-2,144,9_science,science,9,#
3672,averag,"but if an element occurs in q isotopic forms,...",63,9_science,science,9,#
10449,mass,* gravitation is a weak force unless large ma...,153,9_science,science,9,#
7395,graph,"5, draw 1 2 perpendiculars from the points cor...",115,9_science,science,9,#
2640,polyatom,in case the number of 2 polyatomic ion is one...,48,9_science,science,9,#
7096,object,"when an object is moving most cases, objects ...",111,9_science,science,9,#


In [11]:
# header=False: appending with the default header=True would write the column
# names again as a data line in the middle of the existing CSV.
science9_df.to_csv('Data/csv/11_python.csv', mode='a', header=False, index=False, sep='|', quotechar='"')

# Book: 10th class science

In [12]:
science10_df = get_words('Data/ncert-science/Science-Class-10.pdf',10,'science')

100%|██████████| 277/277 [20:54<00:00,  4.53s/it]  


In [13]:
science10_df.sample(10)

Unnamed: 0,word,sentence,page,book,subject,grade,concat
1495,juic,2 saliva (after meal) what is the nature of e...,36,10_science,science,10,#
11885,mirror,the magnification produced by a spherical mir...,195,10_science,science,10,#
15353,motor,we have seen the working of a split ring comm...,246,10_science,science,10,#
11212,index,from the table you can know that the refracti...,185,10_science,science,10,#
13499,silver,2 electrical resistivity* of some substances a...,217,10_science,science,10,#
8807,matur,"changes in the body at puberty, such as incre...",150,10_science,science,10,#
12327,near,"the near point, for the person, is farther aw...",200,10_science,science,10,#
14431,compass,take a small compass and a bar magnet,234,10_science,science,10,#
14353,link,thus we can say that electricity and magnetis...,233,10_science,science,10,#
8855,reproduct,how are the modes for reproduction different ...,151,10_science,science,10,#


In [14]:
# header=False: appending with the default header=True would write the column
# names again as a data line in the middle of the existing CSV.
science10_df.to_csv('Data/csv/11_python.csv', mode='a', header=False, index=False, sep='|', quotechar='"')

# Book: 7th class science

In [15]:
science7_df = get_words('Data/ncert-science/Science-Class-7.pdf',7,'science')

100%|██████████| 233/233 [05:53<00:00,  1.52s/it]


In [16]:
science7_df.sample(10)

Unnamed: 0,word,sentence,page,book,subject,grade,concat
4525,profound,it has a profound effect on our lives,82,7_science,science,7,#
13853,situat,the situation water,239,7_science,science,7,#
10487,standard,"bureau of indian standards, new delhi assigns...",179,7_science,science,7,#
7257,slowest,rate of breathing will be the slowest and sup...,125,7_science,science,7,#
4061,calcium,"indeed, every new carbon dioxide (co ) + lime...",76,7_science,science,7,#
2932,convect,"these are conduction, convection and radiation",59,7_science,science,7,#
630,absorb,(c) in photosynthesis solar energy is absorbe...,23,7_science,science,7,#
2459,fig,(fig,52,7_science,science,7,#
774,various,1 various modes of feeding 4,25,7_science,science,7,#
6763,soil,"for paddy, soils rich components of soil, det...",117,7_science,science,7,#


In [17]:
# header=False: appending with the default header=True would write the column
# names again as a data line in the middle of the existing CSV.
science7_df.to_csv('Data/csv/11_python.csv', mode='a', header=False, index=False, sep='|', quotechar='"')

# Book: 6th class science

In [19]:
science6_df = get_words('Data/ncert-science/Science---Class-6.pdf',6,'science')

100%|██████████| 161/161 [03:30<00:00,  1.31s/it]


In [20]:
science6_df.sample(10)

Unnamed: 0,word,sentence,page,book,subject,grade,concat
5993,subtract,you cannot measure the girth of subtract the ...,110,6_science,science,6,#
10,boil,"we take at different times, isn't it? raw ric...",11,6_science,science,6,#
2550,mixtur,nnnnn difference in the size of particles in ...,54,6_science,science,6,#
4787,fish,the habitat provides we discussed in chapter ...,91,6_science,science,6,#
1530,type,2 different types of objects let us be a littl...,37,6_science,science,6,#
3157,plant,"activity 2 we would require a glass, water, r...",64,6_science,science,6,#
4306,bodi,"part of the body, keeping the rear place the ...",83,6_science,science,6,#
7845,south,using this you can figure out which end of th...,140,6_science,science,6,#
5954,length,"even today, we before we do that, we do need ...",109,6_science,science,6,#
7560,fig,these are known as fig,136,6_science,science,6,#


In [21]:
# header=False: appending with the default header=True would write the column
# names again as a data line in the middle of the existing CSV.
science6_df.to_csv('Data/csv/11_python.csv', mode='a', header=False, index=False, sep='|', quotechar='"')

# Add index for sentence_id

In [33]:
df = pd.read_csv('Data/csv/11_python.csv', sep='|',quotechar='"')

In [39]:
df['grade'].unique()

array([11,  8,  9, 10,  7,  6])

In [30]:
# Drop rows whose 'grade' field holds the literal text 'grade', i.e. header
# lines that were repeated inside the CSV when it was appended to.
df = df[~(df['grade']=='grade')]

In [38]:
df.to_csv('Data/csv/11_python.csv',mode='w', index=True, sep='|',quotechar='"')

In [37]:
df

Unnamed: 0,word,sentence,page,book,subject,grade,concat
0,unit,unit 1,11,11_python,python,11,#
1,comput,chapter 1 computer fundamentals after studying...,12,11_python,python,11,#
2,fundament,chapter 1 computer fundamentals after studying...,12,11_python,python,11,#
3,comput,chapter 1 computer fundamentals after studying...,12,11_python,python,11,#
4,comput,may it be the field of education and research...,12,11_python,python,11,#
...,...,...,...,...,...,...,...
81533,sheet,they or a sheet of newspaper on the paste to ...,171,6_science,science,6,#
81534,soak,let it soak up the extra water,171,6_science,science,6,#
81535,water,let it soak up the extra water,171,6_science,science,6,#
81536,let,let it soak up the extra water,171,6_science,science,6,#
