# Abhyas Edu Context
### Note: Choose Nvidia GPU in Google Colab for faster computation

In [1]:
# this will tell you if you have an Nvidia GPU attached to your machine or not
!nvidia-smi

Tue Apr 19 16:40:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    59W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# install dependencies
# ignore pip dependency warnings/errors (These are Colab specific)

!pip install pdfplumber
!pip install clean-text
!pip install nltk
!pip install keybert
!pip install unidecode
!pip install transformers
!pip install sentence-transformers
!pip install pyTigerGraph -q
!pip install tqdm



In [3]:
#downloading assets

from nltk import download

download('stopwords')
download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# keyword extraction

from nltk.corpus import stopwords
from keybert import KeyBERT
from cleantext import clean

stop_words = stopwords.words('english')

kw_model = KeyBERT()

def extract_keywords(sentence):
    """Return the cleaned single-word keywords KeyBERT finds in ``sentence``.

    KeyBERT is asked for at most three unigram keyphrases, ignoring the
    module-level ``stop_words``. Each keyword is stripped and passed through
    ``clean`` (numbers replaced by an empty string, punctuation removed);
    keywords that end up empty after cleaning are dropped.
    """
    candidates = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),
        stop_words=stop_words,
        top_n=3,
    )
    # Each candidate is indexed at [0] to pick the keyword text.
    cleaned_words = (
        clean(candidate[0].strip(), no_numbers=True, replace_with_number="", no_punct=True)
        for candidate in candidates
    )
    return [word for word in cleaned_words if word != '']

In [5]:
# call the method once to download the required all-MiniLM-L6-v2 (distilled BERT base) model assets

extract_keywords('war is bad')

['war', 'bad']

In [6]:
# embedding extraction

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_embedding(sentence):
    """Encode ``sentence`` with the module-level SentenceTransformer ``model``.

    The encoder is called with ``convert_to_tensor=True`` and its result is
    returned unchanged.
    """
    embedding = model.encode(sentence, convert_to_tensor=True)
    return embedding

In [7]:
# call the method once to download the required all-MiniLM-L6-v2 transfer learning model assets 
# This will not download again if downloaded at the above key word extraction step
# only the first call takes time due to assets being downloaded

a = extract_embedding('war is bad')
b = extract_embedding('war is never good')
c = extract_embedding('this is a sunflower')

In [8]:
from sentence_transformers import util

def similarity_score(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a plain Python float.

    Only the entry at row 0, column 0 of the ``util.pytorch_cos_sim`` result
    is read, i.e. the score between the first item of each argument.
    """
    pairwise_scores = util.pytorch_cos_sim(embedding1, embedding2)
    first_pair_score = pairwise_scores[0][0]
    return float(first_pair_score)

In [9]:
similarity_score(a,b)

0.8693468570709229

In [10]:
similarity_score(a,c)

0.09055604040622711

In [11]:
similarity_score(b,c)

0.06450851261615753

In [12]:
# stemmer

from nltk.stem.snowball import SnowballStemmer

englishStemmer=SnowballStemmer("english")

def stem(word):
    """Return the stem of ``word`` from the shared English Snowball stemmer."""
    stemmed_word = englishStemmer.stem(word)
    return stemmed_word

In [13]:
stem('having')

'have'

# PyTigerGraph

In [14]:
import os
from getpass import getpass

import pyTigerGraph as tg

# connection parameters
# hostName is the TigerGraph solution URL
hostName = "https://abhyas-edu-context.i.tgcloud.io"
graphName = "edu_context"
userName = "tigergraph"
# Credentials must not live in the notebook: read the password from the
# TG_PASSWORD environment variable, or prompt for it when it is not set.
# NOTE(review): the password that used to be hard-coded here should be treated
# as leaked and rotated on the TigerGraph solution.
password = os.environ.get("TG_PASSWORD") or getpass("TigerGraph password: ")

# establish the connection to the TigerGraph Solution
conn = tg.TigerGraphConnection(host=hostName, username=userName, password=password)

# set the name of the graph that we want to connect to
conn.graphname = graphName

# create a secret
secret = conn.createSecret()
# use the secret to get a token (first element of the value returned by getToken)
authToken = conn.getToken(secret)[0]

# connect to graph with token
conn = tg.TigerGraphConnection(host=hostName, username=userName, password=password, graphname=graphName, apiToken=authToken)

In [15]:
# result = conn.runInstalledQuery("get_edu_context", params={"inName": stem("software")})

In [16]:
# result[0]['RES'][0]

In [17]:
def get_edu_context(sentence, top_n=10):
    """Find the stored textbook sentences most similar to ``sentence``.

    Keywords are extracted from ``sentence``, stemmed, and each one is looked
    up with the installed TigerGraph query ``get_edu_context``. Every sentence
    attached to the first result of each lookup is scored against the input
    sentence by cosine similarity of their embeddings.

    Args:
        sentence: The query text.
        top_n: Maximum number of matches to return (default 10).

    Returns:
        A list of at most ``top_n`` dicts with the keys ``sentence``, ``page``,
        ``grade``, ``subject`` and ``similarity``, sorted in descending order
        of ``(similarity, grade, subject)``. ``grade`` and ``subject`` are
        compared as strings. Matches with identical sort keys keep the order
        in which they were first returned by the graph queries.
    """
    sentence_embedding = extract_embedding(sentence)

    # Keep the first result of every keyword lookup that carries sentence data.
    edu_context_res = []
    for word in extract_keywords(sentence):
        this_res = conn.runInstalledQuery("get_edu_context", params={"inName": stem(word)})
        found = this_res[0]['RES']
        if found and found[0]['attributes']['@this_data']:
            edu_context_res.append(found[0])

    # De-duplicate BEFORE scoring: the same sentence can be reached through
    # several keywords, and embedding it again only to discard the copy is
    # wasted work. A dict (insertion-ordered) is used instead of a set so the
    # order of tied matches, and therefore the top_n cut, is reproducible.
    unique_matches = {}
    for each in edu_context_res:
        for sent_data in each['attributes']['@this_data']:
            # assumes book ids look like "<grade>_<subject>..." - TODO confirm;
            # a book id without '_' yields an empty subject instead of raising.
            book_parts = sent_data['book'].split('_')
            grade = book_parts[0]
            subject = book_parts[1] if len(book_parts) > 1 else ''

            match_key = (sent_data['sentence'], sent_data['page'], grade, subject)
            if match_key in unique_matches:
                continue

            this_sim_score = similarity_score(extract_embedding(sent_data['sentence']), sentence_embedding)
            unique_matches[match_key] = {
                'sentence': sent_data['sentence'],
                'page': sent_data['page'],
                'grade': grade,
                'subject': subject,
                'similarity': this_sim_score,
            }

    ranked = sorted(
        unique_matches.values(),
        key=lambda x: (x["similarity"], x['grade'], x['subject']),
        reverse=True,
    )
    return ranked[:top_n]

# Now, let's find ***Edu Context***!!

In [18]:
import pandas as pd

In [19]:
# comment this cell if you're not using google colab

from google.colab import data_table
data_table.enable_dataframe_formatter()

In [20]:
result = get_edu_context('computers are really important for humanity')
pd.DataFrame(result)

Unnamed: 0,sentence,page,grade,subject,similarity
0,"the manner, in which computers have revolutio...",12,11,python,0.70634
1,may it be the field of education and research...,12,11,python,0.623129
2,the computers that we use today belong to thi...,20,11,python,0.59323
3,but the computers of today have evolved over ...,12,11,python,0.591756
4,in simple terms the computers of this generat...,21,11,python,0.566117
5,computers are very versatile as they do lot o...,21,11,python,0.549535
6,"the digital computers are used in industrial,...",26,11,python,0.528273
7,salient features of fifth generation computer...,21,11,python,0.515041
8,today no organization can function without a ...,12,11,python,0.513453
9,"we use objects to capture data, which then ca...",112,11,python,0.510727


In [24]:
result = get_edu_context('gravitational force')
pd.DataFrame(result)

Unnamed: 0,sentence,page,grade,subject,similarity
0,this force is called the gravitational force,141,9,science,0.798153
1,this force is known as the gravitational force,149,8,science,0.786039
2,4) gives the gravitational force,142,9,science,0.712268
3,u a force can act on an object with or withou...,154,8,science,0.700634
4,what do we call the gravitational force betwe...,154,9,science,0.696335
5,force,124,9,science,0.67562
6,force? the answer to these questions is gravi...,151,9,science,0.671985
7,4x106 m)2 to the gravitational force and is de...,145,9,science,0.660064
8,"which is called the force of gravity, or just...",149,8,science,0.65231
9,force f will be equal to the product of mass ...,145,9,science,0.650315


## Try something on your own!
#### Tip: Go to Data/ncert-science folder, open any one of the textbooks (pages "11" to "end-5") and search for related topics.

In [None]:
your_query = "type something here"
result = get_edu_context(your_query)
pd.DataFrame(result)

# *The End... And The Beginning... :)*
