In [None]:
!curl -LO https://github.com/purnadip-manna/smart_assessment/raw/main/models/testmodel1.h5
!curl -LO https://raw.githubusercontent.com/purnadip-manna/smart_assessment/main/data.csv

In [None]:
!pip install sentence-transformers 

In [None]:
!pip install keybert

In [None]:
import tensorflow as tf
import numpy as np
import transformers
import csv
import pandas as pd
from keras.models import load_model 
import pandas as pd   
import sklearn 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import metrics  
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer 
from keybert import KeyBERT
# Sentence-embedding model; SimilarityMetric uses it as its default encoder.
key_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# KeyBERT keyword extractor, created with its default settings.
kw_model = KeyBERT()

In [None]:
# Class names, looked up by the argmax of the classifier's prediction.
labels = ["contradiction", "entailment", "neutral"]
# Default batch size of BertSemanticDataGenerator.
batch_size = 32
# Maximum length of input sentence to the model.
max_length = 128
# Pretrained BERT encoder that BertSemanticDataGenerator runs on every batch.
bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")

In [None]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that turns sentence pairs into BERT sequence outputs.

    Every batch of sentence pairs is tokenized with the ``bert-base-uncased``
    tokenizer, passed through the module-level ``bert_model`` and returned as
    that model's ``last_hidden_state``.

    Args:
        sentence_pairs: Array of ``[sentence1, sentence2]`` pairs. It is
            indexed with an integer index array and must provide ``.tolist()``.
        labels: Labels aligned with ``sentence_pairs``. Only read when
            ``include_targets`` is true, so it may be ``None`` for inference.
        batch_size: Number of pairs per batch.
        shuffle: Whether to shuffle the sample order at construction time and
            after every epoch.
        include_targets: If true, ``__getitem__`` returns ``(features, labels)``;
            otherwise it returns only the features.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        # Newer Keras releases expect Sequence subclasses to initialise the
        # base class; on older releases this call is a harmless no-op.
        super().__init__()
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets

        # Load our BERT Tokenizer to encode the text.
        # We will use bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch.

        Rounds up so that a trailing partial batch is still served; flooring
        would silently drop those samples (and yield zero batches when there
        are fewer pairs than ``batch_size``).
        """
        return int(np.ceil(len(self.sentence_pairs) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as BERT outputs, plus labels if requested."""
        # Slicing clamps at the end of the array, so the last batch may be
        # shorter than batch_size.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            # padding/truncation replace the deprecated pad_to_max_length flag
            # and make the cut-off at max_length explicit.
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        bert_output = bert_model(**encoded)
        sequence_output = bert_output.last_hidden_state

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return sequence_output, labels
        else:
            return sequence_output

    def on_epoch_end(self):
        """Shuffle the sample order in place if ``shuffle`` is enabled."""
        # A fresh RandomState(42) is created on every call, so the same
        # permutation is applied to the current order each time.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)


In [None]:
class SimilarityMetric:
    """Similarity scores between a student answer and a standard answer.

    The Euclidean, Manhattan and cosine metrics compare embeddings of the two
    keyword collections: for every standard-answer keyword the best-matching
    student keyword is found, and those best matches are averaged. The Jaccard
    score compares the word sets of the two raw answer texts.

    Args:
        student_answer: Keywords of the student answer, passed to
            ``key_model.encode``.
        standard_answer: Keywords of the standard answer, passed to
            ``key_model.encode``.
        st_ans: Raw text of one answer (used only by ``Jaccard_Similarity``).
        sn_ans: Raw text of the other answer (used only by
            ``Jaccard_Similarity``).
        key_model: Object with an ``encode`` method; defaults to the
            module-level ``key_model``.
    """

    def __init__(self,student_answer,standard_answer,st_ans,sn_ans,key_model=key_model) -> None:
        self.student_answer=student_answer
        self.standard_answer=standard_answer
        self.st_ans=st_ans
        self.sn_ans=sn_ans
        self.embded_student_answer = key_model.encode(student_answer)
        self.embded_standard_answer= key_model.encode(standard_answer)

    def _has_embeddings(self):
        """Return True when both keyword collections produced embeddings."""
        return (
            len(self.embded_standard_answer) > 0
            and len(self.embded_student_answer) > 0
        )

    def euclidian_dist(self):
        """Mean, over standard keywords, of the smallest Euclidean distance
        to any student keyword; NaN when either keyword set is empty."""
        if not self._has_embeddings():
            return float("nan")
        dist = euclidean_distances(self.embded_standard_answer, self.embded_student_answer)
        # Average every row's minimum. The previous loop overwrote its
        # accumulator, so only the last row contributed to the result.
        return dist.min(axis=1).mean()

    def manhatten_dist(self):
        """Mean, over standard keywords, of the smallest Manhattan distance
        to any student keyword; NaN when either keyword set is empty."""
        if not self._has_embeddings():
            return float("nan")
        dist = manhattan_distances(self.embded_standard_answer, self.embded_student_answer)
        return dist.min(axis=1).mean()

    def cosine_similarity(self):
        """Mean, over standard keywords, of the largest cosine similarity
        to any student keyword; 0.0 when either keyword set is empty."""
        if not self._has_embeddings():
            return 0.0
        similarities = cosine_similarity(self.embded_standard_answer, self.embded_student_answer)
        return similarities.max(axis=1).mean()

    def Jaccard_Similarity(self):
        """Jaccard index of the lower-cased, whitespace-split word sets of
        ``st_ans`` and ``sn_ans``; 0.0 when both texts contain no words."""
        words_doc1 = set(self.st_ans.lower().split())
        words_doc2 = set(self.sn_ans.lower().split())

        union = words_doc1 | words_doc2
        if not union:
            # Two empty texts: avoid dividing by zero.
            return 0.0
        return float(len(words_doc1 & words_doc2)) / len(union)


In [None]:
# Load the saved Keras classifier from the downloaded testmodel1.h5 file.
model = load_model("./testmodel1.h5")

In [None]:
# Score every row of data.csv: predict the entailment label for the
# (sentence1, sentence2) pair, compute keyword/word similarity metrics and
# append one result row per input row to result.csv.
df = pd.read_csv('data.csv')
length = len(df.index)


def _extract_keyphrases(text):
    """Return the KeyBERT key phrases (scores dropped) for one answer text."""
    # A single string is passed on purpose. The shape returned for list input
    # has differed between KeyBERT releases, and older releases ignored
    # use_mmr/diversity on the list path (NOTE(review): confirm against the
    # installed version). For a string the result is a flat list of
    # (phrase, score) tuples.
    scored = kw_model.extract_keywords(
        str(text),
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        use_mmr=True,
        diversity=0.3,
    )
    return [phrase for phrase, _score in scored]


# Open the output once instead of once per row. Mode 'a' is kept, so running
# this cell again appends to an existing result.csv.
with open('result.csv', 'a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for i in range(length):
        question = df.loc[i, "questions"]
        sentence1 = df.loc[i, "sentence1"]  # standard answer
        sentence2 = df.loc[i, "sentence2"]  # student answer
        label = df.loc[i, "label"]

        # --- Entailment prediction for this single pair ---
        sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
        test_data = BertSemanticDataGenerator(
            sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
        )
        # verbose=0: avoid printing one progress bar per data row.
        result = model.predict(test_data, verbose=0)
        index = int(np.argmax(result[0]))
        pred_label = labels[index]
        percentage = str(result[0][index])
        # str() guards against non-string labels (e.g. NaN for an empty cell).
        success = 1 if pred_label.strip() == str(label).strip() else 0

        # --- Keyword-based similarity ---
        candidates_standard = _extract_keyphrases(sentence1)
        candidates_student = _extract_keyphrases(sentence2)

        # Keyword arguments keep student and standard keywords in the slots
        # the class names for them; the earlier positional call had them
        # swapped. st_ans/sn_ans only feed the symmetric Jaccard score, so
        # their order is left as it was.
        sim_obj = SimilarityMetric(
            student_answer=candidates_student,
            standard_answer=candidates_standard,
            st_ans=str(sentence1),
            sn_ans=str(sentence2),
        )

        cosine_similar = sim_obj.cosine_similarity()
        euclid_dist = sim_obj.euclidian_dist()
        manhatten_d = sim_obj.manhatten_dist()
        jaccard_Similarity = sim_obj.Jaccard_Similarity()

        writer.writerow([
            question, sentence1, sentence2, label, pred_label, percentage,
            success, str(result), cosine_similar, jaccard_Similarity,
            euclid_dist, manhatten_d,
        ])
