# Imports

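Running this cell takes a while (around 2 minutes), because it downloads the spaCy language model and the pretrained sentence-transformers models.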

In [None]:
# Data handling, numerics and plotting libraries.
import pandas as pd
import numpy as np
from math import e
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
# Building blocks for the cosine-similarity and TF-IDF scores computed in Similarity().
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Download the medium English spaCy model and load it; `nlp` supplies the
# document vectors behind the 'DocToVector' score.
! python -m spacy download en_core_web_md
import en_core_web_md
nlp = en_core_web_md.load()
# Install or upgrade sentence-transformers. NOTE(review): the version is not
# pinned, so the installed release may differ from run to run.
! pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer, util
# Pretrained sentence-embedding models behind the 'BERT' and 'Roberta' scores.
bert = SentenceTransformer('bert-base-nli-mean-tokens')
roberta = SentenceTransformer('stsb-roberta-large')
# Sentence splitter used before encoding answers with `roberta`.
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Classifier, data split and metric used when evaluating on the dataset.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 589 kB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=22f87bbc2acd579eb855e247a15c85b24fe240e68ee56b56243306919a05f8bc
  Stored in directory: /tmp/pip-ephem-wheel-cache-k86kdizn/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
Looking in indexes: 

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

# Similarity Model

In [None]:
def Similarity(model_answer, student_answer, model = 'all'):
    """Score how similar a student answer is to a model answer with four methods.

    Both answers are lower-cased once and then compared with:

    * ``DocToVector`` - cosine similarity of the spaCy document vectors
      produced by the module-level ``nlp`` pipeline.
    * ``TF-IDF`` - cosine similarity of TF-IDF vectors. The vocabulary is
      fitted on the model answer only (English stop words removed), so words
      that occur only in the student answer do not influence the score.
    * ``BERT`` - cosine similarity of the ``bert`` embeddings of the two
      complete answers.
    * ``Roberta`` - cosine similarity of the ``roberta`` embeddings of the
      FIRST sentence of each answer, as split by the Punkt tokenizer.

    Parameters
    ----------
    model_answer : str
        The reference (correct) answer.
    student_answer : str
        The answer to be scored against ``model_answer``.
    model : str, optional
        Currently unused; every call computes all four scores. The parameter
        is kept so that existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One float column named ``'similarity'`` indexed by ``'DocToVector'``,
        ``'TF-IDF'``, ``'BERT'`` and ``'Roberta'`` (in that order).
        ``'TF-IDF'`` is 0.0 when the model answer yields an empty vocabulary,
        and ``'Roberta'`` is 0.0 when either answer contains no sentence.
    """
    model_answer = model_answer.lower()
    student_answer = student_answer.lower()
    answers = [model_answer, student_answer]
    scores = {}

    # --- spaCy document vectors -------------------------------------------
    model_vec, student_vec = [nlp(text).vector for text in answers]
    # Entry [0][1] of the pairwise matrix is the student-vs-model similarity.
    scores['DocToVector'] = cosine_similarity([student_vec, model_vec])[0][1]

    # --- TF-IDF ---------------------------------------------------------------
    tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word',
                            stop_words='english', ngram_range=(1, 1))
    try:
        tfidf.fit([model_answer])
    except ValueError:
        # The model answer contains no usable term (empty or stop words only),
        # so there is no vocabulary to compare against.
        scores['TF-IDF'] = 0.0
    else:
        model_tfidf = tfidf.transform([model_answer])
        student_tfidf = tfidf.transform([student_answer])
        scores['TF-IDF'] = cosine_similarity(model_tfidf, student_tfidf)[0][0]

    # --- BERT sentence embeddings of the whole answers ------------------------
    bert_embeddings = bert.encode(answers)
    scores['BERT'] = cosine_similarity([bert_embeddings[0]], bert_embeddings[1:])[0][0]

    # --- RoBERTa sentence embeddings --------------------------------------
    tokenizer = PunktSentenceTokenizer(PunktParameters())
    model_sentences = tokenizer.tokenize(model_answer)
    student_sentences = tokenizer.tokenize(student_answer)
    if model_sentences and student_sentences:
        # Encode to NumPy arrays (the default) instead of torch tensors:
        # scikit-learn's cosine_similarity needs array-like input, and a
        # tensor that lives on a GPU cannot be converted implicitly.
        model_emb = roberta.encode(model_sentences)
        student_emb = roberta.encode(student_sentences)
        # NOTE(review): only the first sentence of each answer is compared;
        # any further sentences do not affect this score.
        scores['Roberta'] = cosine_similarity(model_emb, student_emb)[0][0]
    else:
        # An empty answer has no sentence to embed.
        scores['Roberta'] = 0.0

    labels = ['DocToVector', 'TF-IDF', 'BERT', "Roberta"]
    values = np.array([scores[label] for label in labels], dtype=float)
    return pd.DataFrame({'similarity': values}, index=labels)

# Use your own test cases

In [None]:
# Replace s1 and s2 with your own data.
# The cell displays the DataFrame returned by Similarity (one score per method).

# Sentence 1: passed to Similarity as the model (reference) answer
s1 = 'There are 365 days in year'
# Sentence 2: passed to Similarity as the student answer
s2 = "A year has 365 days"

Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.89379
TF-IDF,1.0
BERT,0.98471
Roberta,0.949108


# 10 Sample Test cases with discussion

In [None]:
# Same topic, different numeric bound: "more than 78 million" vs
# "no less than 70 million". In the recorded run Roberta gives the lowest
# score and DocToVector the highest.
s1 = 'Population of Egypt is more than 78 million'
s2 = "Egypt's population is no less than 70 million"
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.958394
TF-IDF,0.866025
BERT,0.802377
Roberta,0.655663


In [None]:
# The sentences differ only in the year (1939 vs 1949). In the recorded run
# DocToVector still reports 1.0, while BERT and Roberta drop noticeably.
s1 = 'World war 2 started in 1939'
s2 = 'World war 2 started in 1949'
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,1.0
TF-IDF,0.866025
BERT,0.545926
Roberta,0.651127


In [None]:
# Paraphrase: the clauses are reordered and synonyms are used
# ("nearly invisible" vs "practically undetectable").
s1 = 'Polar bears are nearly invisible under infrared'
s2 = 'Under infrared light, polar bears are practically undetectable.'
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.943533
TF-IDF,0.774597
BERT,0.895682
Roberta,0.934872


In [None]:
# The sentences differ only in the person named (Joe Biden vs Donald Trump).
s1 = 'Joe Biden is the President of the US'
s2 = 'Donald Trump is the President of the US'
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.943119
TF-IDF,0.57735
BERT,0.772651
Roberta,0.547745


In [None]:
# The sentences differ only in the office ("President" vs "Vice-President").
# Similarity fits the TF-IDF vocabulary on the model answer (s1) only, so the
# extra word in s2 does not lower that score (1.0 in the recorded run).
s1 = 'Joe Biden is the President of the US'
s2 = 'Joe Biden is the Vice-President of the US'
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.972736
TF-IDF,1.0
BERT,0.962798
Roberta,0.922993


In [None]:
# Negation: s2 only adds "not", which reverses the meaning of s1.
# In the recorded run DocToVector and TF-IDF stay near 1.0, while BERT
# gives a low score.
s1 = 'Serine is an amino acid'
s2 = 'Serine is not an amino acid'
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.987406
TF-IDF,1.0
BERT,0.257014
Roberta,0.78312


In [None]:
# The sentences differ only in the numeric range (80-110cm vs 90-100cm).
s1 = "The leopard's tail is 80-110cm long"
s2 = "The leopard's tail is 90-100cm long"
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.98512
TF-IDF,0.774597
BERT,0.88788
Roberta,0.715535


In [None]:
# Long sentences where s2 swaps the two countries ("Russia invaded Ukraine"
# vs "Ukraine invaded Russia") and changes the final year (2014 vs 2010).
# In the recorded run all four scores stay above 0.92.
s1 = "On 24 February 2022, Russia invaded Ukraine, in a steep escalation of the Russo-Ukrainian War that began in 2014."
s2 = "On 24 February 2022, Ukraine invaded Russia, in a steep escalation of the Russo-Ukrainian War that began in 2010."
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.994686
TF-IDF,0.960769
BERT,0.964684
Roberta,0.920408


In [None]:
# Paraphrase: s2 restates s1 in the passive voice with synonyms
# ("systematic" vs "organized", "grow" vs "cultivate").
s1 = 'Arable crop production refers to the systematic use of land to grow crops'
s2 = 'The organized use of land to cultivate crops is referred to as arable crop production.'
Similarity(s1,s2)

Unnamed: 0,similarity
DocToVector,0.964748
TF-IDF,0.816497
BERT,0.940564
Roberta,0.916018


# Evaluating the model on a large dataset

Note: The similarity scores for each (student answer, correct answer) pair were precomputed and stored in the dataset, so they do not have to be recalculated here.

In [None]:
# Load the evaluation dataset from GitHub. The evaluation cell reads the
# columns "DocToVec", "TF-IDF", "Bert", "Roberta" and "correct" from it.
df = pd.read_csv("https://raw.githubusercontent.com/pradhyumnj/similarity-calculator/master/data.csv")

In [53]:
# The four similarity columns are the features; `correct` is the label to predict.
# NOTE(review): the dataset's column names ("DocToVec", "Bert") differ from the
# row labels returned by Similarity ("DocToVector", "BERT").
features = df[["DocToVec","TF-IDF","Bert","Roberta"]]
target = df['correct']
# Hold out 20% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=23)
# Seed the forest as well: without random_state the bootstrap samples and
# feature choices change on every run, so the reported accuracy would vary.
classifier = RandomForestClassifier(max_depth = 4, n_estimators= 300, min_samples_split = 2, random_state = 23)
classifier.fit(X_train,y_train)
predictions = classifier.predict(X_test)

# accuracy_score takes (y_true, y_pred).
print(f"Accuracy = {accuracy_score(y_test, predictions) * 100}%")

Accuracy = 82.0040899795501%
