In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

In [2]:
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [3]:
#Function for cosine similarity
def mag(x): 
    return np.sqrt(sum(i**2 for i in x))
def cossim(embed1,embed2):
    return np.dot ( embed1, embed2)/(mag(embed1)*mag(embed2))

In [4]:
#List of phrases 
phrases = [
'Good job!',
'Why?',
'This is not quite right, check',
'Good question',
'Add more detail.',
'Explain why this is important.',
'What does this tell you about?',
'What do you mean?',
'Say more about this',
'You missed something',
'How does this connect to what you said before?',
'What does this reveal about the author’s claim?',
'Did we see this idea before, or is this new?',
'Can you connect this to your own life?',
'What is the deeper meaning here?',
'How do you know this?'
]
   
phrases = list(set(phrases))

#Embeddings for phrases
embeddings_p = model.encode( phrases)

#Normalizing Embeddings
embeddings_p = embeddings_p / np.linalg.norm(embeddings_p, ord=2, axis=1, keepdims=True)
print(embeddings_p.shape)
# To print embeddings along with its corresponding phrases below 
for i in range(len(phrases[:5])):
    print(phrases[i])
    print(embeddings_p[i][:5])
print ( cossim(embeddings_p[0], embeddings_p[0]))
print ( cossim(embeddings_p[0], embeddings_p[5]))

(16, 768)
Say more about this
[-0.02749122  0.0038594   0.01216771  0.02630792  0.02266499]
You missed something
[ 0.00320384 -0.00935981 -0.00158467  0.03046354  0.03646295]
What does this tell you about?
[ 0.01929869 -0.03846261 -0.00386472  0.04521561  0.00936567]
Good job!
[-0.01111315  0.03667478  0.02177003 -0.04349593  0.00314289]
How does this connect to what you said before?
[ 0.04603298 -0.03493194  0.01333391  0.02224684  0.0248211 ]
1.0000000189202332
0.18575934731374472


In [None]:
#Data needs 3 text columns - 1) annotation, 2) student_note & 3) teacher_feedback, for privacy reasons cell output is cleared
df = pd.read_csv('./teachers_feedbacks_historical.csv')
df.head()

In [13]:
df.shape

(66987, 8)

In [14]:
#Embeddings for (raw) teacher's feedback
embeddings_f = model.encode(df['teacher_feedback'].values)
print(embeddings_f.shape)
#Normalizing feed back embeddings
embeddings_f = embeddings_f / np.linalg.norm(embeddings_f, ord=2, axis=1, keepdims=True)

(66987, 768)


In [15]:
#Similarity matrix between teacher's raw feed back and feedback prases
sim_matrix = np.matmul(embeddings_f, embeddings_p.T)
print(sim_matrix.shape)
sim_matrix[0]

(66987, 16)


array([0.38102397, 0.182388  , 0.47442174, 0.153306  , 0.2288782 ,
       0.15670286, 0.4450034 , 0.51017964, 0.41391277, 0.13697979,
       0.35046747, 0.36698267, 0.20331094, 0.5332635 , 0.20964894,
       0.3703055 ], dtype=float32)

In [16]:
#phrase indices which is most similar to feedback
phrases_ind = list(np.argmax(sim_matrix, axis=1))
#Mapped phrases for the feedback
mapped_phrases = [phrases[i] for i in phrases_ind ]
mapped_phrases[:5]

['Explain why this is important.',
 'This is not quite right, check',
 'Good question',
 'Good job!',
 'Good question']

In [17]:
#Adding columns for mapped phrases and similarity score to the dataframe
df['phrases'] = mapped_phrases
df['score'] = sim_matrix [np.arange(len(sim_matrix)),  phrases_ind]

In [18]:
# Now this will have 5 columns 1) annotation, 2) student_note , 3) teacher_feedback, 4) phrases  & 5) score
#Score is not used in models but it gives an idea of how a phrase is close to raw feedback
df.to_csv('./data_with_mapped_phrases_v2.csv', index=False) 