In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
import google.generativeai as genai

In [6]:
# Read the ASAG dataset and discard the column named "\" when it is present
df = pd.read_csv("data/asag_dataset.csv").drop(columns=["\\"], errors="ignore")

# Preview the first three rows
df.head(3)


Unnamed: 0,question,student_answer,grades_round,student_modified,ref_answer,qn_modified,ref_modified,student_demoted,ref_demoted,length_ratio,...,embed_stud,embed_ref_demoted,embed_stud_demoted,aligned,aligned_demoted,cos_similarity,cos_similarity_demo,aligned_score,aligned_score_demo,question_id
0,"Give a definition for the term ""artificial ne...",An artificial neural network is a massively pa...,2,artificial neural network massively parallel d...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,massively parallel distributed processor simpl...,massively parallel distributed processor made ...,0.251889,...,[[ 2.2006836 0.86382484 0.27182007 2.55627...,[[ 1.6300049e+00 1.5985355e+00 -1.2829590e-01...,[[ 2.0412598e+00 4.9321938e-01 1.0058594e-01...,"[['neural', 'neural'], ['network', 'network'],...","[['simple', 'simple'], ['processing', 'process...",0.947867,0.933466,0.969697,0.950888,1
1,"Give a definition for the term ""artificial ne...",Artificial neural network consists of: . Large...,2,artificial neural network consists largely par...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,consists largely parallel distributed processo...,massively parallel distributed processor made ...,0.232759,...,[[ 1.33543945 1.09904457 0.52998901 2.03334...,[[ 1.6300049e+00 1.5985355e+00 -1.2829590e-01...,[[ 1.19566895 0.7539518 0.13561035 1.22295...,"[['knowledge', 'knowledge'], ['parallel', 'par...","[['knowledge', 'knowledge'], ['knowledge', 'kn...",0.964398,0.951182,0.883259,0.818713,1
2,"Give a definition for the term ""artificial ne...",An artificial neural network is a massive dist...,1,artificial neural network massive distributed ...,A neural network is a massively parallel distr...,give definition term artificial neural network...,neural network massively parallel distributed ...,massive distributed processor consists several...,massively parallel distributed processor made ...,0.102828,...,[[ 0.41577148 -0.37836266 0.22351074 0.95300...,[[ 1.6300049e+00 1.5985355e+00 -1.2829590e-01...,[[ 3.84277344e-01 -4.89446640e-01 1.72241211e...,"[['knowledge', 'knowledge'], ['neural', 'neura...","[['knowledge', 'knowledge'], ['distributed', '...",0.854767,0.775333,0.498039,0.465632,1


In [7]:
# Columns expected to be present in df, in reporting order
expected_columns = [
    "question",
    "student_answer",
    "ref_answer",
    "qn_modified",
    "student_modified",
    "ref_modified",
    "student_demoted",
    "ref_demoted",
    "length_ratio",
    "embed_ref",
    "embed_stud",
    "embed_ref_modified",
    "embed_stud_modified",
    "embed_ref_demoted",
    "embed_stud_demoted",
    "aligned",
    "aligned_demoted",
    "grades_round",
    "cos_similarity",
    "cos_similarity_modified",
    "cos_similarity_demo",
    "aligned_score",
    "aligned_score_demo",
    "question_id",
]

# Keep only the expected names that df does not contain
missing_columns = list(filter(lambda name: name not in df.columns, expected_columns))
print("Missing Columns:", missing_columns)

Missing Columns: ['embed_ref_modified', 'embed_stud_modified', 'cos_similarity_modified']


🔹 Step 2: Load SBERT & Generate Embeddings


In [8]:
# Load SBERT model
sbert_model = SentenceTransformer('paraphrase-mpnet-base-v2')


def _embed_text(text):
    """Encode one text with SBERT, or return None when there is nothing to encode.

    Missing values (NaN/None) and blank strings yield None, so a missing
    answer is never embedded as the literal word "nan".
    """
    if pd.isna(text):
        return None
    text = str(text)
    if not text.strip():
        return None
    return sbert_model.encode(text).tolist()


def _row_cosine(row):
    """Return the cosine similarity of a row's two embeddings, or NaN if either is missing."""
    ref_vec = row['embed_ref_modified']
    stud_vec = row['embed_stud_modified']
    # cosine_similarity cannot take None, so rows lacking either embedding get NaN
    # instead of aborting the whole cell.
    if not isinstance(ref_vec, list) or not isinstance(stud_vec, list):
        return np.nan
    return cosine_similarity([ref_vec], [stud_vec])[0][0]


# Embed the preprocessed reference and student answers (None where text is missing/blank)
df["embed_ref_modified"] = df["ref_modified"].apply(_embed_text)
df["embed_stud_modified"] = df["student_modified"].apply(_embed_text)

# Compute Cosine Similarity
df['cos_similarity_modified'] = df.apply(_row_cosine, axis=1)





🔹 Step 3: Compute Final Combined Similarity Score
    Weighted formula: 40% `cos_similarity_demo` + 60% `cos_similarity_modified`

In [31]:
# Blend the two similarity columns: weight 0.4 on the demoted-text score
# and weight 0.6 on the modified-text score.
df["combined_similarity"] = (
    df["cos_similarity_demo"] * 0.4
    + df["cos_similarity_modified"] * 0.6
)


🔹 Step 4: Assign Grades Based on Cosine Similarity & Alignment

In [None]:
def assign_grades(cos_sim, correct_threshold=0.7834, partial_threshold=0.3126):
    """Map a similarity score to a (label, numeric grade) pair.

    Args:
        cos_sim: Similarity score to grade.
        correct_threshold: Scores strictly above this value receive grade 2.
        partial_threshold: Scores strictly above this value (and not above
            ``correct_threshold``) receive grade 1.

    Returns:
        A tuple ``(grade_text, grade)``: ``("Completely Correct", 2)``,
        ``("Partially Incorrect", 1)`` or ``("Incorrect", 0)``.

    Both comparisons are strict, so a score equal to a threshold falls into
    the lower band. A NaN score fails both comparisons and is graded
    ``("Incorrect", 0)``.
    """
    if cos_sim > correct_threshold:
        return "Completely Correct", 2
    if cos_sim > partial_threshold:
        # NOTE(review): label reads "Partially Incorrect" for grade 1; kept
        # unchanged because it is written to the output data.
        return "Partially Incorrect", 1
    return "Incorrect", 0

# Grade every combined score and spread the (label, grade) pair over two columns
df[["grade_text", "grades_auto"]] = df["combined_similarity"].apply(
    lambda score: pd.Series(assign_grades(score))
)


# Check results
# print(df[["cos_similarity_modified", "aligned_word_match", "grade_text", "grades_auto"]].head())


In [33]:
# --------------------------------
# Step 6: Save Graded Data
# --------------------------------
# Write the graded DataFrame to CSV without the index column.
OUTPUT_PATH = "data/graded_answers_new.csv"
df.to_csv(OUTPUT_PATH, encoding="utf-8", index=False)

print(f"Grading completed! Saved to {OUTPUT_PATH}")

Grading completed! Saved to data/graded_answers_new.csv


In [36]:
# Load the final dataset from disk.
# NOTE(review): the visible cells save to data/graded_answers_new.csv, not this
# file — confirm how data/Final_Dataset.csv is produced.
newdf = pd.read_csv("data/Final_Dataset.csv")

In [37]:
# Display the column names of newdf
newdf.columns

Index(['question', 'student_answer', 'student_modified', 'ref_answer',
       'qn_modified', 'ref_modified', 'student_demoted', 'ref_demoted',
       'length_ratio', 'embed_ref', 'embed_stud', 'embed_ref_demoted',
       'embed_stud_demoted', 'aligned', 'aligned_demoted', 'cos_similarity',
       'cos_similarity_demo', 'aligned_score', 'aligned_score_demo',
       'question_id', 'embed_ref_modified', 'embed_stud_modified',
       'cos_similarity_modified', 'combined_similarity', 'grade_text',
       'grades_auto'],
      dtype='object')