In [None]:
import os
import re
import glob
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import matplotlib.pyplot as plt

# Fix the global seed through Lightning before any shuffling or weight
# initialisation happens, so a Restart & Run All gives repeatable results.
pl.seed_everything(42)

In [None]:
class TextProcessor:
    """Text-to-tensor converter backed by a frequency-capped vocabulary.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``. The
    vocabulary is wiped and rebuilt from scratch on every ``build_vocab``
    call, so indices never accumulate across repeated builds.
    """

    def __init__(self, max_vocab=10000, max_len=50):
        self.max_vocab = max_vocab
        self.max_len = max_len
        self.reset_vocab()

    def reset_vocab(self):
        """Restore both lookup tables to just the two special tokens."""
        specials = ("<PAD>", "<UNK>")
        self.word2idx = {token: index for index, token in enumerate(specials)}
        self.idx2word = dict(enumerate(specials))
        self.vocab_size = len(specials)

    def clean_text(self, text):
        """Lower-case ``text`` and drop everything except a-z, 0-9 and whitespace."""
        return re.sub(r'[^a-z0-9\s]', '', str(text).lower())

    def build_vocab(self, text_list):
        """Rebuild the vocabulary from the most frequent words in ``text_list``.

        At most ``max_vocab - 2`` words are kept (two slots are taken by the
        special tokens); the final size is printed.
        """
        # Start from a clean slate so repeated calls cannot double-register words.
        self.reset_vocab()

        frequencies = Counter()
        for raw in text_list:
            frequencies.update(self.clean_text(raw).split())

        for word, _count in frequencies.most_common(self.max_vocab - 2):
            index = self.vocab_size
            self.word2idx[word] = index
            self.idx2word[index] = word
            self.vocab_size = index + 1
        print(f"‚úÖ Vocab Built! Size: {self.vocab_size}")

    def text_to_sequence(self, text):
        """Encode ``text`` as a ``torch.long`` tensor of exactly ``max_len`` ids.

        Unknown words map to 1 (``<UNK>``); short inputs are right-padded
        with 0 (``<PAD>``) and long inputs are truncated.
        """
        unk_id = 1
        ids = [self.word2idx.get(token, unk_id) for token in self.clean_text(text).split()]

        # Truncate first, then pad whatever room is left.
        ids = ids[:self.max_len]
        ids = ids + [0] * (self.max_len - len(ids))
        return torch.tensor(ids, dtype=torch.long)

In [None]:

class GradingDataset(Dataset):
    """Map-style dataset of (student answer, teacher answer, label) triples.

    Text is encoded lazily on access through ``proc.text_to_sequence``; the
    label is returned as a float tensor.
    """

    def __init__(self, stu, tea, lbl, proc):
        self.proc = proc
        self.data = [triple for triple in zip(stu, tea, lbl)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        student, teacher, label = self.data[idx]
        encode = self.proc.text_to_sequence
        student_ids = encode(student)
        teacher_ids = encode(teacher)
        return student_ids, teacher_ids, torch.tensor(label, dtype=torch.float)

class GradingDataModule(pl.LightningDataModule):
    """Lightning data module that shuffles, splits and serves the grading triples.

    The triples are shuffled once at construction. ``setup`` builds the
    vocabulary over every student and teacher text and then slices the data
    80 / 10 / 10 into train, validation and test datasets.
    """

    def __init__(self, stu, tea, lbl, batch_size=32):
        super().__init__()
        self.batch_size = batch_size
        self.full_data = list(zip(stu, tea, lbl))
        random.shuffle(self.full_data)
        self.processor = TextProcessor()

    def setup(self, stage=None):
        students, teachers, labels = zip(*self.full_data)

        # The vocabulary deliberately covers the whole corpus, not only the train split.
        self.processor.build_vocab(students + teachers)

        n_rows = len(self.full_data)
        train_end = int(n_rows * 0.8)
        val_end = int(n_rows * 0.9)

        def _subset(window):
            # One dataset per contiguous slice of the shuffled data.
            return GradingDataset(students[window], teachers[window], labels[window], self.processor)

        self.train_ds = _subset(slice(None, train_end))
        self.val_ds = _subset(slice(train_end, val_end))
        self.test_ds = _subset(slice(val_end, None))

    def train_dataloader(self):
        # Only the training loader reshuffles between epochs.
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size)


In [None]:
class Encoder(nn.Module):
    """Bidirectional LSTM sentence encoder that ignores trailing padding.

    Args:
        emb: Embedding dimension.
        hid_dim: LSTM hidden size per direction.
        voc_size: Number of rows in the embedding table.
        n_rnn_layer: Number of stacked LSTM layers.
        n_heads: Accepted so the signature matches the caller; not used here.
        dropout: Dropout probability for the embeddings and between LSTM layers.
        pad_idx: Token id used for right-padding (0, matching ``<PAD>``).

    ``forward`` returns a ``(batch, 2 * hid_dim)`` tensor: the last layer's
    final forward and backward hidden states, concatenated.
    """

    def __init__(self, emb, hid_dim, voc_size, n_rnn_layer, n_heads, dropout, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        # padding_idx keeps the <PAD> embedding at zero and out of the gradient.
        self.embedding = nn.Embedding(voc_size, emb, padding_idx=pad_idx)
        self.rnn = nn.LSTM(
            emb,
            hid_dim,
            num_layers=n_rnn_layer,
            batch_first=True,
            bidirectional=True,
            # Inter-layer dropout is meaningless (and warns) for a single layer.
            dropout=dropout if n_rnn_layer > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))

        # Real length of every row; assumes padding only appears at the end.
        # Clamp to 1 so an all-padding row still yields a valid packed sequence.
        lengths = (text != self.pad_idx).sum(dim=1).clamp(min=1).cpu()

        # Packing stops the LSTM at each row's true last token, so the final
        # hidden state no longer depends on how much padding was appended.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed)

        # Last layer: hidden[-2] is the forward direction, hidden[-1] the backward one.
        return torch.cat((hidden[-2], hidden[-1]), dim=1)

In [None]:
class Tester(nn.Module):
    """Siamese scorer: one shared encoder, cosine similarity, then a 1->1 sigmoid head.

    ``forward`` returns a ``(batch, 1)`` tensor of scores in (0, 1).
    """

    def __init__(self, emb_dim, hid_dim, voc_size):
        super().__init__()
        # Architecture knobs that are fixed for this model.
        encoder_cfg = dict(
            emb=emb_dim,
            hid_dim=hid_dim,
            voc_size=voc_size,
            n_rnn_layer=2,
            n_heads=4,
            dropout=0.3,
        )
        self.encoder = Encoder(**encoder_cfg)
        self.cos = nn.CosineSimilarity(dim=1)
        self.classifier = nn.Sequential(nn.Linear(1, 1), nn.Sigmoid())

    def forward(self, student_text, teacher_text):
        # Both answers go through the same encoder weights.
        encoded_student = self.encoder(student_text)
        encoded_teacher = self.encoder(teacher_text)
        sim = self.cos(encoded_student, encoded_teacher)
        # The linear head expects a feature dimension, hence the unsqueeze.
        return self.classifier(sim.unsqueeze(1))

In [None]:

class GradingLearner(pl.LightningModule):
    """Lightning wrapper that trains a pairwise grading model with MSE loss.

    Args:
        model_class: Callable invoked as ``model_class(emb_dim, hid_dim, voc_size)``
            to build the wrapped model.
        emb_dim: Embedding dimension forwarded to ``model_class``.
        hid_dim: Hidden size forwarded to ``model_class``.
        voc_size: Vocabulary size forwarded to ``model_class``.
        lr: Learning rate for AdamW.

    ``history`` collects one ``train_loss`` and one ``val_loss`` value per
    training epoch for ``plot_performance``.
    """

    def __init__(self, model_class, emb_dim, hid_dim, voc_size, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        self.model = model_class(emb_dim, hid_dim, voc_size)
        # MSE because targets are continuous grades in [0, 1].
        self.criterion = nn.MSELoss()
        self.history = {"train_loss": [], "val_loss": []}

    def forward(self, s, t):
        return self.model(s, t)

    def _compute_loss(self, batch):
        """Shared train/val loss for a (student, teacher, target) batch."""
        target = batch[2]
        # Reshape to the target's shape instead of a bare squeeze(): squeeze()
        # would also drop the batch dimension when the batch has one sample.
        prediction = self(batch[0], batch[1]).reshape(target.shape)
        return self.criterion(prediction, target)

    def training_step(self, batch, idx):
        loss = self._compute_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, idx):
        loss = self._compute_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        train_loss = self.trainer.callback_metrics.get("train_loss")
        # The metric can be absent (e.g. no training batches); skip rather than crash.
        if train_loss is not None:
            self.history["train_loss"].append(train_loss.item())

    def on_validation_epoch_end(self):
        # The pre-training sanity check also runs validation; recording it would
        # shift the validation curve by one epoch relative to the training curve.
        if self.trainer.sanity_checking:
            return
        v_loss = self.trainer.callback_metrics.get("val_loss")
        # Compare against None: a tensor holding exactly 0.0 is falsy.
        if v_loss is not None:
            self.history["val_loss"].append(v_loss.item())

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        # Cut the learning rate 10x after 2 epochs without val_loss improvement.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",
                "interval": "epoch",
                "strict": True,
            },
        }

    def plot_performance(self):
        """Plot the recorded per-epoch train and validation loss."""
        plt.figure(figsize=(10, 5))
        plt.plot(self.history["train_loss"], label="Train Loss")
        plt.plot(self.history["val_loss"], label="Val Loss")
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

In [None]:

print("\n--- 1. Loading Data ---")
# glob returns matches in arbitrary order; sort so the same file is picked on
# every run when the input directory holds more than one candidate.
csv_files = sorted(glob.glob("/kaggle/input/**/*.csv", recursive=True))
if csv_files:
    df = pd.read_csv(csv_files[0])
else:
    # No CSV available: fall back to the first Excel workbook.
    excel_files = sorted(glob.glob("/kaggle/input/**/*.xlsx", recursive=True))
    if not excel_files:
        raise FileNotFoundError("Please add a dataset in Kaggle UI (Sidebar -> Add Data)")
    df = pd.read_excel(excel_files[0])

# 2. DETECT COLUMNS
def find_col(k, c):
    """Return the first column in ``c`` whose name contains a keyword from ``k``.

    Keywords are tried in the order given, so earlier (more specific) keywords
    win over later (more generic) ones regardless of column order. Matching is
    case-insensitive and column labels are compared via ``str()``.

    Args:
        k: Keywords in priority order.
        c: Iterable of column labels.

    Returns:
        The matching column label exactly as it appears in ``c``, or ``None``.
    """
    for keyword in k:
        needle = str(keyword).lower()
        for col in c:
            if needle in str(col).lower():
                return col
    return None

# Resolve the three required columns, falling back to conventional names.
s_col = find_col(['student', 'answer'], df.columns) or 'Student Answer'
t_col = find_col(['teacher', 'model', 'reference'], df.columns) or 'Model Answer'
l_col = find_col(['score', 'grade'], df.columns) or 'Score'

print(f"Columns Detected: Student='{s_col}', Teacher='{t_col}', Score='{l_col}'")

# Two roles resolving to one column would silently train on meaningless pairs.
if len({s_col, t_col, l_col}) < 3:
    raise ValueError(
        f"Column detection is ambiguous: Student='{s_col}', Teacher='{t_col}', Score='{l_col}'"
    )

# Keep the raw frame untouched so this cell can be re-run safely.
df_clean = df.dropna(subset=[s_col, t_col, l_col])
raw_s = df_clean[s_col].astype(str).tolist()
raw_t = df_clean[t_col].astype(str).tolist()
raw_l = df_clean[l_col].astype(float).tolist()

if not raw_l:
    raise ValueError("No usable rows left after dropping rows with missing values")

# Scale scores into [0, 1] when they are on a larger scale; the maximum is
# computed once rather than once per element.
max_score = max(raw_l)
if max_score > 1.0:
    raw_l = [x / max_score for x in raw_l]

dm = GradingDataModule(raw_s, raw_t, raw_l, batch_size=32)

# Run setup eagerly so the vocabulary size is known before the model is built.
dm.setup()
actual_vocab_size = dm.processor.vocab_size
print(f"Real Vocab Size: {actual_vocab_size}")

# Extra embedding rows as headroom beyond the real vocabulary.
SAFE_VOCAB = actual_vocab_size + 100
print(f"Initializing Model with Safe Vocab: {SAFE_VOCAB}")

model = GradingLearner(Tester, emb_dim=64, hid_dim=128, voc_size=SAFE_VOCAB)

# Train
print("\n--- 2. Starting Training ---")
trainer = pl.Trainer(max_epochs=15, accelerator="auto", devices=1)
trainer.fit(model, dm)

# Plot the loss curves
print("\n--- 3. Results ---")
model.plot_performance()


In [None]:

def predict_grade(student_ans, teacher_ans):
    """Score one student answer against one teacher answer.

    Uses the notebook-level ``model`` and ``dm``. The model is switched to
    eval mode and moved to the CPU as a side effect.

    Returns:
        The predicted score as a Python float (0.0 to 1.0).
    """
    model.eval()
    model.to("cpu")

    # Encode each answer and add a leading batch dimension of 1.
    encode = dm.processor.text_to_sequence
    student_batch = encode(student_ans).unsqueeze(0)
    teacher_batch = encode(teacher_ans).unsqueeze(0)

    with torch.no_grad():
        return model(student_batch, teacher_batch).item()


print("\nü§ñ Grading AI Predictions:\n")

# Shared reference answer for both demo cases (biology / cells).
teacher_ref = "The mitochondria is the powerhouse of the cell."

# Case 1: a paraphrased, correct answer that should score high.
student_good = "Mitochondria are responsible for producing energy for the cell."
score_1 = predict_grade(student_good, teacher_ref)

print(
    f"üìù Teacher: '{teacher_ref}'",
    f"‚úÖ Student (Good): '{student_good}'",
    f"üìä Predicted Score: {score_1:.4f}  (Expected: High)\n",
    "-" * 50 + "\n",
    sep="\n",
)

# Case 2: a factually wrong answer that should score low.
student_bad = "The mitochondria is the brain of the cell and controls DNA."
score_2 = predict_grade(student_bad, teacher_ref)

print(
    f"üìù Teacher: '{teacher_ref}'",
    f"‚ùå Student (Bad):  '{student_bad}'",
    f"üìä Predicted Score: {score_2:.4f}  (Expected: Low)",
    sep="\n",
)

In [5]:
import json
import torch

print("üíæ Saving System...")

# 1. Vocabulary: needed later to turn new answers into the same token ids.
vocab_path = "vocab.json"
with open(vocab_path, "w") as f:
    json.dump(dm.processor.word2idx, f)
print(f"‚úÖ Saved Vocabulary to: {vocab_path}")

# 2. Model weights: only the inner model is saved, not the Lightning wrapper,
#    so a plain PyTorch app can load it.
model_path = "grading_model.pt"
# Save CPU copies explicitly, so the file loads on a CPU-only machine no matter
# which device the model is on when this cell runs.
cpu_state = {name: tensor.detach().cpu() for name, tensor in model.model.state_dict().items()}
torch.save(cpu_state, model_path)
print(f"‚úÖ Saved Model Weights to: {model_path}")

print("\nüéâ DONE! You can now download these files.")

üíæ Saving System...
‚úÖ Saved Vocabulary to: vocab.json
‚úÖ Saved Model Weights to: grading_model.pt

üéâ DONE! You can now download these files.
