**Data Pre-Proccesing**

In [3]:
import pandas as pd  # Importing the pandas library for data manipulation
import re  # Importing the re module for regular expression operations

def read_and_split_file(file_path):
    # List of common file encodings to try reading the file
    encodings = ['utf-8', 'ISO-8859-1', 'latin1', 'cp1252']
    
    # Attempt to read the file with each encoding
    for encoding in encodings:
        try:
            # Open the file with the current encoding
            with open(file_path, 'r', encoding=encoding) as file:
                content = file.readlines()  # Read all lines of the file

            data = []  # Initialize a list to store processed data
            # Process each line in the file
            for line in content:
                # Use regular expression to split the line at the first occurrence of a number pattern
                split_line = re.split(r'(\d+\.\d+)\s+', line, maxsplit=1)
                # If the line is successfully split into three parts (part before number, number, part after number)
                if len(split_line) == 3:
                    number, answer = split_line[1], split_line[2]  # Extract the number and the answer
                    data.append([number, answer.strip()])  # Add to the data list
            return data  # Return the processed data

        except UnicodeDecodeError:
            # If the current encoding fails, continue to try with the next encoding
            continue  

    # If all encodings fail, raise an error indicating the file could not be decoded
    raise ValueError(f"Unable to decode file {file_path} with given encodings.")

# File path for the data file
student_answers_file = 'ShortAnswerGrading_v2/data/raw/all'

# Read and split the file to get the student data
student_data = read_and_split_file(student_answers_file)

# Create a DataFrame from the student data with specified column names
df_student = pd.DataFrame(student_data, columns=['Number', 'Student Answer'])

In [4]:
instructor_answers_file = 'ShortAnswerGrading_v2/data/raw/answers'
questions_file = 'ShortAnswerGrading_v2/data/raw/questions'

# Read and split file
instructor_data = read_and_split_file(instructor_answers_file)
questions_data = read_and_split_file(questions_file)

# Create DataFrames
df_instructor_answers = pd.DataFrame(instructor_data, columns=['Number', 'Instructor Answer'])
df_questions = pd.DataFrame(questions_data, columns=['Number', 'Question'])

In [5]:
combined_df = pd.merge(df_questions, df_instructor_answers, on='Number')

combined_df

Unnamed: 0,Number,Question,Instructor Answer
0,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...
1,1.2,What stages in the software life cycle are inf...,The testing stage can influence both the codin...
2,1.3,What are the main advantages associated with o...,Abstraction and reusability.
3,1.4,Where do C++ programs begin to execute?,At the main function.
4,1.5,What is a variable?,A location in memory that can store a value.
...,...,...,...
82,12.6,What is a queue?,A data structure that stores elements followin...
83,12.7,What are the main operations associated with a...,push and pop
84,12.8,What is the Euler tour traversal of a tree?,"A walk around the tree, starting with the root..."
85,12.9,How do you delete a node from a binary search ...,"Find the node, then replace it with the leftmo..."


In [6]:
dataset = pd.merge(combined_df, df_student, on='Number')
dataset

Unnamed: 0,Number,Question,Instructor Answer,Student Answer
0,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,High risk problems are address in the prototyp...
1,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,To simulate portions of the desired final prod...
2,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,A prototype program simulates the behaviors of...
3,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,Defined in the Specification phase a prototype...
4,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,It is used to let the users have a first idea ...
...,...,...,...,...
2437,12.10,How many steps does it take to search a node i...,The height of the tree.,log n
2438,12.10,How many steps does it take to search a node i...,The height of the tree.,( n(n-1) ) / 2<br><br>
2439,12.10,How many steps does it take to search a node i...,The height of the tree.,2n-1
2440,12.10,How many steps does it take to search a node i...,The height of the tree.,"it takes at most h steps, where h is the heigh..."


In [9]:
import pandas as pd  # Importing pandas for data manipulation
import os  # Importing os module for operating system dependent functionality

def add_grades_to_df(df, base_dir):
    """
    This function adds grades to a DataFrame from text files located in numbered directories.
    It expects a DataFrame with a 'Number' column indicating directory names, and a base directory 
    path where these numbered directories are located.
    """

    # Iterating through each row in the DataFrame
    for index, row in df.iterrows():
        # Constructing the directory name from the 'Number' column in the DataFrame
        directory_name = str(row['Number'])
        # Creating the file path by joining the base directory, directory name, and a file named 'ave'
        file_path = os.path.join(base_dir, directory_name, 'ave')
        
        # Checking if the constructed file path exists
        if os.path.exists(file_path):
            # Opening and reading the contents of the file
            with open(file_path, 'r') as file:
                file_contents = file.read().strip()  # Removing any leading/trailing whitespace

                # Splitting the contents by newline and filtering out empty strings
                grades = file_contents.split('\n')
                grades = [grade for grade in grades if grade]  

                # Check if there are any grades in the list
                if grades:
                    # Selecting the first grade as the required grade
                    selected_grade = grades[0]
                    # Updating the DataFrame with the selected grade at the corresponding index
                    df.at[index, 'grade'] = selected_grade
                else:
                    # Print a message if no valid grades are found in the file
                    print(f"No valid grades found in {file_path}")
    # Returning the modified DataFrame with grades added
    return df


dataset_updated = add_grades_to_df(dataset, 'ShortAnswerGrading_v2/data/scores')
# 'dataset_updated' now contains the original data along with added grades
dataset_updated


Unnamed: 0,Number,Question,Instructor Answer,Student Answer,grade
0,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,High risk problems are address in the prototyp...,3.5
1,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,To simulate portions of the desired final prod...,3.5
2,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,A prototype program simulates the behaviors of...,3.5
3,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,Defined in the Specification phase a prototype...,3.5
4,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,It is used to let the users have a first idea ...,3.5
...,...,...,...,...,...
2437,12.10,How many steps does it take to search a node i...,The height of the tree.,log n,5
2438,12.10,How many steps does it take to search a node i...,The height of the tree.,( n(n-1) ) / 2<br><br>,5
2439,12.10,How many steps does it take to search a node i...,The height of the tree.,2n-1,5
2440,12.10,How many steps does it take to search a node i...,The height of the tree.,"it takes at most h steps, where h is the heigh...",5


In [32]:
dataset_updated.to_csv('Dataset/dataset_with_grades.csv', index=False)

**Building Word2Vec**

In [5]:
!pip install scikit-learn



In [6]:
from sklearn.model_selection import train_test_split
import gensim
from nltk.tokenize import word_tokenize
import nltk
import re

# Download NLTK data
nltk.download('punkt')

# Preprocessing function
def preprocess_text(text):
    # Lowercasing and removing non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenizing
    return word_tokenize(text)

# Applying preprocessing to each text column
dataset_updated['Processed Question'] = dataset_updated['Question'].apply(preprocess_text)
dataset_updated['Processed Instructor Answer'] = dataset_updated['Instructor Answer'].apply(preprocess_text)
dataset_updated['Processed Student Answer'] = dataset_updated['Student Answer'].apply(preprocess_text)

# Combine all processed text into a single series for training the model
all_text = pd.concat([dataset['Processed Question'], dataset_updated['Processed Instructor Answer'], dataset_updated['Processed Student Answer']])

# Split the dataset (80% train, 20% test)
train_text, test_text = train_test_split(all_text, test_size=0.2, random_state=42)

# Training the word2vec model
model = gensim.models.Word2Vec(sentences=train_text, vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("Models/word2vec_model.model")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shahadaleissa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Buliding a regressor with RandomForest**

In [7]:
df=pd.read_csv('Dataset/dataset_with_grades.csv')

In [8]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load pre-trained word2vec model
model = Word2Vec.load("Models/word2vec_model.model")


def sentence_to_avg_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Feature extraction
df['inst_vector'] = df['Instructor Answer'].apply(lambda x: sentence_to_avg_vector(x, model))
df['student_vector'] = df['Student Answer'].apply(lambda x: sentence_to_avg_vector(x, model))

# Combine vectors
df['combined_vector'] = df.apply(lambda row: np.concatenate([row['inst_vector'], row['student_vector']]), axis=1)

# Prepare dataset
X = np.stack(df['combined_vector'])
y = df['grade']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
regressor = RandomForestRegressor()  # You can experiment with different models and parameters
regressor.fit(X_train, y_train)

# Evaluate model
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0019168200408997953


In [17]:
import joblib

# Save model
joblib.dump(regressor, 'regressor.joblib')

['regressor.joblib']

In [9]:
import joblib
import gensim
import numpy as np

# Load pre-trained word2vec and regressor models
word2vec_model = gensim.models.Word2Vec.load("Models/word2vec_model.model")
regressor_model = joblib.load("Models/regressor.joblib")

def compute_sentence_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence.split() if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

def predict_grade(ref_answer, student_answer, word2vec_model, regressor_model):
    ref_vector = compute_sentence_vector(ref_answer, word2vec_model)
    student_vector = compute_sentence_vector(student_answer, word2vec_model)
    combined_vector = np.concatenate([ref_vector, student_vector])
    predicted_grade = regressor_model.predict([combined_vector])[0]
    return predicted_grade

# Sample reference answer and student answer for testing
sample_ref_answer = "log n is the height of the tree"
sample_student_answer = "log n"

# Perform inference
predicted_grade = predict_grade(sample_ref_answer, sample_student_answer, word2vec_model, regressor_model)
print(f"Predicted Grade: {predicted_grade}")


Predicted Grade: 3.15


**Optimization**

In [None]:
# Define the parameter grid
from sklearn.model_selection import GridSearchCV


param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the random forest
    'max_depth': [10, 20, 30],        # Maximum number of levels in each tree
    'min_samples_split': [2, 5, 10]   # Minimum number of samples required to split a node
}

# Create a base model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


In [11]:
# Best parameters found
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Mean Squared Error: 0.0018615237319338127


In [12]:
# Perform inference
predicted_grade = predict_grade(sample_ref_answer, sample_student_answer, word2vec_model, best_model)
print(f"Predicted Grade: {predicted_grade}")

Predicted Grade: 3.1825
