In [None]:
!pip install torch
!pip install transformers

In [None]:
!pip install catboost

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import xgboost as xgb

In [None]:
# Read the spreadsheet into a DataFrame; the bare name on the last line makes
# the notebook render it.
DATASET_PATH = "/content/totalmerged.xlsx"
code = pd.read_excel(DATASET_PATH)
code

In [None]:
# Build a single text field from the question and the submitted code.
# Missing cells are replaced with '' *before* concatenating: in pandas,
# NaN + str yields NaN, so a row lacking only one of the two columns would
# otherwise lose the text of the other column as well.
question_text = code['Question'].fillna('').astype(str)
submission_text = code['Code_with_Error'].fillna('').astype(str)
code['merged'] = question_text + ' ' + submission_text
code

In [None]:
# Replace any missing entries in the combined text column with an empty string
# so every row holds a string value.
code = code.assign(merged=code['merged'].fillna(''))

In [None]:
# Fetch the pretrained tokenizer and the TensorFlow encoder from the same
# checkpoint name so they stay in sync.
ROBERTA_CHECKPOINT = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = TFRobertaModel.from_pretrained(ROBERTA_CHECKPOINT)

In [None]:
# Pull the raw inputs and the regression target out of the frame as arrays:
# the combined text column and the Total_Marks column respectively.
# (No tokenization happens here; that is done when the embeddings are built.)
X_text, y = code["merged"].values, code["Total_Marks"].values

In [None]:
# Encode every document on its own and keep one vector per document: the
# slice [:, 0, :] takes position 0 along the sequence axis of the model's
# first output.
# NOTE(review): each appended array presumably has shape (1, hidden_size),
# since one text is encoded per call -- confirm.
X_vectors = []
for document in X_text:
    encoded = tokenizer.encode_plus(
        document,
        add_special_tokens=True,
        max_length=512,
        truncation=True,  # inputs longer than max_length are cut off
        return_tensors='tf',
    )
    first_output = roberta_model(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
    )[0]
    X_vectors.append(first_output[:, 0, :].numpy())

In [None]:

# Materialise the targets and the list of per-document embeddings as NumPy
# arrays so they can be split and passed to the model.
y = np.array(y)
X_vectors = np.array(X_vectors)

In [None]:
# Display the embedding array built in the preceding cells for inspection.
X_vectors

In [None]:
# Two-stage split with a fixed seed: first hold out 25% of all rows as the
# test set, then carve 25% of the remainder off as the validation set.
X_fit, X_test, y_fit, y_test = train_test_split(
    X_vectors, y, random_state=42, test_size=0.25
)
X_train, X_val, y_train, y_val = train_test_split(
    X_fit, y_fit, random_state=42, test_size=0.25
)

In [None]:
# Build the BiLSTM regressor.
# The input shape is declared with an explicit Input layer (already imported
# at the top of the notebook) instead of passing `input_shape=` to the first
# layer, which newer Keras versions warn against.
# The per-sample shape is taken from the training array: every axis after the
# batch axis, i.e. (X_train.shape[1], X_train.shape[2]).
model = Sequential()
model.add(Input(shape=X_train.shape[1:]))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dropout(0.2))
# Single linear output unit: the model predicts one scalar per sample.
model.add(Dense(units=1))

In [None]:
# Compile the model for regression with mean-squared-error loss.
# `Adam` is not imported anywhere in the notebook, so the optimizer is
# referenced through the existing `tf` import to avoid a NameError.
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mean_squared_error')

In [None]:
# Early stopping watches the validation loss with a patience of 5 epochs and
# restores the best weights seen when it triggers.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fit for at most 20 epochs in batches of 32, validating on the held-out
# validation split after each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

In [None]:
# Run the trained model over the training and test features (in that order)
# to obtain predictions for both splits.
y_train_pred, y_test_pred = [
    model.predict(features) for features in (X_train, X_test)
]


In [None]:
def _regression_scores(actual, predicted):
    """Return (RMSE, MAE, MAPE, R^2) for one set of targets and predictions."""
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    return rmse, mae, mape, r2


# Score the training and the test split with the same four metrics.
train_rmse, train_mae, train_mape, train_r2 = _regression_scores(y_train, y_train_pred)
test_rmse, test_mae, test_mape, test_r2 = _regression_scores(y_test, y_test_pred)

In [None]:
# Report each metric as a Train/Test pair, with a blank line between pairs.
metric_report = [
    ("Train RMSE:", train_rmse, "Test RMSE:", test_rmse),
    ("Train MAE:", train_mae, "Test MAE:", test_mae),
    ("Train MAPE:", train_mape, "Test MAPE:", test_mape),
    ("Train R^2:", train_r2, "Test R^2:", test_r2),
]
for position, (train_label, train_value, test_label, test_value) in enumerate(metric_report):
    if position > 0:
        # Separator goes between groups only, never after the last one.
        print()
    print(train_label, train_value)
    print(test_label, test_value)