In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split  
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets
cc_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_cc_embed_data.csv")  # Code with Comments
co_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_co_embed_data.csv")  # Code Only

# Set A: Entire Code Only dataset (Set A)
# Set B: Entire Code with Comments dataset (Set B)
set_A = co_data  
set_B = cc_data  

# Splitting the datasets
X_A = set_A.filter(like='co_embedding_')  # Assuming embeddings for CO
y_A = set_A[['Final_Marks', 'error_count']]  # Labels for CO

X_B = set_B.filter(like='cc_embedding_')  # Assuming embeddings for CC
y_B = set_B[['Final_Marks', 'error_count']]  # Labels for CC

# Train-test split with 0.2:0.8 ratio
X_A_train, X_A_test, y_A_train, y_A_test = train_test_split(X_A, y_A, test_size=0.2, random_state=42)
X_B_train, X_B_test, y_B_train, y_B_test = train_test_split(X_B, y_B, test_size=0.2, random_state=42)

# Replace half of the test set in Model A with Set B
replace_count_A = int(len(X_A_test) / 2)
X_A_test_replaced = pd.concat([X_A_test.iloc[:replace_count_A], X_B.sample(replace_count_A).reset_index(drop=True)])
y_A_test_replaced = pd.concat([y_A_test.iloc[:replace_count_A], y_B.sample(replace_count_A).reset_index(drop=True)])

# Replace half of the test set in Model B with Set A
replace_count_B = int(len(X_B_test) / 2)
X_B_test_replaced = pd.concat([X_B_test.iloc[:replace_count_B], X_A.sample(replace_count_B).reset_index(drop=True)])
y_B_test_replaced = pd.concat([y_B_test.iloc[:replace_count_B], y_A.sample(replace_count_B).reset_index(drop=True)])

# Model A: Training on Set A, Testing on Set A and modified Set B
model_A_final_marks = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model_A_error_count = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)

model_A_final_marks.fit(X_A_train, y_A_train['Final_Marks'])
model_A_error_count.fit(X_A_train, y_A_train['error_count'])

# Predictions for Set A and modified Set B
predictions_A_A_final_marks = model_A_final_marks.predict(X_A_test)
predictions_A_B_final_marks = model_A_final_marks.predict(X_A_test_replaced)

predictions_A_A_error_count = model_A_error_count.predict(X_A_test)
predictions_A_B_error_count = model_A_error_count.predict(X_A_test_replaced)

# Model B: Training on Set B, Testing on Set B and modified Set A
model_B_final_marks = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model_B_error_count = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)

model_B_final_marks.fit(X_B_train, y_B_train['Final_Marks'])
model_B_error_count.fit(X_B_train, y_B_train['error_count'])

# Predictions for Set A and modified Set B
predictions_B_A_final_marks = model_B_final_marks.predict(X_A_test_replaced)
predictions_B_B_final_marks = model_B_final_marks.predict(X_B_test)

predictions_B_A_error_count = model_B_error_count.predict(X_A_test_replaced)
predictions_B_B_error_count = model_B_error_count.predict(X_B_test)

# Function to calculate evaluation metrics
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    mape = (abs((y_true - y_pred) / y_true)).mean() * 100  # MAPE calculation
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, mape, r2

# Evaluate Model A
mse_A_A_final_marks, rmse_A_A_final_marks, mae_A_A_final_marks, mape_A_A_final_marks, r2_A_A_final_marks = evaluate_model(y_A_test['Final_Marks'], predictions_A_A_final_marks)
mse_A_B_final_marks, rmse_A_B_final_marks, mae_A_B_final_marks, mape_A_B_final_marks, r2_A_B_final_marks = evaluate_model(y_A_test_replaced['Final_Marks'], predictions_A_B_final_marks)


# Evaluate Model B
mse_B_A_final_marks, rmse_B_A_final_marks, mae_B_A_final_marks, mape_B_A_final_marks, r2_B_A_final_marks = evaluate_model(y_A_test_replaced['Final_Marks'], predictions_B_A_final_marks)
mse_B_B_final_marks, rmse_B_B_final_marks, mae_B_B_final_marks, mape_B_B_final_marks, r2_B_B_final_marks = evaluate_model(y_B_test['Final_Marks'], predictions_B_B_final_marks)


# Print the results
print(f"Model A - Set A (Code Only) - final_marks: MSE: {mse_A_A_final_marks}, RMSE: {rmse_A_A_final_marks}, MAE: {mae_A_A_final_marks}, MAPE: {mape_A_A_final_marks}, R2: {r2_A_A_final_marks}")
print(f"Model A - modified Set B (Code with Comments) - final_marks: MSE: {mse_A_B_final_marks}, RMSE: {rmse_A_B_final_marks}, MAE: {mae_A_B_final_marks}, MAPE: {mape_A_B_final_marks}, R2: {r2_A_B_final_marks}")


print(f"Model B - modified Set A (Code Only) - final_marks: MSE: {mse_B_A_final_marks}, RMSE: {rmse_B_A_final_marks}, MAE: {mae_B_A_final_marks}, MAPE: {mape_B_A_final_marks}, R2: {r2_B_A_final_marks}")
print(f"Model B - Set B (Code with Comments) - final_marks: MSE: {mse_B_B_final_marks}, RMSE: {rmse_B_B_final_marks}, MAE: {mae_B_B_final_marks}, MAPE: {mape_B_B_final_marks}, R2: {r2_B_B_final_marks}")



Model A - Set A (Code Only) - final_marks: MSE: 2.7439268580316223, RMSE: 1.6564802618901386, MAE: 1.2699671395338963, MAPE: 37.31424474076323, R2: 0.5424638717355562
Model A - modified Set B (Code with Comments) - final_marks: MSE: 3.905870276660464, RMSE: 1.9763274720198736, MAE: 1.52649738528082, MAPE: 43.65478320892572, R2: 0.29107500318704604
Model A - Set A (Code Only) - error_count: MSE: 1.0432939893987545, RMSE: 1.021417637109696, MAE: 0.6895226619447867, MAPE: inf, R2: 0.6833569317903618
Model A - modified Set B (Code with Comments) - error_count: MSE: 3.2100527727301786, RMSE: 1.7916620140892028, MAE: 1.3845627069488806, MAPE: inf, R2: -0.050324157589750174
Model B - modified Set A (Code Only) - final_marks: MSE: 7.818480277624734, RMSE: 2.7961545518130313, MAE: 2.2545414646156976, MAPE: 74.97293200871323, R2: -0.41907327005143236
Model B - Set B (Code with Comments) - final_marks: MSE: 3.1195481924883244, RMSE: 1.7662242758178601, MAE: 1.3710705926759523, MAPE: 42.3041509600

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split  
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets
cc_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_cc_embed_data.csv")  # Code with Comments
co_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_co_embed_data.csv")  # Code Only

# Set A: Entire Code Only dataset (Set A)
# Set B: Entire Code with Comments dataset (Set B)
set_A = co_data  
set_B = cc_data  

# Splitting the datasets with an 80/20 train-test split for training sets only
X_A = set_A.filter(like='co_embedding_')  # Embeddings for Code Only
y_A = set_A['Final_Marks']  # Label for Code Only

X_B = set_B.filter(like='cc_embedding_')  # Embeddings for Code with Comments
y_B = set_B['Final_Marks']  # Label for Code with Comments

X_A_train, _, y_A_train, _ = train_test_split(X_A, y_A, test_size=0.2, random_state=42)
X_B_train, _, y_B_train, _ = train_test_split(X_B, y_B, test_size=0.2, random_state=42)

# Creating a mixed test set with 50% from Set A and 50% from Set B
test_size = int(0.2 * len(set_A))  # 20% of Set A as reference for mixed test set size
X_test = pd.concat([X_A.sample(test_size, random_state=42).reset_index(drop=True), 
                    X_B.sample(test_size, random_state=42).reset_index(drop=True)])
y_test = pd.concat([y_A.sample(test_size, random_state=42).reset_index(drop=True), 
                    y_B.sample(test_size, random_state=42).reset_index(drop=True)])

# Model A: Training on Set A, Testing on the mixed test set
model_A_final_marks = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model_A_final_marks.fit(X_A_train, y_A_train)

# Predictions for the mixed test set using Model A
predictions_A_test_final_marks = model_A_final_marks.predict(X_test)

# Model B: Training on Set B, Testing on the mixed test set
model_B_final_marks = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model_B_final_marks.fit(X_B_train, y_B_train)

# Predictions for the mixed test set using Model B
predictions_B_test_final_marks = model_B_final_marks.predict(X_test)

# Function to calculate evaluation metrics
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    mape = (abs((y_true - y_pred) / y_true)).mean() * 100  # MAPE calculation
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, mape, r2

# Evaluate Model A on training set
train_pred_A_final_marks = model_A_final_marks.predict(X_A_train)
mse_A_train, rmse_A_train, mae_A_train, mape_A_train, r2_A_train = evaluate_model(y_A_train, train_pred_A_final_marks)

# Evaluate Model A on the mixed test set
mse_A_test, rmse_A_test, mae_A_test, mape_A_test, r2_A_test = evaluate_model(y_test, predictions_A_test_final_marks)

# Evaluate Model B on training set
train_pred_B_final_marks = model_B_final_marks.predict(X_B_train)
mse_B_train, rmse_B_train, mae_B_train, mape_B_train, r2_B_train = evaluate_model(y_B_train, train_pred_B_final_marks)

# Evaluate Model B on the mixed test set
mse_B_test, rmse_B_test, mae_B_test, mape_B_test, r2_B_test = evaluate_model(y_test, predictions_B_test_final_marks)

# Print the results
print(f"Model A - Training Set - final_marks: MSE: {mse_A_train}, RMSE: {rmse_A_train}, MAE: {mae_A_train}, MAPE: {mape_A_train}, R2: {r2_A_train}")
print(f"Model A - Mixed Test Set (50% CO, 50% CC) - final_marks: MSE: {mse_A_test}, RMSE: {rmse_A_test}, MAE: {mae_A_test}, MAPE: {mape_A_test}, R2: {r2_A_test}")

print(f"Model B - Training Set - final_marks: MSE: {mse_B_train}, RMSE: {rmse_B_train}, MAE: {mae_B_train}, MAPE: {mape_B_train}, R2: {r2_B_train}")
print(f"Model B - Mixed Test Set (50% CO, 50% CC) - final_marks: MSE: {mse_B_test}, RMSE: {rmse_B_test}, MAE: {mae_B_test}, MAPE: {mape_B_test}, R2: {r2_B_test}")


Model A - Training Set - final_marks: MSE: 0.16023059789531988, RMSE: 0.40028814358574233, MAE: 0.2699527106958413, MAPE: inf, R2: 0.972542755760629
Model A - Mixed Test Set (50% CO, 50% CC) - final_marks: MSE: 4.365549148612913, RMSE: 2.089389659353399, MAE: 1.6081186246087504, MAPE: 49.68091956264545, R2: 0.2651162268042522
Model B - Training Set - final_marks: MSE: 0.1373945619876308, RMSE: 0.37066772450218916, MAE: 0.23472756310833273, MAPE: inf, R2: 0.9764559572565512
Model B - Mixed Test Set (50% CO, 50% CC) - final_marks: MSE: 4.545405098010447, RMSE: 2.1319955670710122, MAE: 1.6525994898387961, MAPE: 56.227431005939806, R2: 0.23483980241284208
