In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Load the datasets
cc_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_cc_embed_data.csv")  # Code with Comments
co_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_co_embed_data.csv")  # Code Only

# Set A: Entire Code Only dataset (Set A)
# Set B: Entire Code with Comments dataset (Set B)
set_A = co_data  
set_B = cc_data  

# Splitting the datasets with an 80/20 train-test split for training sets only
X_A = set_A.filter(like='co_embedding_')  # Embeddings for Code Only
y_A = set_A['Final_Marks']  # Label for Code Only

X_B = set_B.filter(like='cc_embedding_')  # Embeddings for Code with Comments
y_B = set_B['Final_Marks']  # Label for Code with Comments

X_A_train, _, y_A_train, _ = train_test_split(X_A, y_A, test_size=0.2, random_state=42)
X_B_train, _, y_B_train, _ = train_test_split(X_B, y_B, test_size=0.2, random_state=42)

# Creating a mixed test set with 50% from Set A and 50% from Set B
test_size = int(0.5 * len(set_A))  # 50% of Set A as reference for mixed test set size
X_test = pd.concat([X_A.sample(test_size, random_state=42).reset_index(drop=True), 
                    X_B.sample(test_size, random_state=42).reset_index(drop=True)])
y_test = pd.concat([y_A.sample(test_size, random_state=42).reset_index(drop=True), 
                    y_B.sample(test_size, random_state=42).reset_index(drop=True)])

# Function to calculate evaluation metrics
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    mape = (abs((y_true - y_pred) / y_true)).mean() * 100  # MAPE calculation
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, mape, r2

# Hyperparameter tuning using RandomizedSearchCV
def tune_model(X_train, y_train):
    param_dist = {
        'iterations': [500],  # Fixed number of iterations
        'learning_rate': [0.1],  # Single learning rate
        'depth': [6],  # Single depth
        'l2_leaf_reg': [3],  # Single value for regularization
    }
    
    model = CatBoostRegressor(early_stopping_rounds=50, verbose=0)  # Early stopping
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=10,  # Reduce number of iterations to sample the parameter space
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

# Tune Model A and B
best_model_A, best_params_A = tune_model(X_A_train, y_A_train)
print(f"Best Parameters for Model A: {best_params_A}")

best_model_B, best_params_B = tune_model(X_B_train, y_B_train)
print(f"Best Parameters for Model B: {best_params_B}")

# Predictions and evaluation for Model A
best_predictions_A_train = best_model_A.predict(X_A_train)
mse_A_train, rmse_A_train, mae_A_train, mape_A_train, r2_A_train = evaluate_model(y_A_train, best_predictions_A_train)

best_predictions_A_test = best_model_A.predict(X_test)
mse_A_test, rmse_A_test, mae_A_test, mape_A_test, r2_A_test = evaluate_model(y_test, best_predictions_A_test)

# Predictions and evaluation for Model B
best_predictions_B_train = best_model_B.predict(X_B_train)
mse_B_train, rmse_B_train, mae_B_train, mape_B_train, r2_B_train = evaluate_model(y_B_train, best_predictions_B_train)

best_predictions_B_test = best_model_B.predict(X_test)
mse_B_test, rmse_B_test, mae_B_test, mape_B_test, r2_B_test = evaluate_model(y_test, best_predictions_B_test)

# Print the results
results = [
    ("Model A", "Training Set", mse_A_train, rmse_A_train, mae_A_train, mape_A_train, r2_A_train),
    ("Model A", "Mixed Test Set", mse_A_test, rmse_A_test, mae_A_test, mape_A_test, r2_A_test),
    ("Model B", "Training Set", mse_B_train, rmse_B_train, mae_B_train, mape_B_train, r2_B_train),
    ("Model B", "Mixed Test Set", mse_B_test, rmse_B_test, mae_B_test, mape_B_test, r2_B_test)
]

# Sorting the results by decreasing test MAE
sorted_results = sorted(results, key=lambda x: x[5], reverse=True)

# Displaying sorted results
for model, dataset, mse, rmse, mae, mape, r2 in sorted_results:
    print(f"{model} - {dataset} - final_marks: MSE: {mse}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}, R2: {r2}")




Best Parameters for Model A: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 6}
Best Parameters for Model B: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 6}
Model A - Training Set - final_marks: MSE: 0.16023059789531988, RMSE: 0.40028814358574233, MAE: 0.2699527106958413, MAPE: inf, R2: 0.972542755760629
Model A - Mixed Test Set - final_marks: MSE: 3.641673727818242, RMSE: 1.9083169882957711, MAE: 1.3278014197826713, MAPE: inf, R2: 0.38105352609746934
Model B - Training Set - final_marks: MSE: 0.1373945619876308, RMSE: 0.37066772450218916, MAE: 0.23472756310833273, MAPE: inf, R2: 0.9764559572565512
Model B - Mixed Test Set - final_marks: MSE: 3.6379497268312675, RMSE: 1.9073410095814716, MAE: 1.3196345200877162, MAPE: inf, R2: 0.38168646508431214


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Load the datasets
cc_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_cc_embed_data.csv")  # Code with Comments
co_data = pd.read_csv(r"C:\Users\vvmad\Downloads\5th\ML\PROJECT\datasets\java_co_embed_data.csv")  # Code Only

# Set A: Entire Code Only dataset (Set A)
# Set B: Entire Code with Comments dataset (Set B)
set_A = co_data
set_B = cc_data

# Splitting the datasets with an 80/20 train-test split for training sets only
X_A = set_A.filter(like='co_embedding_')  # Embeddings for Code Only
y_A = set_A['Final_Marks']  # Label for Code Only

X_B = set_B.filter(like='cc_embedding_')  # Embeddings for Code with Comments
y_B = set_B['Final_Marks']  # Label for Code with Comments

X_A_train, _, y_A_train, _ = train_test_split(X_A, y_A, test_size=0.2, random_state=42)
X_B_train, _, y_B_train, _ = train_test_split(X_B, y_B, test_size=0.2, random_state=42)

# Creating a mixed test set with 50% from Set A and 50% from Set B
test_size = int(0.5 * len(set_A))  # 50% of Set A as reference for mixed test set size
X_test = pd.concat([X_A.sample(test_size, random_state=42).reset_index(drop=True),
                    X_B.sample(test_size, random_state=42).reset_index(drop=True)])
y_test = pd.concat([y_A.sample(test_size, random_state=42).reset_index(drop=True),
                    y_B.sample(test_size, random_state=42).reset_index(drop=True)])

# Function to calculate evaluation metrics
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    mape = (np.abs((y_true - y_pred) / y_true)).mean() * 100  # MAPE calculation
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, mape, r2

# Hyperparameter tuning using RandomizedSearchCV with early stopping
def tune_model(X_train, y_train):
    param_dist = {
        'iterations': [500],  # Fixed number of iterations
        'learning_rate': [0.1],  # Single learning rate
        'depth': [6],  # Single depth
        'l2_leaf_reg': [3],  # Single value for regularization
    }
    
    model = CatBoostRegressor(early_stopping_rounds=50, verbose=0)  # Early stopping
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=10,  # Reduce number of iterations to sample the parameter space
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

# Tune Model A and B
best_model_A, best_params_A = tune_model(X_A_train, y_A_train)
print(f"Best Parameters for Model A: {best_params_A}")

best_model_B, best_params_B = tune_model(X_B_train, y_B_train)
print(f"Best Parameters for Model B: {best_params_B}")

# Predictions and evaluation for Model A
best_predictions_A_train = best_model_A.predict(X_A_train)
mse_A_train, rmse_A_train, mae_A_train, mape_A_train, r2_A_train = evaluate_model(y_A_train, best_predictions_A_train)

best_predictions_A_test = best_model_A.predict(X_test)
mse_A_test, rmse_A_test, mae_A_test, mape_A_test, r2_A_test = evaluate_model(y_test, best_predictions_A_test)

# Predictions and evaluation for Model B
best_predictions_B_train = best_model_B.predict(X_B_train)
mse_B_train, rmse_B_train, mae_B_train, mape_B_train, r2_B_train = evaluate_model(y_B_train, best_predictions_B_train)

best_predictions_B_test = best_model_B.predict(X_test)
mse_B_test, rmse_B_test, mae_B_test, mape_B_test, r2_B_test = evaluate_model(y_test, best_predictions_B_test)

# Print the results
results = [
    ("Model A", "Training Set", mse_A_train, rmse_A_train, mae_A_train, mape_A_train, r2_A_train),
    ("Model A", "Mixed Test Set", mse_A_test, rmse_A_test, mae_A_test, mape_A_test, r2_A_test),
    ("Model B", "Training Set", mse_B_train, rmse_B_train, mae_B_train, mape_B_train, r2_B_train),
    ("Model B", "Mixed Test Set", mse_B_test, rmse_B_test, mae_B_test, mape_B_test, r2_B_test)
]

# Sorting the results by decreasing test MAE
sorted_results = sorted(results, key=lambda x: x[4], reverse=True)

# Displaying sorted results
for model, dataset, mse, rmse, mae, mape, r2 in sorted_results:
    print(f"{model} - {dataset} - Final_Marks: MSE: {mse}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}, R2: {r2}")




Best Parameters for Model A: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 6}
Best Parameters for Model B: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 6}
Model A - Mixed Test Set - Final_Marks: MSE: 3.641673727818242, RMSE: 1.9083169882957711, MAE: 1.3278014197826713, MAPE: inf, R2: 0.38105352609746934
Model B - Mixed Test Set - Final_Marks: MSE: 3.6379497268312675, RMSE: 1.9073410095814716, MAE: 1.3196345200877162, MAPE: inf, R2: 0.38168646508431214
Model A - Training Set - Final_Marks: MSE: 0.16023059789531988, RMSE: 0.40028814358574233, MAE: 0.2699527106958413, MAPE: inf, R2: 0.972542755760629
Model B - Training Set - Final_Marks: MSE: 0.1373945619876308, RMSE: 0.37066772450218916, MAE: 0.23472756310833273, MAPE: inf, R2: 0.9764559572565512
