In [None]:
import pandas as pd
import shap
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Input CSVs (example locations -- point these at your own copies).
# "co" = Code Only, "cc" = Code with Comments.
co_csv_path = r"C:\Users\prabi\OneDrive\Desktop\V sem\ML\Datasets\java_co_embed_data.csv"
cc_csv_path = r"C:\Users\prabi\OneDrive\Desktop\V sem\ML\Datasets\java_cc_embed_data.csv"

co_data = pd.read_csv(co_csv_path)
cc_data = pd.read_csv(cc_csv_path)

# Strip surrounding whitespace and lowercase every header so that the prefix
# matching and the target lookup below are not tripped up by stray spaces or
# inconsistent capitalisation in the files.
for frame in (co_data, cc_data):
    frame.columns = frame.columns.str.strip().str.lower()

# Model inputs are the columns carrying the dataset-specific embedding prefix.
co_feature_cols = [name for name in co_data.columns if name.startswith('co_embedding_')]
cc_feature_cols = [name for name in cc_data.columns if name.startswith('cc_embedding_')]

# Regression target; written in lowercase because the headers were lowercased above.
target_col = "final_marks"

# Debugging aid: show the normalised headers of both datasets and the feature
# columns that were selected from them.
print("Code Only Dataset Columns:", co_data.columns.tolist())
print("Code with Comments Dataset Columns:", cc_data.columns.tolist())
print("Code Only Feature Columns:", co_feature_cols)
print("Code with Comments Feature Columns:", cc_feature_cols)

# Check for missing columns.
# The feature lists are derived from each frame's own columns, so comparing
# them against those same columns can never report anything. Compare the
# columns the rest of the script actually needs (selected features plus the
# target) against what each frame provides instead.
missing_in_co = (set(co_feature_cols) | {target_col}) - set(co_data.columns)
missing_in_cc = (set(cc_feature_cols) | {target_col}) - set(cc_data.columns)
print("Missing columns in Code Only Dataset:", missing_in_co)
print("Missing columns in Code with Comments Dataset:", missing_in_cc)

# Fail here with a readable message rather than later inside pandas/CatBoost.
if not co_feature_cols:
    raise ValueError("No 'co_embedding_*' feature columns found in the Code Only dataset")
if not cc_feature_cols:
    raise ValueError("No 'cc_embedding_*' feature columns found in the Code with Comments dataset")
if missing_in_co or missing_in_cc:
    raise KeyError(
        f"Required columns are missing: code only={sorted(missing_in_co)}, "
        f"code with comments={sorted(missing_in_cc)}"
    )

# Build the feature matrix and target vector for each dataset.
X_co, y_co = co_data[co_feature_cols], co_data[target_col]
X_cc, y_cc = cc_data[cc_feature_cols], cc_data[target_col]

# Hold out 20% of the rows for evaluation. Both splits use the same options,
# including the fixed seed, so reruns produce the same partition.
split_options = {"test_size": 0.2, "random_state": 42}
X_train_co, X_test_co, y_train_co, y_test_co = train_test_split(X_co, y_co, **split_options)
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_cc, y_cc, **split_options)

# Both regressors are configured identically so that their results can be
# compared on equal terms; verbose=0 silences the per-iteration training log.
catboost_settings = dict(iterations=500, learning_rate=0.1, depth=6, verbose=0)

# CatBoost model fitted on the Code Only training split.
model_co = CatBoostRegressor(**catboost_settings)
model_co.fit(X_train_co, y_train_co)

# CatBoost model fitted on the Code with Comments training split.
model_cc = CatBoostRegressor(**catboost_settings)
model_cc.fit(X_train_cc, y_train_cc)

# Score each model on its own held-out split and report the mean squared error.
y_pred_co = model_co.predict(X_test_co)
mse_co = mean_squared_error(y_test_co, y_pred_co)
print("MSE (Code Only):", mse_co)

y_pred_cc = model_cc.predict(X_test_cc)
mse_cc = mean_squared_error(y_test_cc, y_pred_cc)
print("MSE (Code with Comments):", mse_cc)

# One tree explainer per fitted model.
explainer_co = shap.TreeExplainer(model_co)
explainer_cc = shap.TreeExplainer(model_cc)

# SHAP values are computed on the held-out rows of the matching dataset.
shap_values_co = explainer_co.shap_values(X_test_co)
shap_values_cc = explainer_cc.shap_values(X_test_cc)

# Row of the test split explained by the force plots, and the feature shown in
# each dependence plot. Replace the feature names with any column present in
# the corresponding test frame.
sample_idx = 0
co_dependence_feature = "co_embedding_0"
cc_dependence_feature = "cc_embedding_69"

# Summary plots: Code Only first, then Code with Comments.
for heading, values, features in (
    ("SHAP Summary Plot: Code Only", shap_values_co, X_test_co),
    ("SHAP Summary Plot: Code with Comments", shap_values_cc, X_test_cc),
):
    print(heading)
    shap.summary_plot(values, features)

# Code Only: force plot for the chosen row, then the dependence plot.
print("SHAP Force Plot: Code Only")
shap.force_plot(
    explainer_co.expected_value,
    shap_values_co[sample_idx],
    X_test_co.iloc[sample_idx],
    matplotlib=True,
)

print("SHAP Dependence Plot: Code Only")
shap.dependence_plot(co_dependence_feature, shap_values_co, X_test_co)

# Code with Comments: force plot for the chosen row, then the dependence plot.
print("SHAP Force Plot: Code with Comments")
shap.force_plot(
    explainer_cc.expected_value,
    shap_values_cc[sample_idx],
    X_test_cc.iloc[sample_idx],
    matplotlib=True,
)

print("SHAP Dependence Plot: Code with Comments")
shap.dependence_plot(cc_dependence_feature, shap_values_cc, X_test_cc)


Code Only Dataset Columns: ['co_embedding_0', 'co_embedding_1', 'co_embedding_2', 'co_embedding_3', 'co_embedding_4', 'co_embedding_5', 'co_embedding_6', 'co_embedding_7', 'co_embedding_8', 'co_embedding_9', 'co_embedding_10', 'co_embedding_11', 'co_embedding_12', 'co_embedding_13', 'co_embedding_14', 'co_embedding_15', 'co_embedding_16', 'co_embedding_17', 'co_embedding_18', 'co_embedding_19', 'co_embedding_20', 'co_embedding_21', 'co_embedding_22', 'co_embedding_23', 'co_embedding_24', 'co_embedding_25', 'co_embedding_26', 'co_embedding_27', 'co_embedding_28', 'co_embedding_29', 'co_embedding_30', 'co_embedding_31', 'co_embedding_32', 'co_embedding_33', 'co_embedding_34', 'co_embedding_35', 'co_embedding_36', 'co_embedding_37', 'co_embedding_38', 'co_embedding_39', 'co_embedding_40', 'co_embedding_41', 'co_embedding_42', 'co_embedding_43', 'co_embedding_44', 'co_embedding_45', 'co_embedding_46', 'co_embedding_47', 'co_embedding_48', 'co_embedding_49', 'co_embedding_50', 'co_embedding