In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from itertools import product

# Load the dataset
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and split it into model inputs.

    Returns:
        A tuple ``(keywords, custom_data_structures, final_marks)``: the first
        two are DataFrames holding the 20 ``Keywords_Vector_*`` and the 20
        ``Custom_Data_Structures_Vector_*`` columns, the last is the
        ``Final_Marks`` column.
    """
    sheet = pd.read_excel(file_path)
    keyword_columns = [f'Keywords_Vector_{i}' for i in range(20)]
    structure_columns = [f'Custom_Data_Structures_Vector_{i}' for i in range(20)]
    return sheet[keyword_columns], sheet[structure_columns], sheet["Final_Marks"]

# Function to pad vectors with zeros to match length
def pad_with_zeros(vector, max_length):
    """Right-pad ``vector`` with zero columns until it is ``max_length`` wide.

    The input is first promoted to at least two dimensions. An input that
    already has ``max_length`` columns (or more) is returned unpadded.
    """
    matrix = np.atleast_2d(vector)
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    missing = max_length - n_cols
    if not missing > 0:
        return matrix
    filler = np.zeros((n_rows, missing))
    return np.concatenate((matrix, filler), axis=1)

# Reduction and combination strategies
def combine_vectors(keywords, custom_data_structures, weights, fusion):
    """Scale the two feature blocks and fuse them into one feature array.

    Args:
        keywords: Keyword feature vectors.
        custom_data_structures: Custom-data-structure feature vectors.
        weights: Pair of scale factors. Note the order: ``weights[0]`` scales
            ``custom_data_structures`` and ``weights[1]`` scales ``keywords``.
        fusion: ``"add"`` sums the weighted blocks element-wise (the narrower
            block is zero-padded on the right first); ``"concatenate"`` places
            the weighted keywords and weighted custom data structures side by
            side, in that order.

    Returns:
        The fused features as a NumPy array.

    Raises:
        ValueError: If ``fusion`` is neither ``"add"`` nor ``"concatenate"``.
    """
    weighted_keywords = weights[1] * keywords
    weighted_custom_data_structures = weights[0] * custom_data_structures

    if fusion == "add":
        # pad_with_zeros also converts both blocks to plain arrays, so the sum
        # below is positional rather than aligned on DataFrame column labels.
        max_length = max(weighted_keywords.shape[1], weighted_custom_data_structures.shape[1])
        weighted_keywords = pad_with_zeros(weighted_keywords, max_length)
        weighted_custom_data_structures = pad_with_zeros(weighted_custom_data_structures, max_length)
        combined_vectors = weighted_keywords + weighted_custom_data_structures
    elif fusion == "concatenate":
        combined_vectors = np.hstack((weighted_keywords, weighted_custom_data_structures))
    else:
        # Previously an unknown strategy fell through both branches and the
        # return statement crashed with an unhelpful UnboundLocalError.
        raise ValueError(f"Unknown fusion {fusion!r}; expected 'add' or 'concatenate'.")
    return combined_vectors

# Candidate values for each CatBoost hyperparameter
iterations_list = [1500, 1000, 2000]
learning_rate_list = [0.01, 0.05]
l2_leaf_reg_list = [5, 7]
early_stopping_rounds_list = [15, 30]

# Full Cartesian grid of (iterations, learning_rate, l2_leaf_reg,
# early_stopping_rounds) tuples; the right-most parameter varies fastest.
param_combinations = [
    (n_iter, rate, leaf_reg, patience)
    for n_iter in iterations_list
    for rate in learning_rate_list
    for leaf_reg in l2_leaf_reg_list
    for patience in early_stopping_rounds_list
]

# Read the feature vectors and target marks from the workbook
file_path = "pca.xlsx"
keywords, custom_data_structures, final_marks = load_data(file_path)

# Weight pairs to sweep; combine_vectors applies weights[0] to the
# custom-data-structure vectors and weights[1] to the keyword vectors.
weights_options = [
    (1, 1),
    (1, 9), (9, 1),
    (2, 8), (8, 2),
    (3, 7), (7, 3),
    (4, 6), (6, 4),
    (0.1, 0.9), (0.9, 0.1),
    (0.2, 0.8), (0.8, 0.2),
    (0.3, 0.7), (0.7, 0.3),
    (0.4, 0.6), (0.6, 0.4),
]
# Fusion strategies understood by combine_vectors
fusions = ["add", "concatenate"]

# One dict per trained model is appended here; converted to a DataFrame later
results = []

# Evaluate every (weights, fusion, hyperparameter) combination
def _split_metrics(split_name, y_true, y_pred):
    """Return MAE, MSE, RMSE, R2 and MAPE for one split, keyed '<split_name> <metric>'."""
    mse = mean_squared_error(y_true, y_pred)
    return {
        f"{split_name} MAE": mean_absolute_error(y_true, y_pred),
        f"{split_name} MSE": mse,
        f"{split_name} RMSE": np.sqrt(mse),
        f"{split_name} R2": r2_score(y_true, y_pred),
        f"{split_name} MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }


for weights in weights_options:
    for fusion in fusions:
        combined_vectors = combine_vectors(keywords, custom_data_structures, weights, fusion)
        X_train, X_test, y_train, y_test = train_test_split(
            combined_vectors, final_marks, test_size=0.2, random_state=42, stratify=final_marks
        )
        # CatBoost can only stop early when fit() receives an eval_set; without
        # one, sweeping early_stopping_rounds just retrains the same model.
        # Hold out part of the training data for that purpose so the test set
        # is never consulted during fitting.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        for params in param_combinations:
            iterations, learning_rate, l2_leaf_reg, early_stopping_rounds = params

            # Print parameters being tested
            print(f"Testing CatBoost with iterations={iterations}, learning_rate={learning_rate}, l2_leaf_reg={l2_leaf_reg}, early_stopping_rounds={early_stopping_rounds}")

            model = CatBoostRegressor(
                iterations=iterations,
                learning_rate=learning_rate,
                l2_leaf_reg=l2_leaf_reg,
                early_stopping_rounds=early_stopping_rounds,
                random_state=43,
                verbose=0
            )

            model.fit(X_fit, y_fit, eval_set=(X_val, y_val))

            # Predictions are rounded to the nearest integer before scoring.
            # "Train" metrics cover only the rows the model was fitted on.
            train_scores = _split_metrics("Train", y_fit, np.round(model.predict(X_fit)))
            test_scores = _split_metrics("Test", y_test, np.round(model.predict(X_test)))

            # Store results (key order defines the column order of the report)
            results.append({
                "Model": "CatBoost Regressor",
                "Weights": weights,
                "Fusion": fusion,
                "Iterations": iterations,
                "Learning Rate": learning_rate,
                "L2 Leaf Reg": l2_leaf_reg,
                "Early Stopping Rounds": early_stopping_rounds,
                **train_scores,
                **test_scores,
            })

# Build the report table, ordered by fusion method (ascending) and then by
# Test R2 (descending) so the best configuration of each fusion comes first
results_df = pd.DataFrame(results).sort_values(
    by=["Fusion", "Test R2"],
    ascending=[True, False],
)
# Write the report to an Excel workbook without the index column
results_df.to_excel("catboostpca2.xlsx", index=False)

Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=30
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=7, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=7, early_stopping_rounds=30
Testing CatBoost with iterations=1500, learning_rate=0.05, l2_leaf_reg=5, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.05, l2_leaf_reg=5, early_stopping_rounds=30
Testing CatBoost with iterations=1500, learning_rate=0.05, l2_leaf_reg=7, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.05, l2_leaf_reg=7, early_stopping_rounds=30
Testing CatBoost with iterations=1000, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=15
Testing CatBoost with iterations=1000, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=30
Testing Ca

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from itertools import product

# Load the dataset
def load_data(file_path):
    """Load the workbook at ``file_path`` and return the feature blocks and target.

    Returns:
        ``(keywords, custom_data_structures, final_marks)`` where the first two
        are the ``Keywords_Vector_0..19`` and
        ``Custom_Data_Structures_Vector_0..19`` column groups and the last is
        the ``Final_Marks`` column.
    """
    table = pd.read_excel(file_path)
    keyword_cols, structure_cols = [], []
    for i in range(20):
        keyword_cols.append(f'Keywords_Vector_{i}')
        structure_cols.append(f'Custom_Data_Structures_Vector_{i}')
    target = table["Final_Marks"]
    return table[keyword_cols], table[structure_cols], target

# Function to pad vectors with zeros to match length
def pad_with_zeros(vector, max_length):
    """Return ``vector`` as an at-least-2-D array with at least ``max_length`` columns.

    Missing columns are appended on the right and filled with zeros; inputs
    that are already wide enough are not padded.
    """
    arr = np.atleast_2d(vector)
    rows, width = arr.shape[:2]
    shortfall = max_length - width
    if shortfall > 0:
        arr = np.hstack((arr, np.zeros((rows, shortfall))))
    return arr

# Reduction and combination strategies
def combine_vectors(keywords, custom_data_structures, weights, fusion):
    """Weight the keyword and custom-data-structure features and fuse them.

    Args:
        keywords: Keyword feature vectors.
        custom_data_structures: Custom-data-structure feature vectors.
        weights: Pair of scale factors; ``weights[0]`` is applied to
            ``custom_data_structures`` and ``weights[1]`` to ``keywords``.
        fusion: ``"add"`` for an element-wise sum (after zero-padding the
            narrower block on the right) or ``"concatenate"`` to stack the
            weighted keywords and weighted custom data structures side by side.

    Returns:
        The fused features as a NumPy array.

    Raises:
        ValueError: If ``fusion`` is not a supported strategy.
    """
    # Reject unsupported strategies up front; the old code fell through both
    # branches and died with UnboundLocalError at the return statement.
    if fusion not in ("add", "concatenate"):
        raise ValueError(f"Unknown fusion {fusion!r}; expected 'add' or 'concatenate'.")

    weighted_keywords = weights[1] * keywords
    weighted_custom_data_structures = weights[0] * custom_data_structures

    if fusion == "concatenate":
        return np.hstack((weighted_keywords, weighted_custom_data_structures))

    # "add": bring both blocks to a common width. pad_with_zeros returns plain
    # arrays, so the sum is positional, not aligned on column labels.
    max_length = max(weighted_keywords.shape[1], weighted_custom_data_structures.shape[1])
    weighted_keywords = pad_with_zeros(weighted_keywords, max_length)
    weighted_custom_data_structures = pad_with_zeros(weighted_custom_data_structures, max_length)
    return weighted_keywords + weighted_custom_data_structures

# Values to try for each CatBoost hyperparameter
iterations_list = [1500, 1000, 2000]
learning_rate_list = [0.01, 0.05]
l2_leaf_reg_list = [3, 5, 7]
early_stopping_rounds_list = [15, 20, 30]

# Every (iterations, learning_rate, l2_leaf_reg, early_stopping_rounds)
# combination, materialised so it can be iterated once per feature set
param_combinations = [
    *product(
        iterations_list,
        learning_rate_list,
        l2_leaf_reg_list,
        early_stopping_rounds_list,
    )
]

# Read the feature vectors and target marks from the workbook
file_path = "IG.xlsx"
keywords, custom_data_structures, final_marks = load_data(file_path)

# Weight pairs to sweep; combine_vectors applies weights[0] to the
# custom-data-structure vectors and weights[1] to the keyword vectors.
weights_options = [
    (1, 1),
    (0.4, 0.6),
    (0.1, 0.9),
]
# Fusion strategies understood by combine_vectors
fusions = ["add", "concatenate"]

# Accumulates one result dict per trained model
results = []

# Evaluate every (weights, fusion, hyperparameter) combination
def _score_split(label, y_true, y_pred):
    """Compute MAE, MSE, RMSE, R2 and MAPE for one split, keyed '<label> <metric>'."""
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        f"{label} MAE": mean_absolute_error(y_true, y_pred),
        f"{label} MSE": squared_error,
        f"{label} RMSE": np.sqrt(squared_error),
        f"{label} R2": r2_score(y_true, y_pred),
        f"{label} MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }


for weights in weights_options:
    for fusion in fusions:
        combined_vectors = combine_vectors(keywords, custom_data_structures, weights, fusion)
        X_train, X_test, y_train, y_test = train_test_split(
            combined_vectors, final_marks, test_size=0.2, random_state=42, stratify=final_marks
        )
        # early_stopping_rounds is inert unless fit() gets an eval_set, which
        # made the sweep over it produce duplicate models. Split a validation
        # set off the training data (not the test data) to drive early stopping.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        for params in param_combinations:
            iterations, learning_rate, l2_leaf_reg, early_stopping_rounds = params

            # Print parameters being tested
            print(f"Testing CatBoost with iterations={iterations}, learning_rate={learning_rate}, l2_leaf_reg={l2_leaf_reg}, early_stopping_rounds={early_stopping_rounds}")

            model = CatBoostRegressor(
                iterations=iterations,
                learning_rate=learning_rate,
                l2_leaf_reg=l2_leaf_reg,
                early_stopping_rounds=early_stopping_rounds,
                random_state=43,
                verbose=0
            )
            model.fit(X_fit, y_fit, eval_set=(X_val, y_val))

            # Start the row with the configuration, then append the metrics so
            # the report columns keep their established order.
            row = {
                "Model": "CatBoost Regressor",
                "Weights": weights,
                "Fusion": fusion,
                "Iterations": iterations,
                "Learning Rate": learning_rate,
                "L2 Leaf Reg": l2_leaf_reg,
                "Early Stopping Rounds": early_stopping_rounds,
            }
            # Predictions are rounded to the nearest integer before scoring;
            # "Train" metrics cover only the rows the model was fitted on.
            row.update(_score_split("Train", y_fit, np.round(model.predict(X_fit))))
            row.update(_score_split("Test", y_test, np.round(model.predict(X_test))))
            results.append(row)

# Turn the collected result dicts into a table
results_df = pd.DataFrame(results)

# Order rows by fusion method (ascending), then by Test R2 (descending)
sort_keys = ["Fusion", "Test R2"]
results_df = results_df.sort_values(by=sort_keys, ascending=[True, False])
# Write the report to an Excel workbook without the index column
results_df.to_excel("catboostactualIG.xlsx", index=False)

Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=3, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=3, early_stopping_rounds=20
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=3, early_stopping_rounds=30
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=20
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=5, early_stopping_rounds=30
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=7, early_stopping_rounds=15
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=7, early_stopping_rounds=20
Testing CatBoost with iterations=1500, learning_rate=0.01, l2_leaf_reg=7, early_stopping_rounds=30
Testing CatBoost with iterations=1500, learning_rate=0.05, l2_leaf_reg=3, early_stopping_rounds=15
Testing Ca

For PCA reduction

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor

# Load the dataset
def load_data(file_path):
    """Read ``file_path`` (an Excel workbook) and pick out the model inputs.

    Returns:
        A 3-tuple: the ``Keywords_Vector_0`` … ``Keywords_Vector_19`` columns,
        the ``Custom_Data_Structures_Vector_0`` …
        ``Custom_Data_Structures_Vector_19`` columns, and the ``Final_Marks``
        column.
    """
    workbook = pd.read_excel(file_path)
    vector_ids = range(20)
    keywords = workbook[[f'Keywords_Vector_{i}' for i in vector_ids]]
    custom_data_structures = workbook[[f'Custom_Data_Structures_Vector_{i}' for i in vector_ids]]
    return keywords, custom_data_structures, workbook["Final_Marks"]

# Function to pad vectors with zeros to match length
def pad_with_zeros(vector, max_length):
    """Zero-pad ``vector`` on the right so it has at least ``max_length`` columns.

    The input is promoted to at least two dimensions before its width is
    checked; inputs that are already wide enough come back unpadded.
    """
    grid = np.atleast_2d(vector)
    if not grid.shape[1] < max_length:
        return grid
    extra_columns = max_length - grid.shape[1]
    zeros = np.zeros((grid.shape[0], extra_columns))
    return np.hstack((grid, zeros))

# Reduction and combination strategies
def combine_vectors(keywords, custom_data_structures, weights, fusion):
    """Scale the two feature blocks and fuse them into one feature array.

    Args:
        keywords: Keyword feature vectors.
        custom_data_structures: Custom-data-structure feature vectors.
        weights: Pair of scale factors. Note the order: ``weights[0]`` scales
            ``custom_data_structures`` and ``weights[1]`` scales ``keywords``.
        fusion: ``"add"`` sums the weighted blocks element-wise (the narrower
            block is zero-padded on the right first); ``"concatenate"`` places
            the weighted keywords and weighted custom data structures side by
            side, in that order.

    Returns:
        The fused features as a NumPy array.

    Raises:
        ValueError: If ``fusion`` is neither ``"add"`` nor ``"concatenate"``.
    """
    weighted_keywords = weights[1] * keywords
    weighted_custom_data_structures = weights[0] * custom_data_structures

    if fusion == "add":
        # pad_with_zeros also converts both blocks to plain arrays, so the sum
        # below is positional rather than aligned on DataFrame column labels.
        max_length = max(weighted_keywords.shape[1], weighted_custom_data_structures.shape[1])
        weighted_keywords = pad_with_zeros(weighted_keywords, max_length)
        weighted_custom_data_structures = pad_with_zeros(weighted_custom_data_structures, max_length)
        combined_vectors = weighted_keywords + weighted_custom_data_structures
    elif fusion == "concatenate":
        combined_vectors = np.hstack((weighted_keywords, weighted_custom_data_structures))
    else:
        # Previously an unknown strategy fell through both branches and the
        # return statement crashed with an unhelpful UnboundLocalError.
        raise ValueError(f"Unknown fusion {fusion!r}; expected 'add' or 'concatenate'.")
    return combined_vectors

# Hyperparameters for each base learner, passed as keyword arguments when the
# estimators are constructed.
# NOTE(review): early_stopping_rounds presumably has no effect unless fit()
# is given an eval_set, which this cell never does — confirm.
catboost_params = dict(
    verbose=0,
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5,
    early_stopping_rounds=15,
    random_state=43,
)

xgboost_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=2,
    subsample=0.6,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=43,
)

random_forest_params = dict(
    n_estimators=100,
    max_depth=6,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=43,
)

bagging_params = dict(
    n_estimators=100,
    max_samples=0.8,
    max_features=1.0,
    random_state=43,
)

# Base learners, keyed by the display name used in the results table
models = {}
models["CatBoost Regressor"] = CatBoostRegressor(**catboost_params)
models["XGBoost Regressor"] = XGBRegressor(**xgboost_params)
models["Random Forest Regressor"] = RandomForestRegressor(**random_forest_params)
models["Bagging Regressor"] = BaggingRegressor(**bagging_params)

# Stacking ensemble over the four base learners with a small CatBoost
# meta-learner. The (name, estimator) pairs must be captured before the
# ensemble itself is added to `models`.
estimators = list(models.items())
meta_learner = CatBoostRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, verbose=0)
stacking_model = StackingRegressor(estimators=estimators, final_estimator=meta_learner)
models["Stacking Regressor"] = stacking_model

# Read the feature vectors and target marks from the workbook
file_path = "/content/pca.xlsx"
keywords, custom_data_structures, final_marks = load_data(file_path)

# Weight pairs to sweep; combine_vectors applies weights[0] to the
# custom-data-structure vectors and weights[1] to the keyword vectors.
weights_options = [
    (1, 1),
    (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1),
    (0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.1, 0.9),
    (2, 1), (3, 1), (4, 1), (5, 1),
]
# Fusion strategies understood by combine_vectors
fusions = ["add", "concatenate"]

# One dict per (weights, fusion, model) run is appended here
results = []

# Fit and score every model on every (weights, fusion) feature set
def _regression_report(prefix, y_true, y_pred):
    """Return MAE, MSE, RMSE, R2 and MAPE for one split, keyed '<prefix> <metric>'."""
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return {
        f"{prefix} MAE": mae,
        f"{prefix} MSE": mse,
        f"{prefix} RMSE": np.sqrt(mse),
        f"{prefix} R2": r2_score(y_true, y_pred),
        f"{prefix} MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }


for weights in weights_options:
    for fusion in fusions:
        combined_vectors = combine_vectors(keywords, custom_data_structures, weights, fusion)
        X_train, X_test, y_train, y_test = train_test_split(
            combined_vectors,
            final_marks,
            test_size=0.2,
            random_state=42,
            stratify=final_marks,
        )

        for name, model in models.items():
            model.fit(X_train, y_train)

            # Predictions are rounded to the nearest integer before scoring
            train_report = _regression_report("Train", y_train, np.round(model.predict(X_train)))
            test_report = _regression_report("Test", y_test, np.round(model.predict(X_test)))

            # Key order defines the column order of the saved report
            results.append({
                "Model": name,
                "Weights": weights,
                "Fusion": fusion,
                **train_report,
                **test_report,
            })

# Build the report table, ordered by fusion method (ascending) and then by
# Test R2 (descending) so the best model of each fusion comes first
results_df = pd.DataFrame(results).sort_values(
    by=["Fusion", "Test R2"],
    ascending=[True, False],
)

# Write the report to an Excel workbook without the index column
results_df.to_excel("pca_results.xlsx", index=False)
print("Results saved to pca_results.xlsx.")


Results saved to pca_results.xlsx.


For CHI

In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor

# Load the dataset
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and split it into model inputs.

    Returns:
        A tuple ``(keywords, custom_data_structures, final_marks)``: the first
        two are DataFrames holding the 20 ``Keywords_Vector_*`` and the 20
        ``Custom_Data_Structures_Vector_*`` columns, the last is the
        ``Final_Marks`` column.
    """
    data = pd.read_excel(file_path)
    selected = (
        data[[f'Keywords_Vector_{i}' for i in range(20)]],
        data[[f'Custom_Data_Structures_Vector_{i}' for i in range(20)]],
        data["Final_Marks"],
    )
    return selected

# Function to pad vectors with zeros to match length
def pad_with_zeros(vector, max_length):
    """Right-pad ``vector`` with zero columns until it is ``max_length`` wide.

    The input is first promoted to at least two dimensions. An input that
    already has ``max_length`` columns (or more) is returned unpadded.
    """
    table = np.atleast_2d(vector)
    height, width = table.shape[0], table.shape[1]
    needs_padding = width < max_length
    return (
        np.concatenate((table, np.zeros((height, max_length - width))), axis=1)
        if needs_padding
        else table
    )

# Reduction and combination strategies
def combine_vectors(keywords, custom_data_structures, weights, fusion):
    """Weight the keyword and custom-data-structure features and fuse them.

    Args:
        keywords: Keyword feature vectors.
        custom_data_structures: Custom-data-structure feature vectors.
        weights: Pair of scale factors; ``weights[0]`` is applied to
            ``custom_data_structures`` and ``weights[1]`` to ``keywords``.
        fusion: ``"add"`` for an element-wise sum (after zero-padding the
            narrower block on the right) or ``"concatenate"`` to stack the
            weighted keywords and weighted custom data structures side by side.

    Returns:
        The fused features as a NumPy array.

    Raises:
        ValueError: If ``fusion`` is not a supported strategy.
    """
    # Reject unsupported strategies up front; the old code fell through both
    # branches and died with UnboundLocalError at the return statement.
    if fusion not in ("add", "concatenate"):
        raise ValueError(f"Unknown fusion {fusion!r}; expected 'add' or 'concatenate'.")

    weighted_keywords = weights[1] * keywords
    weighted_custom_data_structures = weights[0] * custom_data_structures

    if fusion == "concatenate":
        return np.hstack((weighted_keywords, weighted_custom_data_structures))

    # "add": bring both blocks to a common width. pad_with_zeros returns plain
    # arrays, so the sum is positional, not aligned on column labels.
    max_length = max(weighted_keywords.shape[1], weighted_custom_data_structures.shape[1])
    weighted_keywords = pad_with_zeros(weighted_keywords, max_length)
    weighted_custom_data_structures = pad_with_zeros(weighted_custom_data_structures, max_length)
    return weighted_keywords + weighted_custom_data_structures

# Keyword arguments used to construct each base learner.
# NOTE(review): early_stopping_rounds presumably has no effect unless fit()
# is given an eval_set, which this cell never does — confirm.
catboost_params = {
    "verbose": 0,
    "iterations": 1000,
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 5,
    "early_stopping_rounds": 15,
    "random_state": 43,
}

xgboost_params = {
    "n_estimators": 500,
    "learning_rate": 0.01,
    "max_depth": 5,
    "min_child_weight": 2,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1,
    "random_state": 43,
}

random_forest_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 5,
    "min_samples_leaf": 3,
    "random_state": 43,
}

bagging_params = {
    "n_estimators": 100,
    "max_samples": 0.8,
    "max_features": 1.0,
    "random_state": 43,
}

# Base learners, keyed by the display name used in the results table
catboost_model = CatBoostRegressor(**catboost_params)
xgboost_model = XGBRegressor(**xgboost_params)
random_forest_model = RandomForestRegressor(**random_forest_params)
bagging_model = BaggingRegressor(**bagging_params)
models = {
    "CatBoost Regressor": catboost_model,
    "XGBoost Regressor": xgboost_model,
    "Random Forest Regressor": random_forest_model,
    "Bagging Regressor": bagging_model,
}

# Stacking ensemble: the four base learners feed a small CatBoost
# meta-learner. Snapshot the (name, estimator) pairs before the ensemble is
# registered in `models`.
estimators = list(models.items())
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=CatBoostRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        verbose=0,
    ),
)
models["Stacking Regressor"] = stacking_model

# Read the feature vectors and target marks from the workbook
file_path = "/content/chi.xlsx"
keywords, custom_data_structures, final_marks = load_data(file_path)

# Weight pairs to sweep; combine_vectors applies weights[0] to the
# custom-data-structure vectors and weights[1] to the keyword vectors.
weights_options = [
    (1, 1),
    (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1),
    (0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.1, 0.9),
    (2, 1), (3, 1), (4, 1), (5, 1),
]
# Fusion strategies understood by combine_vectors
fusions = ["add", "concatenate"]

# One dict per (weights, fusion, model) run is appended here
results = []

# Fit and score every model on every (weights, fusion) feature set
for weights in weights_options:
    for fusion in fusions:
        combined_vectors = combine_vectors(keywords, custom_data_structures, weights, fusion)
        split = train_test_split(
            combined_vectors, final_marks, test_size=0.2, random_state=42, stratify=final_marks
        )
        X_train, X_test, y_train, y_test = split

        for name, model in models.items():
            model.fit(X_train, y_train)

            # Configuration columns first, then Train metrics, then Test
            # metrics: insertion order is the column order of the report.
            row = {"Model": name, "Weights": weights, "Fusion": fusion}
            for part, X_part, y_part in (("Train", X_train, y_train), ("Test", X_test, y_test)):
                # Predictions are rounded to the nearest integer before scoring
                y_pred = np.round(model.predict(X_part))
                mse = mean_squared_error(y_part, y_pred)
                row[part + " MAE"] = mean_absolute_error(y_part, y_pred)
                row[part + " MSE"] = mse
                row[part + " RMSE"] = np.sqrt(mse)
                row[part + " R2"] = r2_score(y_part, y_pred)
                row[part + " MAPE"] = mean_absolute_percentage_error(y_part, y_pred)
            results.append(row)

# Turn the collected result dicts into a table
results_df = pd.DataFrame(results)

# Order rows by fusion method (ascending), then by Test R2 (descending)
results_df = results_df.sort_values(by=["Fusion", "Test R2"], ascending=[True, False])

# Write the report to an Excel workbook without the index column
results_df.to_excel("chi_results.xlsx", index=False)
print("Results saved to chi_results.xlsx.")


Results saved to chi_results.xlsx.


For Information Gain

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor

# Load the dataset
def load_data(file_path):
    """Load the workbook at ``file_path`` and return the feature blocks and target.

    Returns:
        ``(keywords, custom_data_structures, final_marks)`` where the first two
        are the ``Keywords_Vector_0..19`` and
        ``Custom_Data_Structures_Vector_0..19`` column groups and the last is
        the ``Final_Marks`` column.
    """
    records = pd.read_excel(file_path)

    def _columns_for(i):
        # Names of the i-th keyword and custom-data-structure columns
        return f'Keywords_Vector_{i}', f'Custom_Data_Structures_Vector_{i}'

    keyword_names, structure_names = zip(*(_columns_for(i) for i in range(20)))
    keywords = records[list(keyword_names)]
    custom_data_structures = records[list(structure_names)]
    final_marks = records["Final_Marks"]
    return keywords, custom_data_structures, final_marks

# Function to pad vectors with zeros to match length
def pad_with_zeros(vector, max_length):
    """Return ``vector`` as an at-least-2-D array with at least ``max_length`` columns.

    Missing columns are appended on the right and filled with zeros; inputs
    that are already wide enough are not padded.
    """
    block = np.atleast_2d(vector)
    current_width = block.shape[1]
    if current_width < max_length:
        zero_block = np.zeros((block.shape[0], max_length - current_width))
        return np.concatenate((block, zero_block), axis=1)
    return block

# Reduction and combination strategies
def combine_vectors(keywords, custom_data_structures, weights, fusion):
    """Scale the two feature blocks and fuse them into one feature array.

    Args:
        keywords: Keyword feature vectors.
        custom_data_structures: Custom-data-structure feature vectors.
        weights: Pair of scale factors. Note the order: ``weights[0]`` scales
            ``custom_data_structures`` and ``weights[1]`` scales ``keywords``.
        fusion: ``"add"`` sums the weighted blocks element-wise (the narrower
            block is zero-padded on the right first); ``"concatenate"`` places
            the weighted keywords and weighted custom data structures side by
            side, in that order.

    Returns:
        The fused features as a NumPy array.

    Raises:
        ValueError: If ``fusion`` is neither ``"add"`` nor ``"concatenate"``.
    """
    weighted_keywords = weights[1] * keywords
    weighted_custom_data_structures = weights[0] * custom_data_structures

    if fusion == "add":
        # pad_with_zeros also converts both blocks to plain arrays, so the sum
        # below is positional rather than aligned on DataFrame column labels.
        max_length = max(weighted_keywords.shape[1], weighted_custom_data_structures.shape[1])
        weighted_keywords = pad_with_zeros(weighted_keywords, max_length)
        weighted_custom_data_structures = pad_with_zeros(weighted_custom_data_structures, max_length)
        combined_vectors = weighted_keywords + weighted_custom_data_structures
    elif fusion == "concatenate":
        combined_vectors = np.hstack((weighted_keywords, weighted_custom_data_structures))
    else:
        # Previously an unknown strategy fell through both branches and the
        # return statement crashed with an unhelpful UnboundLocalError.
        raise ValueError(f"Unknown fusion {fusion!r}; expected 'add' or 'concatenate'.")
    return combined_vectors

# Constructor arguments for each base learner, listed as (name, value) pairs.
# NOTE(review): early_stopping_rounds presumably has no effect unless fit()
# is given an eval_set, which this cell never does — confirm.
catboost_params = dict([
    ('verbose', 0),
    ('iterations', 1000),
    ('learning_rate', 0.05),
    ('depth', 6),
    ('l2_leaf_reg', 5),
    ('early_stopping_rounds', 15),
    ('random_state', 43),
])

xgboost_params = dict([
    ('n_estimators', 500),
    ('learning_rate', 0.01),
    ('max_depth', 5),
    ('min_child_weight', 2),
    ('subsample', 0.6),
    ('colsample_bytree', 0.8),
    ('reg_alpha', 0.1),
    ('reg_lambda', 1),
    ('random_state', 43),
])

random_forest_params = dict([
    ('n_estimators', 100),
    ('max_depth', 6),
    ('min_samples_split', 5),
    ('min_samples_leaf', 3),
    ('random_state', 43),
])

bagging_params = dict([
    ('n_estimators', 100),
    ('max_samples', 0.8),
    ('max_features', 1.0),
    ('random_state', 43),
])

# Base learners, keyed by the display name used in the results table
models = dict([
    ("CatBoost Regressor", CatBoostRegressor(**catboost_params)),
    ("XGBoost Regressor", XGBRegressor(**xgboost_params)),
    ("Random Forest Regressor", RandomForestRegressor(**random_forest_params)),
    ("Bagging Regressor", BaggingRegressor(**bagging_params)),
])

# Stacking ensemble over the four base learners with a small CatBoost
# meta-learner. The (name, estimator) pairs are collected before the ensemble
# is added to `models`, so it never stacks on itself.
estimators = []
for name in models:
    estimators.append((name, models[name]))
final_estimator = CatBoostRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, verbose=0)
stacking_model = StackingRegressor(estimators=estimators, final_estimator=final_estimator)
models["Stacking Regressor"] = stacking_model

# Read the feature vectors and target marks from the workbook
file_path = "/content/IG.xlsx"
keywords, custom_data_structures, final_marks = load_data(file_path)

# Weight pairs to sweep; combine_vectors applies weights[0] to the
# custom-data-structure vectors and weights[1] to the keyword vectors.
weights_options = [
    (1, 1),
    (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1),
    (0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.1, 0.9),
    (2, 1), (3, 1), (4, 1), (5, 1),
]
# Fusion strategies understood by combine_vectors
fusions = ["add", "concatenate"]

# One dict per (weights, fusion, model) run is appended here
results = []

# Fit and score every model on every (weights, fusion) feature set
for weights in weights_options:
    for fusion in fusions:
        combined_vectors = combine_vectors(keywords, custom_data_structures, weights, fusion)
        X_train, X_test, y_train, y_test = train_test_split(
            combined_vectors,
            final_marks,
            test_size=0.2,
            random_state=42,
            stratify=final_marks,
        )

        for name in models:
            model = models[name]
            model.fit(X_train, y_train)

            # Predictions are rounded to the nearest integer before scoring
            y_train_pred = np.round(model.predict(X_train))
            y_test_pred = np.round(model.predict(X_test))

            train_mse = mean_squared_error(y_train, y_train_pred)
            test_mse = mean_squared_error(y_test, y_test_pred)

            # Insertion order below is the column order of the saved report
            row = {"Model": name, "Weights": weights, "Fusion": fusion}
            row.update({
                "Train MAE": mean_absolute_error(y_train, y_train_pred),
                "Train MSE": train_mse,
                "Train RMSE": np.sqrt(train_mse),
                "Train R2": r2_score(y_train, y_train_pred),
                "Train MAPE": mean_absolute_percentage_error(y_train, y_train_pred),
            })
            row.update({
                "Test MAE": mean_absolute_error(y_test, y_test_pred),
                "Test MSE": test_mse,
                "Test RMSE": np.sqrt(test_mse),
                "Test R2": r2_score(y_test, y_test_pred),
                "Test MAPE": mean_absolute_percentage_error(y_test, y_test_pred),
            })
            results.append(row)

# Build the report table, ordered by fusion method (ascending) and then by
# Test R2 (descending) so the best model of each fusion comes first
results_df = pd.DataFrame(results).sort_values(
    by=["Fusion", "Test R2"],
    ascending=[True, False],
)

# Write the report to an Excel workbook without the index column
results_df.to_excel("IG_results.xlsx", index=False)
print("Results saved to IG_results.xlsx.")


For top 20 variance

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor

# Load the dataset
def load_data(file_path):
    """Read ``file_path`` (an Excel workbook) and pick out the model inputs.

    Returns:
        A 3-tuple: the ``Keywords_Vector_0`` … ``Keywords_Vector_19`` columns,
        the ``Custom_Data_Structures_Vector_0`` …
        ``Custom_Data_Structures_Vector_19`` columns, and the ``Final_Marks``
        column.
    """
    source = pd.read_excel(file_path)
    keyword_fields = [f'Keywords_Vector_{i}' for i in range(20)]
    structure_fields = [f'Custom_Data_Structures_Vector_{i}' for i in range(20)]
    keywords = source[keyword_fields]
    custom_data_structures = source[structure_fields]
    return keywords, custom_data_structures, source["Final_Marks"]

# Function to pad vectors with zeros to match length
def pad_with_zeros(vector, max_length):
    """Zero-pad ``vector`` on the right so it has at least ``max_length`` columns.

    The input is promoted to at least two dimensions before its width is
    checked; inputs that are already wide enough come back unpadded.
    """
    values = np.atleast_2d(vector)
    deficit = max_length - values.shape[1]
    if deficit > 0:
        pieces = (values, np.zeros((values.shape[0], deficit)))
        values = np.hstack(pieces)
    return values

# Reduction and combination strategies
def combine_vectors(keywords, custom_data_structures, weights, fusion):
    """Weight the keyword and custom-data-structure features and fuse them.

    Args:
        keywords: Keyword feature vectors.
        custom_data_structures: Custom-data-structure feature vectors.
        weights: Pair of scale factors; ``weights[0]`` is applied to
            ``custom_data_structures`` and ``weights[1]`` to ``keywords``.
        fusion: ``"add"`` for an element-wise sum (after zero-padding the
            narrower block on the right) or ``"concatenate"`` to stack the
            weighted keywords and weighted custom data structures side by side.

    Returns:
        The fused features as a NumPy array.

    Raises:
        ValueError: If ``fusion`` is not a supported strategy.
    """
    # Reject unsupported strategies up front; the old code fell through both
    # branches and died with UnboundLocalError at the return statement.
    if fusion not in ("add", "concatenate"):
        raise ValueError(f"Unknown fusion {fusion!r}; expected 'add' or 'concatenate'.")

    weighted_keywords = weights[1] * keywords
    weighted_custom_data_structures = weights[0] * custom_data_structures

    if fusion == "concatenate":
        return np.hstack((weighted_keywords, weighted_custom_data_structures))

    # "add": bring both blocks to a common width. pad_with_zeros returns plain
    # arrays, so the sum is positional, not aligned on column labels.
    max_length = max(weighted_keywords.shape[1], weighted_custom_data_structures.shape[1])
    weighted_keywords = pad_with_zeros(weighted_keywords, max_length)
    weighted_custom_data_structures = pad_with_zeros(weighted_custom_data_structures, max_length)
    return weighted_keywords + weighted_custom_data_structures

# Hyperparameters for each base learner, passed as keyword arguments when the
# estimators are constructed.
# NOTE(review): early_stopping_rounds presumably has no effect unless fit()
# is given an eval_set, which this cell never does — confirm.
catboost_params = dict(
    verbose=0,
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5,
    early_stopping_rounds=15,
    random_state=43,
)

xgboost_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=2,
    subsample=0.6,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=43,
)

random_forest_params = dict(
    n_estimators=100,
    max_depth=6,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=43,
)

bagging_params = dict(
    n_estimators=100,
    max_samples=0.8,
    max_features=1.0,
    random_state=43,
)

# Base learners, keyed by the display name used in the results table
models = {}
models["CatBoost Regressor"] = CatBoostRegressor(**catboost_params)
models["XGBoost Regressor"] = XGBRegressor(**xgboost_params)
models["Random Forest Regressor"] = RandomForestRegressor(**random_forest_params)
models["Bagging Regressor"] = BaggingRegressor(**bagging_params)

# Stacking ensemble over the four base learners with a small CatBoost
# meta-learner. The (name, estimator) pairs must be captured before the
# ensemble itself is added to `models`.
estimators = list(models.items())
meta_learner = CatBoostRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, verbose=0)
stacking_model = StackingRegressor(estimators=estimators, final_estimator=meta_learner)
models["Stacking Regressor"] = stacking_model

# Read the feature vectors and target marks from the workbook
file_path = "/content/var.xlsx"
keywords, custom_data_structures, final_marks = load_data(file_path)

# Weight pairs to sweep; combine_vectors applies weights[0] to the
# custom-data-structure vectors and weights[1] to the keyword vectors.
weights_options = [
    (1, 1),
    (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1),
    (0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.1, 0.9),
]
# Fusion strategies understood by combine_vectors
fusions = ["add", "concatenate"]

# One dict per (weights, fusion, model) run is appended here
results = []

# Fit and score every model on every (weights, fusion) feature set
def _rmse(y_true, y_pred):
    """Root of the mean squared error between ``y_true`` and ``y_pred``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Metric column suffixes and the functions that compute them, in report order
metric_table = (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("RMSE", _rmse),
    ("R2", r2_score),
    ("MAPE", mean_absolute_percentage_error),
)

for weights in weights_options:
    for fusion in fusions:
        combined_vectors = combine_vectors(keywords, custom_data_structures, weights, fusion)
        X_train, X_test, y_train, y_test = train_test_split(
            combined_vectors, final_marks, test_size=0.2, random_state=42, stratify=final_marks
        )

        for name, model in models.items():
            model.fit(X_train, y_train)

            # Predictions are rounded to the nearest integer before scoring
            rounded = {
                "Train": (y_train, np.round(model.predict(X_train))),
                "Test": (y_test, np.round(model.predict(X_test))),
            }

            # Configuration columns first, then all Train metrics, then all
            # Test metrics: insertion order is the report's column order.
            row = {"Model": name, "Weights": weights, "Fusion": fusion}
            for part, (y_true, y_pred) in rounded.items():
                for suffix, metric in metric_table:
                    row[f"{part} {suffix}"] = metric(y_true, y_pred)
            results.append(row)

# Turn the collected result dicts into a table
results_df = pd.DataFrame(results)

# Order rows by fusion method (ascending), then by Test R2 (descending)
results_df = results_df.sort_values(by=["Fusion", "Test R2"], ascending=[True, False])

# Write the report to an Excel workbook without the index column
results_df.to_excel("var_results.xlsx", index=False)
print("Results saved to var_results.xlsx.")


Results saved to var_results.xlsx.
