# Prediction

## Dependencies

In [31]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Constants

In [32]:
# All paths are resolved against the directory the kernel was started in.
_BASE_DIR = os.getcwd()

DATASET_PATH = os.path.join(_BASE_DIR, "dataset_scored.csv")  # input: scored repositories
MODEL_DIR = os.path.join(_BASE_DIR, "models")                 # directory holding the pickled models
PREDICTION_PATH = os.path.join(_BASE_DIR, "prediction.csv")   # output: per-model predictions

# Display names of the models; load_model() derives each pickle filename from these.
MODEL_NAMES = [
    'Random Forest',
    'Linear Regression',
    'Gradient Boosting',
    'Support Vector Regression',
    'XGBoost Regressor',
]

## Dataset

In [33]:
# Read the scored dataset into memory
df = pd.read_csv(DATASET_PATH)

# Show the row count and the first few rows
print(f"Sample of {df.shape[0]} data:")
display(df.head())

# Columns taken from the raw frame (seven metrics plus the target 'Score')
features = [
    'Layer Count',
    'Avg Imports per File',
    'Architecture Score',
    'Avg Cyclomatic',
    'Avg Volume',
    'Avg Difficulty',
    'Avg Effort',
    'Score',
]

# Raw column name -> short name used by the rest of the notebook
_SHORT_NAMES = {
    'Layer Count': 'layer',
    'Avg Imports per File': 'import',
    'Architecture Score': 'architecture',
    'Avg Cyclomatic': 'cyclomatic',
    'Avg Volume': 'volume',
    'Avg Difficulty': 'difficulty',
    'Avg Effort': 'effort',
    'Score': 'score',
}

# Cast the selected columns to float and apply the short names in one chain
DATATRAIN = df[features].astype(float).rename(columns=_SHORT_NAMES)

Sample of 3362 data:


Unnamed: 0,ID,Owner,Name,Description,URL,Size,Stars,Watch Count,Fork Count,Issues Open,...,Has Domain,Has Application,Has Interfaces,Has Infrastructure,Architecture Score,Avg Cyclomatic,Avg Volume,Avg Difficulty,Avg Effort,Score
0,475154802,ganeshnikumbh,cqrs-poc,,https://github.com/ganeshnikumbh/cqrs-poc,10,0,0,0,0,...,0,0,1,0,1,2.75,8.860336,0.541667,4.930168,69.0
1,728505766,DilsadChowdhury,Neural-Network-Structure,This neural network structure represents a bas...,https://github.com/DilsadChowdhury/Neural-Netw...,2,0,0,0,0,...,0,0,0,0,0,0.0,16.0,2.5,40.0,46.0
2,339078138,VictorErmakov,WMCheckout,Checkout system using clean architecture princ...,https://github.com/VictorErmakov/WMCheckout,33,0,0,0,0,...,1,1,0,1,3,8.833333,28.815673,0.766171,156.323418,78.0
3,956088994,aniketkadukar1,fastapi-clean-architecture,,https://github.com/aniketkadukar1/fastapi-clea...,6,0,0,0,0,...,1,0,0,0,1,1.538462,1.43742,0.141026,1.611759,69.0
4,963236916,firdavsDev,fast-api-ddd-example,FastAPI Domain-driven design pet todo project,https://github.com/firdavsDev/fast-api-ddd-exa...,12,0,0,0,0,...,1,1,1,1,4,2.483871,3.659811,0.198925,4.836333,93.0


## Normalize

In [34]:
# Min-max scale every column of DATATRAIN (features and 'score' alike).
# NOTE(review): the scaler is fitted on this dataset itself rather than loaded
# from the training run — confirm this matches how the saved models were trained.
scaler = MinMaxScaler()
scaler.fit(DATATRAIN)
norm = scaler.transform(DATATRAIN)

# Wrap the scaled array back into a frame with the original column names
norm_df = pd.DataFrame(data=norm, columns=DATATRAIN.columns)

## Predicting

### Function Declaration

In [35]:
def load_model(model_name):
    """
    Load a persisted model from MODEL_DIR.

    The filename is derived from the display name: spaces become
    underscores, the result is lower-cased, and "_model.pkl" is appended
    (e.g. "Random Forest" -> "random_forest_model.pkl").

    Note: joblib.load unpickles the file, so only load model files from a
    trusted source.

    Args:
        model_name (str): The display name of the model to load.
    Returns:
        model: The object stored in the corresponding pickle file.
    """
    slug = model_name.replace(' ', '_').lower()
    path_in_model_dir = os.path.join(MODEL_DIR, f"{slug}_model.pkl")
    # Joining with the cwd is a no-op when MODEL_DIR is already absolute.
    full_path = os.path.join(os.getcwd(), path_in_model_dir)
    return joblib.load(full_path)

In [36]:
def scale_features_if_needed(model_name, X):
    """
    Standardize the features for models that are scaled before prediction.

    Only "Support Vector Regression" and "Linear Regression" (exact,
    case-sensitive match) are scaled; every other model name gets X back
    unchanged.

    NOTE(review): a fresh StandardScaler(with_mean=False) is fitted on X
    itself on every call — confirm this matches the scaling applied when the
    models were trained.

    Args:
        model_name (str): The name of the model.
        X (pd.DataFrame): The feature data to scale.
    Returns:
        X itself when no scaling applies; otherwise the output of
        StandardScaler.fit_transform (a NumPy array by default).
    """
    needs_scaling = model_name in ("Support Vector Regression", "Linear Regression")
    if not needs_scaling:
        return X
    return StandardScaler(with_mean=False).fit_transform(X)

In [37]:
def generate_predictions(model, X, y, repo_ids, model_name):
    """
    Run one model over the features and pair each prediction with its repo.

    Predictions are clipped to the range [0, 100] and rounded to two
    decimals. Rows are built by zipping repo_ids, y and the predictions, so
    the output is as long as the shortest of the three.

    NOTE(review): this notebook passes a min-max scaled target in [0, 1];
    confirm the 0-100 clip bounds match the scale the models predict on.

    Args:
        model: A fitted estimator exposing ``predict``.
        X (pd.DataFrame): The feature data.
        y (pd.Series): The target data, reported as "Actual Score".
        repo_ids (pd.Series): The repository IDs.
        model_name (str): The name of the model, copied into every row.
    Returns:
        list: One dict per row with keys "Model", "Repo ID",
        "Actual Score" and "Predicted Score".
    """
    clipped = model.predict(X).clip(0, 100)

    rows = []
    for rid, actual, predicted in zip(repo_ids, y, clipped):
        row = {
            "Model": model_name,
            "Repo ID": rid,
            "Actual Score": actual,
            "Predicted Score": round(predicted, 2),
        }
        rows.append(row)
    return rows

In [38]:
def make_predictions(models, X, y, repo_ids):
    """
    Produce prediction rows for every named model over the same dataset.

    For each name the model is loaded, the features are scaled when that
    model calls for it, and the resulting rows are appended to one flat
    list in the order the names were given.

    Args:
        models (list): Model names to run.
        X (pd.DataFrame): Feature data.
        y (pd.Series): Target variable.
        repo_ids (pd.Series): Repository IDs.
    Returns:
        list: Prediction dicts from all models, concatenated.
    """
    all_rows = []
    for name in models:
        estimator = load_model(name)
        prepared = scale_features_if_needed(name, X)
        all_rows += generate_predictions(estimator, prepared, y, repo_ids, name)
    return all_rows

In [39]:
def save_predictions_to_csv(predictions, output_path):
    """
    Write the predictions to a CSV file and echo them to stdout.

    Args:
        predictions (list): Row dicts; converted to a DataFrame.
        output_path (str): Destination CSV path (written without the index).
    Returns:
        None
    """
    frame = pd.DataFrame(predictions)
    frame.to_csv(output_path, index=False)
    # Header and frame on separate lines, same as two consecutive prints.
    print("\n📊 Sample Predictions:", frame, sep="\n")

In [40]:
# Split the normalized frame into model inputs and target
feature_columns = ['cyclomatic', 'volume', 'difficulty', 'effort', 'architecture', 'layer', 'import']
X = norm_df[feature_columns]
y = norm_df['score']

# Repository IDs come from the raw frame
repo_ids = df['ID']

# Run every model, then persist and print the results
predictions = make_predictions(MODEL_NAMES, X, y, repo_ids)
save_predictions_to_csv(predictions, PREDICTION_PATH)


📊 Sample Predictions:
                   Model    Repo ID  Actual Score  Predicted Score
0          Random Forest  475154802          0.69             0.69
1          Random Forest  728505766          0.46             0.45
2          Random Forest  339078138          0.78             0.78
3          Random Forest  956088994          0.69             0.69
4          Random Forest  963236916          0.93             0.93
...                  ...        ...           ...              ...
16805  XGBoost Regressor  814754954          0.84             0.84
16806  XGBoost Regressor  163900100          0.60             0.60
16807  XGBoost Regressor  839622210          0.51             0.50
16808  XGBoost Regressor  257543499          0.37             0.37
16809  XGBoost Regressor  903542950          0.90             0.90

[16810 rows x 4 columns]
