# Scoring Dataset

## Dependencies

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## Constants

In [None]:
# Every artefact path is resolved against the kernel's current working
# directory, captured once so all four paths share the same base.
_WORKDIR = os.getcwd()

# Raw input CSV read by the "Dataset" cell.
DATASET_PATH = os.path.join(_WORKDIR, "dataset.csv")
# Output CSV written by the "Store Dataset" cell.
DATASET_SCORED_PATH = os.path.join(_WORKDIR, "dataset_scored.csv")
# Locations for models / evaluation results (not referenced by the other
# cells visible in this notebook).
MODEL_DIR = os.path.join(_WORKDIR, "models")
EVALUATION_PATH = os.path.join(_WORKDIR, "evaluation.csv")

# Create the model directory up front; a no-op when it already exists.
os.makedirs(MODEL_DIR, exist_ok=True)

## Dataset

In [3]:
# Load the raw repository metrics from CSV
df = pd.read_csv(DATASET_PATH)

# Preview data
print(f"Sample of {df.shape[0]} data:")
display(df.head())

# Feature columns that feed the score
features = [
    'Layer Count', 'Avg Imports per File',
    'Architecture Score', 'Avg Cyclomatic', 'Avg Volume',
    'Avg Difficulty', 'Avg Effort',
]

# Float copy of the selected features; `df` itself is not modified here
DATATRAIN = df[features].astype(float)

# Adjustment: map the three structural features (the ones the Scoring cell
# adds to the score) onto a 0-1 scale.

# Architecture Score is divided by 4; the preview rows show values 0-4
# (presumably one point per detected "Has ..." layer -- TODO confirm).
DATATRAIN['Architecture Score'] = DATATRAIN['Architecture Score'] / 4

# Layer Count buckets: exactly 1 -> 0.25, exactly 2 -> 0.5, more than 2 -> 1,
# anything else -> 0. np.select takes the first matching rule, mirroring an
# if/elif chain.
layer_count = DATATRAIN['Layer Count']
layer_score = np.select(
    [layer_count == 1, layer_count == 2, layer_count > 2],
    [0.25, 0.5, 1.0],
    default=0.0,
)
# Every comparison against NaN is False, so a missing value would fall through
# to the default bucket. Mask it back to NaN so missing data stays missing,
# exactly as it does for the features that are not bucketed.
DATATRAIN['Layer Count'] = pd.Series(
    layer_score, index=DATATRAIN.index
).where(layer_count.notna())

# Avg Imports per File buckets (fewer imports score higher):
# > 10 -> 0, > 7 -> 0.25, > 5 -> 0.5, otherwise -> 1.
avg_imports = DATATRAIN['Avg Imports per File']
import_score = np.select(
    [avg_imports > 10, avg_imports > 7, avg_imports > 5],
    [0.0, 0.25, 0.5],
    default=1.0,
)
# Without this mask a missing value would silently receive the best score (1).
DATATRAIN['Avg Imports per File'] = pd.Series(
    import_score, index=DATATRAIN.index
).where(avg_imports.notna())

# Apply log transformation to reduce outliers.
# NOTE(review): this is applied to every column, including the three bucketed
# 0-1 features above, which changes their spacing after min-max scaling
# (a 0.25 bucket ends up at ~0.32 in the scored output). Behaviour is kept
# as-is so existing scores do not change -- confirm this is intended.
DATATRAIN = DATATRAIN.apply(np.log1p)

Sample of 3362 data:


Unnamed: 0,ID,Owner,Name,Description,URL,Size,Stars,Watch Count,Fork Count,Issues Open,Subscriber Count,Total Files,Layer Count,Avg Imports per File,Has Domain,Has Application,Has Interfaces,Has Infrastructure,Architecture Score,Avg Cyclomatic,Avg Volume,Avg Difficulty,Avg Effort
0,475154802,ganeshnikumbh,cqrs-poc,,https://github.com/ganeshnikumbh/cqrs-poc,10,0,0,0,0,1,4,3,4.5,0,0,1,0,1,2.75,8.860336,0.541667,4.930168
1,728505766,DilsadChowdhury,Neural-Network-Structure,This neural network structure represents a bas...,https://github.com/DilsadChowdhury/Neural-Netw...,2,0,0,0,0,1,1,0,1.0,0,0,0,0,0,0.0,16.0,2.5,40.0
2,339078138,VictorErmakov,WMCheckout,Checkout system using clean architecture princ...,https://github.com/VictorErmakov/WMCheckout,33,0,0,0,0,1,24,3,3.041667,1,1,0,1,3,8.833333,28.815673,0.766171,156.323418
3,956088994,aniketkadukar1,fastapi-clean-architecture,,https://github.com/aniketkadukar1/fastapi-clea...,6,0,0,0,0,1,13,2,2.923077,1,0,0,0,1,1.538462,1.43742,0.141026,1.611759
4,963236916,firdavsDev,fast-api-ddd-example,FastAPI Domain-driven design pet todo project,https://github.com/firdavsDev/fast-api-ddd-exa...,12,0,0,0,0,1,31,3,1.967742,1,1,1,1,4,2.483871,3.659811,0.198925,4.836333


## Scoring

In [4]:
# Normalize every feature column to the [0, 1] range
scaler = MinMaxScaler()
norm = scaler.fit_transform(DATATRAIN)
# fit_transform returns a bare array; re-attach DATATRAIN's index as well as
# its columns so the scores can later be assigned back onto `df` by index
# alignment even if the frame does not carry a default 0..n-1 index.
norm_df = pd.DataFrame(norm, columns=DATATRAIN.columns, index=DATATRAIN.index)

# Metric weights (they sum to 1.0)
weights = {
    'Cyclomatic': 0.2,
    'Volume': 0.025,
    'Difficulty': 0.2,
    'Effort': 0.1,
    'Architecture': 0.3,
    'Layer': 0.1,
    'Import': 0.075,
}

# Weighted sum of the features that raise the score
positive_contributions = (
    norm_df['Architecture Score'] * weights['Architecture'] +
    norm_df['Layer Count'] * weights['Layer'] +
    norm_df['Avg Imports per File'] * weights['Import']
)

# Weighted sum of the complexity metrics, which lower the score
negative_contributions = (
    norm_df['Avg Cyclomatic'] * weights['Cyclomatic'] +
    norm_df['Avg Volume'] * weights['Volume'] +
    norm_df['Avg Difficulty'] * weights['Difficulty'] +
    norm_df['Avg Effort'] * weights['Effort']
)

# Final raw score
final_score = positive_contributions - negative_contributions

# Rescale to range 0–100: the lowest raw score maps to 0, the highest to 100
norm_df['Score'] = 100 * (final_score - final_score.min()) / (final_score.max() - final_score.min())

# Rename columns for clarity (rebinding instead of mutating in place)
norm_df = norm_df.rename(columns={
    'Avg Cyclomatic': 'cyclomatic',
    'Avg Volume': 'volume',
    'Avg Difficulty': 'difficulty',
    'Avg Effort': 'effort',
    'Architecture Score': 'architecture',
    'Layer Count': 'layer',
    'Avg Imports per File': 'import',
    'Score': 'score'
})

# Preview the score, best-scoring rows first
print(f"Sample of {norm_df.shape[0]} data with score:")
display(norm_df.sort_values(by='score', ascending=False))

Sample of 3362 data with score:


Unnamed: 0,layer,import,architecture,cyclomatic,volume,difficulty,effort,score
3034,1.000000,1.0,1.0,0.000000,0.000000,0.000000,0.000000,100.000000
1817,1.000000,1.0,1.0,0.000000,0.010552,0.003743,0.004485,99.845537
715,1.000000,1.0,1.0,0.015163,0.011063,0.003933,0.004709,99.517145
1956,1.000000,1.0,1.0,0.008876,0.023717,0.008950,0.010416,99.450196
1354,1.000000,1.0,1.0,0.039028,0.000000,0.000000,0.000000,99.174679
...,...,...,...,...,...,...,...,...
868,0.321928,0.0,0.0,0.755905,0.756344,0.667254,0.788016,12.753064
1445,0.000000,0.0,0.0,0.705552,0.752053,0.611148,0.767915,11.824329
29,0.000000,0.0,0.0,0.698799,0.706327,0.742824,0.768303,9.299348
2396,1.000000,0.0,0.0,0.899153,0.818289,0.963822,0.921861,9.042932


## Store Dataset

In [5]:
# Attach the score, rounded to a whole number (still stored as a float), to
# the original rows and write the result to disk without the index column.
rounded_score = norm_df['score'].round(0)
df['Score'] = rounded_score
df.to_csv(DATASET_SCORED_PATH, index=False)