In [2]:
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

In [3]:
# Load the metrics table from disk.
df = pd.read_csv("data/metrics.csv")

# Target: the "quality" label column.
y = df["quality"]
# Features: every remaining column except "filename", "language" and the
# label itself, which are excluded from the model inputs.
X = df.drop(["filename", "language", "quality"], axis="columns")


In [4]:
# Learn the mapping from quality labels to integer codes, then apply it.
# `le.classes_` keeps the original label for each code and is reused later
# to name the rows of the classification reports.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)


In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded label
# keeps the class proportions the same in both splits; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Standardize the features. The scaler's mean/variance are estimated from the
# training split only, and the same fitted transform is then applied to both
# splits so no test-set statistics influence training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Random forest with 200 trees; the fixed seed makes the fit reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)

# The forest was fit on standardized features and integer-encoded labels, so
# the saved model is only usable together with the fitted scaler (to prepare
# new inputs) and the label encoder (to map predicted codes back to the
# original quality labels). Persist both next to the model.
joblib.dump(scaler, "rf_quality_scaler.pkl")
joblib.dump(le, "rf_quality_label_encoder.pkl")

# Kept as the final expression so the cell still displays the model path.
joblib.dump(rf, "rf_quality_model.pkl")


['rf_quality_model.pkl']

In [8]:
# Score the random forest on the held-out split.
y_pred = rf.predict(X_test_scaled)

# Per-class precision / recall / F1, with rows labelled by the original
# quality names recovered from the label encoder.
print("Classification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=le.classes_))

# Raw counts: rows are true classes, columns are predicted classes, both in
# encoded-label order.
print("Confusion Matrix:\n")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# NOTE(review): the recorded output for this cell is 1.00 on every metric.
# That may be legitimate, but it can also mean a feature column is derived
# from `quality` (label leakage) -- TODO confirm against how metrics.csv
# is produced.


Classification Report:

              precision    recall  f1-score   support

     Average       1.00      1.00      1.00       140
         Bad       1.00      1.00      1.00       453
        Good       1.00      1.00      1.00       205

    accuracy                           1.00       798
   macro avg       1.00      1.00      1.00       798
weighted avg       1.00      1.00      1.00       798

Confusion Matrix:

[[140   0   0]
 [  0 453   0]
 [  0   0 205]]


In [10]:
# Gradient-boosted trees, trained and evaluated on the same scaled split as
# the random forest so the two reports are directly comparable.
# (200 boosting stages; fixed seed for reproducibility.)
gbc = GradientBoostingClassifier(n_estimators=200, random_state=42)
gbc.fit(X_train_scaled, y_train)

y_pred_gbc = gbc.predict(X_test_scaled)

# Per-class precision / recall / F1, followed by the raw confusion matrix
# (rows = true class, columns = predicted class).
print(classification_report(y_test, y_pred_gbc, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred_gbc))

              precision    recall  f1-score   support

     Average       1.00      1.00      1.00       140
         Bad       1.00      1.00      1.00       453
        Good       1.00      1.00      1.00       205

    accuracy                           1.00       798
   macro avg       1.00      1.00      1.00       798
weighted avg       1.00      1.00      1.00       798

[[140   0   0]
 [  0 453   0]
 [  0   0 205]]


In [11]:
# RBF-kernel support vector classifier on the standardized features.
# C=1.0 and gamma='scale' are spelled out explicitly so the configuration is
# visible in the notebook.
svc = SVC(kernel='rbf', C=1.0, gamma='scale')
svc.fit(X_train_scaled, y_train)

# Evaluate on the held-out split; rows are labelled with the original
# quality names from the label encoder.
y_pred_svc = svc.predict(X_test_scaled)
print(classification_report(y_test, y_pred_svc, target_names=le.classes_))


              precision    recall  f1-score   support

     Average       1.00      1.00      1.00       140
         Bad       1.00      1.00      1.00       453
        Good       1.00      1.00      1.00       205

    accuracy                           1.00       798
   macro avg       1.00      1.00      1.00       798
weighted avg       1.00      1.00      1.00       798



In [13]:
# Multinomial logistic regression on the standardized features.
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 (FutureWarning), and with the default lbfgs solver a
# target with more than two classes is already fit with the multinomial
# loss (scikit-learn >= 0.22), so the model is unchanged.
# max_iter is raised to 1000 to give the solver room to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)

# Evaluate on the held-out split; rows are labelled with the original
# quality names from the label encoder.
y_pred_lr = lr.predict(X_test_scaled)
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))


              precision    recall  f1-score   support

     Average       1.00      1.00      1.00       140
         Bad       1.00      1.00      1.00       453
        Good       1.00      1.00      1.00       205

    accuracy                           1.00       798
   macro avg       1.00      1.00      1.00       798
weighted avg       1.00      1.00      1.00       798





In [14]:
# k-nearest-neighbours classifier (k=5). It is distance-based, so it is fit
# on the standardized features rather than the raw columns.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Evaluate on the held-out split; rows are labelled with the original
# quality names from the label encoder.
y_pred_knn = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred_knn, target_names=le.classes_))


              precision    recall  f1-score   support

     Average       0.99      1.00      0.99       140
         Bad       1.00      1.00      1.00       453
        Good       1.00      0.99      0.99       205

    accuracy                           0.99       798
   macro avg       0.99      0.99      0.99       798
weighted avg       1.00      0.99      0.99       798

