In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

In [5]:
def train_model_on_pca_features(csv_path, test_size=0.2, random_state=42, kernel='rbf'):
    """Train an SVM on a CSV of PCA features and print a held-out evaluation.

    The CSV is expected to contain a ``filename`` column, a ``label`` column
    and one column per feature. Every column other than ``filename`` and
    ``label`` is used as a feature.

    Args:
        csv_path: Path (or any object accepted by ``pd.read_csv``) of the CSV.
        test_size: Fraction (or absolute count) of samples held out for testing.
        random_state: Seed used for the train/test split and for the SVM's
            internal probability calibration.
        kernel: Kernel passed to ``sklearn.svm.SVC``.

    Returns:
        tuple: ``(clf, le)`` where ``clf`` is the fitted ``SVC`` and ``le`` is
        the fitted ``LabelEncoder`` mapping class names to integer ids.

    Raises:
        KeyError: If the CSV has no ``filename`` or ``label`` column.
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Extract features and labels.
    # NOTE(review): if the CSV was saved together with its index, an
    # "Unnamed: 0" column would be kept here as a feature -- confirm.
    X = df.drop(columns=['filename', 'label']).values
    y = df['label'].values

    # Encode string labels to integers
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Split into train/test, keeping the class proportions in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, random_state=random_state, stratify=y_encoded
    )

    # Train SVM. probability=True runs an internal, shuffled cross-validation;
    # seeding it makes the fitted probability model reproducible.
    clf = SVC(kernel=kernel, probability=True, random_state=random_state)
    clf.fit(X_train, y_train)

    # Evaluate. Pin the label order to the encoder's classes so that the names
    # and the confusion-matrix rows always line up, and coerce names to str so
    # numeric labels do not break the report formatting.
    class_ids = list(range(len(le.classes_)))
    class_names = [str(name) for name in le.classes_]

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the default score (0.0) for classes that are never
    # predicted, without emitting an UndefinedMetricWarning for each of them.
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0
    )
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    print(f"\nAccuracy: {acc:.4f}\n")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(cm)

    return clf, le

In [7]:
def train_xgboost_on_pca_features(csv_path, test_size=0.2, random_state=42):
    """Train an XGBoost classifier on a CSV of PCA features and print a held-out evaluation.

    The CSV is expected to contain a ``filename`` column, a ``label`` column
    and one column per feature. Every column other than ``filename`` and
    ``label`` is used as a feature.

    Args:
        csv_path: Path (or any object accepted by ``pd.read_csv``) of the CSV.
        test_size: Fraction (or absolute count) of samples held out for testing.
        random_state: Seed used for the train/test split and forwarded to the
            classifier.

    Returns:
        tuple: ``(clf, le)`` where ``clf`` is the fitted ``XGBClassifier`` and
        ``le`` is the fitted ``LabelEncoder`` mapping class names to integer ids.

    Raises:
        KeyError: If the CSV has no ``filename`` or ``label`` column.
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Extract features and labels.
    # NOTE(review): if the CSV was saved together with its index, an
    # "Unnamed: 0" column would be kept here as a feature -- confirm.
    X = df.drop(columns=['filename', 'label']).values
    y = df['label'].values

    # Encode string labels to the contiguous integer ids the classifier is fit on
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Train/test split, keeping the class proportions in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, random_state=random_state, stratify=y_encoded
    )

    # Train XGBoost model. `use_label_encoder` is no longer passed: the
    # installed xgboost reports it as an unused parameter, and the labels are
    # already integer-encoded above. The seed is forwarded so the function's
    # `random_state` governs the model as well as the split.
    clf = XGBClassifier(eval_metric='mlogloss', random_state=random_state)
    clf.fit(X_train, y_train)

    # Predict and evaluate. Pin the label order to the encoder's classes so the
    # names and the confusion-matrix rows always line up, and coerce names to
    # str so numeric labels do not break the report formatting.
    class_ids = list(range(len(le.classes_)))
    class_names = [str(name) for name in le.classes_]

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the default score (0.0) for classes that are never
    # predicted, without emitting an UndefinedMetricWarning for each of them.
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0
    )
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    print(f"\nAccuracy: {acc:.4f}\n")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(cm)

    return clf, le

In [2]:
# Configuration for locating the PCA feature CSV.
# DATA_ROOT is the only machine-specific part of the path; change it here when
# the dataset lives somewhere else.
DATA_ROOT = "D:/Documentos/Polito/Thesis/Datasets/A3LIS-147_italian/trimmed-life"
train_test = "train"  # split name embedded in the CSV file name
version = "v2"  # suffix of the PCA output directory
pca_csv_name = f"{DATA_ROOT}/pca-hei-seed59-{version}/reduced/left_{train_test}_pca.csv"

In [3]:
# Load the PCA feature table for inspection only; the training functions
# re-read the CSV from `pca_csv_name` themselves.
df = pd.read_csv(pca_csv_name)

In [4]:
# Show the loaded frame (the rendered output elides middle rows and columns).
df

Unnamed: 0,filename,label,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,...,pca_40,pca_41,pca_42,pca_43,pca_44,pca_45,pca_46,pca_47,pca_48,pca_49
0,1744911414337282800_fef_abitare_Left.jpg,abitare,9534.194961,-3453.693640,-2231.303006,2271.878799,-2926.420898,146.714932,2550.667569,-1385.802651,...,-419.672739,226.458782,400.464472,-270.862681,413.459798,-617.315806,-440.945180,-465.853993,321.387294,-422.928427
1,1744911428027573000_fsf_abitare_Left.jpg,abitare,-2583.109508,-128.792263,-421.043967,-2004.628430,-4725.353693,-887.311834,528.206812,-2176.223256,...,42.567914,613.912164,-572.617546,181.661106,561.161909,-89.502546,396.584000,360.078240,529.654468,1378.975858
2,1744911444394916800_mdp_abitare_Left.jpg,abitare,5917.375592,-1552.747536,-2557.459102,-187.251487,-2795.150750,-0.687107,1456.226765,-779.154831,...,145.733315,-766.352875,-12.185824,62.608334,-437.259419,-257.417228,-124.329025,254.524264,105.225977,251.737309
3,1744911462913590000_mdq_abitare_Left.jpg,abitare,8440.474031,-4480.939425,-2327.530679,1909.870305,-1000.264720,-1982.642909,-76.377104,1060.571870,...,-456.890894,-1420.814337,206.683459,234.722672,-356.312867,454.065413,601.434077,291.292314,-713.781557,920.664206
4,1744911480802613600_mic_abitare_Left.jpg,abitare,8906.332482,-3008.714001,-2502.276833,253.479504,-1859.725359,-1023.014800,1620.398791,729.807543,...,-422.412171,197.040990,-262.274963,-364.064002,161.265579,-563.705791,129.568943,28.201388,-299.161731,-52.571401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,1744911474640020300_mdq_litro_Left.jpg,litro,-2066.229190,-1541.124385,387.035040,-3280.326922,1692.511660,-835.821539,-3554.932088,-995.476875,...,-529.716663,-394.025883,464.525915,-438.927564,145.654689,-307.948597,174.103891,-21.572875,35.191973,-172.734367
108,1744911496320066200_mic_litro_Left.jpg,litro,4033.144334,-3444.762177,2555.123504,3545.265585,-387.813856,1231.565459,-3215.933962,903.741741,...,366.470515,84.650696,-60.365423,-160.368023,201.846825,-399.469520,-812.251715,326.810494,689.271297,-338.502770
109,1744911512083874600_mmr_litro_Left.jpg,litro,5530.863077,-2492.736228,-1375.057029,-1443.477263,1235.638448,36.267177,737.388580,-100.671029,...,153.672471,461.510773,-740.575937,-213.088287,165.613847,64.978881,246.691060,-355.987186,-189.311146,-88.586202
110,1744911526008728500_mrla_litro_Left.jpg,litro,1978.807562,-2256.222424,-2963.171967,-39.656217,-205.219208,-428.855870,-129.722418,722.122177,...,-748.352419,543.710608,-422.099114,-259.297034,-47.871093,-345.376880,-695.419935,379.428183,122.548259,-14.080962


In [6]:
# Train and evaluate the SVM with the function defaults
# (test_size=0.2, random_state=42, kernel='rbf'); the metrics are printed by the call.
model, label_encoder = train_model_on_pca_features(pca_csv_name)


Accuracy: 0.6957

Classification Report:
              precision    recall  f1-score   support

     abitare       1.00      1.00      1.00         2
       acqua       0.50      1.00      0.67         2
     affitto       0.00      0.00      0.00         2
       banca       0.00      0.00      0.00         1
       caldo       0.50      0.50      0.50         2
        casa       1.00      1.00      1.00         1
        cibo       0.00      0.00      0.00         1
        data       1.00      1.00      1.00         2
      freddo       1.00      1.00      1.00         2
        idle       1.00      1.00      1.00         2
  interprete       0.50      1.00      0.67         1
     inviare       1.00      0.50      0.67         2
      lingua       1.00      0.50      0.67         2
       litro       0.25      1.00      0.40         1

    accuracy                           0.70        23
   macro avg       0.62      0.68      0.61        23
weighted avg       0.68      0.70     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Train and evaluate the XGBoost classifier with the function defaults
# (test_size=0.2, random_state=42); the metrics are printed by the call.
# NOTE(review): this rebinds `model` and `label_encoder`, replacing the SVM
# objects assigned by the earlier SVM training cell.
model, label_encoder = train_xgboost_on_pca_features(pca_csv_name)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Accuracy: 0.4783

Classification Report:
              precision    recall  f1-score   support

     abitare       1.00      0.50      0.67         2
       acqua       0.33      0.50      0.40         2
     affitto       0.50      0.50      0.50         2
       banca       0.00      0.00      0.00         1
       caldo       1.00      0.50      0.67         2
        casa       0.00      0.00      0.00         1
        cibo       0.00      0.00      0.00         1
        data       1.00      0.50      0.67         2
      freddo       0.50      0.50      0.50         2
        idle       0.67      1.00      0.80         2
  interprete       0.50      1.00      0.67         1
     inviare       0.00      0.00      0.00         2
      lingua       1.00      1.00      1.00         2
       litro       0.00      0.00      0.00         1

    accuracy                           0.48        23
   macro avg       0.46      0.43      0.42        23
weighted avg       0.54      0.48     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
