In [107]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import json

In [108]:
EPS = 1e-8

def normalize(X):
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    return (X - X_mean) / (X_std + EPS), X_mean, X_std

def denormalize(X_normalized, X_mean, X_std):
    return (X_normalized * (X_std + EPS)) + X_mean

In [109]:
with open("../data/labeled_data_2.json", "r") as json_file:
    data = json.load(json_file)

y = np.array([entry['score'] for entry in data.values()])
y_avg = np.mean(y)
y = np.where(y_avg < y, y, 0)
y = np.where(y_avg >= y, y, 1)
X = np.array([np.array(list({k : v for k, v in entry.items() if k != 'score'}.values())) for entry in data.values()])
X_normalized, X_mean, X_std = normalize(X)
X_train, X_eval, y_train, y_eval = train_test_split(X_normalized, y, test_size=0.20)

# 1. Random Forest / Bagging

In [110]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100)

# Train the model on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the evaluation data
rf_predictions = rf_classifier.predict(X_eval)

# Evaluate the model's performance
rf_accuracy = accuracy_score(y_eval, rf_predictions)
print(f"Random Forest Classifier Accuracy: {rf_accuracy:.2f}")
print(classification_report(y_eval, rf_predictions))

Random Forest Classifier Accuracy: 0.54
              precision    recall  f1-score   support

         0.0       0.47      0.64      0.54        11
         1.0       0.64      0.47      0.54        15

    accuracy                           0.54        26
   macro avg       0.55      0.55      0.54        26
weighted avg       0.56      0.54      0.54        26



In [111]:
# Initialize the base estimator (e.g., Decision Tree)
base_estimator = DecisionTreeClassifier()

# Initialize the Bagging Classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=100)

# Train the model on the training data
bagging_classifier.fit(X_train, y_train)

# Make predictions on the evaluation data
bagging_predictions = bagging_classifier.predict(X_eval)

# Evaluate the model's performance
bagging_accuracy = accuracy_score(y_eval, bagging_predictions)
print(f"Bagging Classifier Accuracy: {bagging_accuracy:.2f}")
print(classification_report(y_eval, bagging_predictions))



Bagging Classifier Accuracy: 0.62
              precision    recall  f1-score   support

         0.0       0.53      0.73      0.62        11
         1.0       0.73      0.53      0.62        15

    accuracy                           0.62        26
   macro avg       0.63      0.63      0.62        26
weighted avg       0.65      0.62      0.62        26



# 2. Pruning data with PCA first

In [112]:
# Initialize the PCA object with 2 principal components
pca = PCA(n_components=2)

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train)

# Transform the evaluation data using the same PCA model
X_eval_pca = pca.transform(X_eval)

In [113]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100)

# Train the model on the training data
rf_classifier.fit(X_train_pca, y_train)

# Make predictions on the evaluation data
rf_predictions = rf_classifier.predict(X_eval_pca)

# Evaluate the model's performance
rf_accuracy = accuracy_score(y_eval, rf_predictions)
print(f"Random Forest Classifier Accuracy: {rf_accuracy:.2f}")
print(classification_report(y_eval, rf_predictions))

Random Forest Classifier Accuracy: 0.46
              precision    recall  f1-score   support

         0.0       0.40      0.55      0.46        11
         1.0       0.55      0.40      0.46        15

    accuracy                           0.46        26
   macro avg       0.47      0.47      0.46        26
weighted avg       0.48      0.46      0.46        26



In [114]:
# Initialize the base estimator (e.g., Decision Tree)
base_estimator = DecisionTreeClassifier()

# Initialize the Bagging Classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=100)

# Train the model on the training data
bagging_classifier.fit(X_train_pca, y_train)

# Make predictions on the evaluation data
bagging_predictions = bagging_classifier.predict(X_eval_pca)

# Evaluate the model's performance
bagging_accuracy = accuracy_score(y_eval, bagging_predictions)
print(f"Bagging Classifier Accuracy: {bagging_accuracy:.2f}")
print(classification_report(y_eval, bagging_predictions))

Bagging Classifier Accuracy: 0.46
              precision    recall  f1-score   support

         0.0       0.40      0.55      0.46        11
         1.0       0.55      0.40      0.46        15

    accuracy                           0.46        26
   macro avg       0.47      0.47      0.46        26
weighted avg       0.48      0.46      0.46        26



