# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp


# Column names for the header-less feature file, in file order: the class
# label ('type'), a per-sample identifier ('index'), then 16 feature columns.
column_names = [
    'type', 'index', 'nr_pix', 'rows_with_1', 'cols_with_1', 'rows_with_3p',
    'cols_with_3p', 'aspect_ratio', 'neigh_1', 'no_neigh_above', 'no_neigh_below',
    'no_neigh_left', 'no_neigh_right', 'no_neigh_horiz', 'no_neigh_vert',
    'connected_areas', 'eyes', 'diagonalness'
]

# Load the tab-separated feature table (no header row in the file).
# The path uses a forward slash because it is accepted as a separator on
# Windows as well as on POSIX systems; a backslash is a separator only on
# Windows and would make the load fail elsewhere.
df = pd.read_csv("40415474/all_features.csv", sep='\t', header=None, names=column_names)

# Features and labels
X = df.drop(columns=['type', 'index'])  # Remove ID and label
y = df['type']

# Hyper-parameter grid for the Random Forest search.
n_trees = list(range(25, 400, 50))  # Nt values: 25, 75, ..., 375
n_predictors = [2, 4, 6, 8]         # Np values, passed as max_features
results = []

print("Grid search for Random Forest model (Section 3.1):\n")

# Enumerate every (Nt, Np) pair up front, Nt varying slowest, so the
# evaluation order matches a nested Nt-then-Np loop.
grid = [(nt, np_) for nt in n_trees for np_ in n_predictors]

for nt, np_ in grid:
    # Fixed random_state so every grid point is compared under the same seed.
    rf = RandomForestClassifier(
        n_estimators=nt,
        max_features=np_,
        random_state=42,
    )
    # Mean accuracy over 5 cross-validation folds is the score of this pair.
    scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')
    mean_score = scores.mean()
    results.append(dict(Nt=nt, Np=np_, Accuracy=mean_score))
    print(f"Nt = {nt}, Np = {np_} → Accuracy = {mean_score:.4f}")

# Tabulate the grid-search records (one row per evaluated Nt/Np pair).
results_df = pd.DataFrame(results)

# Draw one accuracy-vs-Nt curve per Np value on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
for np_ in n_predictors:
    subset = results_df.loc[results_df['Np'] == np_]
    ax.plot(subset['Nt'], subset['Accuracy'], marker='o', label=f'Np = {np_}')

ax.set_title("Random Forest Accuracy (5-Fold CV)")
ax.set_xlabel("Number of Trees (Nt)")
ax.set_ylabel("Accuracy")
ax.legend(title="Max Features (Np)")
ax.grid(True)
plt.show()

# Select the row with the highest mean accuracy. idxmax returns the first
# maximum, so a tie goes to the configuration that was evaluated earliest.
best_idx = results_df['Accuracy'].idxmax()
best_model = results_df.loc[best_idx]
print(f"\nBest model: Nt = {int(best_model.Nt)}, Np = {int(best_model.Np)} with Accuracy = {best_model.Accuracy:.4f}")

# Section 3.2: repeat the best configuration under different random seeds and
# test whether its mean accuracy is above chance level.

# Use best Nt and Np from Section 3.1
best_Nt = int(best_model.Nt)
best_Np = int(best_model.Np)

# Number of repeated runs; each run uses its run index as the forest's seed.
n_runs = 15
accuracies = []

print(f"\nRunning best Random Forest model (Nt={best_Nt}, Np={best_Np}) for {n_runs} random seeds:\n")

for i in range(n_runs):
    # Only the forest's random_state changes between runs.
    # NOTE(review): with an integer cv the fold assignment is presumably the
    # same in every run (unshuffled splitter), so the spread reflects forest
    # randomness only - confirm against the scikit-learn version in use.
    rf = RandomForestClassifier(n_estimators=best_Nt, max_features=best_Np, random_state=i)
    score = cross_val_score(rf, X, y, cv=5, scoring='accuracy').mean()
    accuracies.append(score)
    print(f"Run {i+1:02d}: Accuracy = {score:.4f}")

mean_acc = np.mean(accuracies)
# Sample standard deviation (ddof=1): the runs are a sample of possible seeds,
# and this matches the n-1 variance estimate used by the t-test below.
std_acc = np.std(accuracies, ddof=1)
# Chance level for guessing uniformly among the classes present in y.
# NOTE(review): this assumes roughly balanced classes; with imbalanced classes
# the majority-class rate would be a stricter baseline.
chance = 1 / len(np.unique(y))

# One-sided t-test vs. chance level. The alternative hypothesis is
# "mean accuracy > chance", which is what the verdict below reports; the
# default two-sided test would also flag accuracies significantly BELOW chance.
t_stat, p_value = ttest_1samp(accuracies, chance, alternative='greater')
print(f"\nMean Accuracy: {mean_acc:.4f}")
print(f"Standard Deviation: {std_acc:.4f}")
print(f"T-test vs. chance ({chance:.4f}): t = {t_stat:.2f}, p = {p_value:.4e}")

if p_value < 0.05:
    print("Significantly better than chance")
else:
    print("Not significantly better than chance")

