## Подготовка данных

Добавим в стандартный датасет Iris 20 случайных признаков с разными распределениями. Таким образом, мы сможем отследить качесвто и важность использования метода главных компонент.

In [33]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd
import time
import warnings

warnings.filterwarnings("ignore")

iris = load_iris()
X_base = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

np.random.seed(42)  
random_features = pd.DataFrame()

for i in range(5):
    random_features[f'normal_{i+1}'] = np.random.normal(0, 1, size=X_base.shape[0])
    random_features[f'uniform_{i+1}'] = np.random.uniform(0, 1, size=X_base.shape[0])
    random_features[f'exponential_{i+1}'] = np.random.exponential(1.0, size=X_base.shape[0])
    random_features[f'lognormal_{i+1}'] = np.random.lognormal(0, 1, size=X_base.shape[0])

X_combined = pd.concat([X_base, random_features], axis=1)

scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

X_train, X_test, y_train, y_test = train_test_split(X_combined_scaled, y, test_size=0.3, random_state=42)

Напишим функция для обучения, предсказания и снятия метрик модели многослойного перцептрона.

In [34]:
def evaluate_model(X_train, X_test, y_train, y_test):
    model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=300, random_state=42)
    
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    metrics = {
        'Training Time (s)': training_time,
        'Train Accuracy': accuracy_score(y_train, y_pred_train),
        'Test Accuracy': accuracy_score(y_test, y_pred_test)
    }
    
    return metrics


Результаты до использования метода главных компонент

In [35]:
from IPython.display import display, Markdown

def display_metrics(metrics, title="Метрики"):
    metrics_str = f"### {title}\n"
    for key, value in metrics.items():
        metrics_str += f"- **{key}**: {value}\n"
    display(Markdown(metrics_str))

original_metrics = evaluate_model(X_train, X_test, y_train, y_test)
display_metrics(original_metrics, title="Метрики до применения PCA")

### Метрики до применения PCA
- **Training Time (s)**: 0.0415043830871582
- **Train Accuracy**: 0.9428571428571428
- **Test Accuracy**: 0.6666666666666666


In [36]:
from FastPCA import PCASturm
pca_sturm = PCASturm(X_train)  
N = 5
X_train_pca, eigenvalues, eigenvectors = pca_sturm.pca_sturm(n_components=N)
X_test_pca = (X_test @ eigenvectors[:, :N])
pca_metrics = evaluate_model(X_train_pca, X_test_pca, y_train, y_test)
display_metrics(pca_metrics, title="Метрики после применения PCA")



### Метрики после применения PCA
- **Training Time (s)**: 0.03858780860900879
- **Train Accuracy**: 0.8761904761904762
- **Test Accuracy**: 0.9111111111111111


In [38]:
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)

explained_variance_df = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(len(eigenvalues))],
    'Explained Variance Ratio': explained_variance_ratio
})
explained_variance_df

Unnamed: 0,Principal Component,Explained Variance Ratio
0,PC1,0.2
1,PC2,0.2
2,PC3,0.2
3,PC4,0.2
4,PC5,0.2
