In [47]:
# Import libraries
import pandas as pd 
from ucimlrepo import fetch_ucirepo
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [48]:
# getting the multi-class dataset from UCI library
adult = fetch_ucirepo(id=2)

In [57]:
# Convert to pandas dataframe
data = pd.DataFrame(data=adult.data.features, columns=adult.variables)
data['target'] = adult.target

In [50]:
# Splitting the dataset into 10 samples (assuming the dataset size is small for now)
sampled_data = [data.sample(frac=0.1, random_state=i) for i in range(10)]

In [51]:
# Store best results for each sample
results = []

In [52]:
# SVM parameters grid
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['linear','rbf']
}

In [None]:
# Perform optimization for each sample
for i, sample in enumerate(sampled_data):
    X = sample.drop('target', axis=1)
    y = sample['target']

    # Apply the imputer on both the training and test data
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    X_imputed = imputer.fit_transform(X)
    y = y.astype('category')
    
    
    
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=i)   
    
    
    # X_train = imputer.fit_transform(X_train)
    # X_test = imputer.transform(X_test) 

    # Grid search for SVM optimization
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=0)
    grid.fit(X_train, y_train)
    
    # Best SVM model
    best_svm = grid.best_estimator_
    y_pred = best_svm.predict(X_test)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Append results
    results.append({
        'Sample': f'S{i+1}',
        'Best Accuracy': accuracy,
        'Best Parameters': grid.best_params_
    })

AttributeError: 'list' object has no attribute 'reset_index'

In [None]:
# Converting results to DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Plot convergence graph for best sample
best_sample_idx = results_df['Best Accuracy'].idxmax()
best_sample = sampled_data[best_sample_idx]
X = best_sample.drop('target', axis=1)
y = best_sample['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=best_sample_idx)

In [None]:
# Refit best SVM for plotting the graph
best_svm = GridSearchCV(SVC(), param_grid, refit=True, verbose=0)
best_svm.fit(X_train, y_train)

In [None]:
# Accuracy over iterations (simulation, since sklearn does not support iteration tracking)
accuracies = []
for iteration in range(1, 101):
    y_pred = best_svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

In [None]:
# Plotting convergence graph
plt.figure(figsize=(6, 4))
plt.plot(range(1, 101), accuracies, label="Accuracy over Iterations")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.title("Fitness (Best Accuracy) over Iterations")
plt.grid(True)
plt.legend()
plt.savefig("/mnt/data/convergence_graph.png")

In [None]:
# Display results DataFrame and convergence graph
results_df