In [1]:
import pandas as pd

#read in dataset
df = pd.read_csv('C:/Users/rachr/Rowey-DATA1030-Project/wpbc.data')
df.columns = ['ID number', 'outcome', 'time', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dimension3', 'tumor_size', 'lymph_node_status']
#print(df)

In [2]:
#feature engineering
df['fractal_dimension_avg'] = df[['fractal_dimension1', 'fractal_dimension2', 'fractal_dimension3']].mean(axis=1)
df['symmetry_avg'] = df[['symmetry1', 'symmetry2', 'symmetry3']].mean(axis=1)
df['concave_points_avg'] = df[['concave_points1', 'concave_points2', 'concave_points3']].mean(axis=1)
df['concavity_avg'] = df[['concavity1', 'concavity2', 'concavity3']].mean(axis=1)
df['compactness_avg'] = df[['compactness1', 'compactness2', 'compactness3']].mean(axis=1)
df['smoothness_avg'] = df[['smoothness1', 'smoothness2', 'smoothness3']].mean(axis=1)
df['area_avg'] = df[['area1', 'area2', 'area3']].mean(axis=1)
df['perimeter_avg'] = df[['perimeter1', 'perimeter2', 'perimeter3']].mean(axis=1)
df['texture_avg'] = df[['texture1', 'texture2', 'texture3']].mean(axis=1)
df['radius_avg'] = df[['radius1', 'radius2', 'radius3']].mean(axis=1)

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#dataset
y = df['outcome']
X = df[['time', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1',
        'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
        'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
        'symmetry3', 'fractal_dimension3', 'tumor_size', 'lymph_node_status', 'compactness_avg', 'radius_avg', 'texture_avg', 'perimeter_avg', 'area_avg',
        'smoothness_avg', 'concavity_avg', 'concave_points_avg', 'symmetry_avg', 'fractal_dimension_avg']]

#convert target variable values [N, R] to numerical data [0, 1]
df['outcome'] = df['outcome'].replace({'N': 0, 'R': 1}).astype(int)

#define categorical and numerical features
categorical_features = ['lymph_node_status']
numerical_features = X.columns.difference(categorical_features)

#transformer to replace '?' with 'NA'
replace_question_marks = FunctionTransformer(lambda x: x.replace('?', np.nan))

#categorical pipeline
ord_cats = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '13', '14', '15', '16', '17', '18', '20', '21', '24', '27', 'NA']
categorical_pipeline = Pipeline([
    ('replace_missing', replace_question_marks),
    ('impute', SimpleImputer(strategy='constant', fill_value='NA')), #replace np.nan with 'NA'
    ('ordinal', OrdinalEncoder(categories=[ord_cats]))
])

#preprocess all features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

In [6]:
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.svm import SVC


#initialize pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

#parameter grid for SVC
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto'],
}

#random states
random_states = [0, 42, 123, 2024, 5678]

#store results of each random state
params_list = []
cv_scores = []
test_acc_scores = []
test_fbeta_scores = []
best_models = []

for state in random_states:
    print(f"\nRandom State: {state}")
    
    #stratified train-test split
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=state
    )
    print('Test data balance:', np.unique(y_test, return_counts=True))
    
    #stratified K-Fold cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=state)
    
    #GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(
        estimator=model_pipeline,
        param_grid=param_grid,
        cv=kf,
        scoring='accuracy',
        n_jobs=-1
    )
    
    #fit GridSearchCV on training data (X_other, y_other)
    grid_search.fit(X_other, y_other)

    #evaluate on test set
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)

    #accuracy and f-scores
    test_score = accuracy_score(y_test, y_test_pred)
    fscore = fbeta_score(y_test, y_test_pred, beta=2, average='binary')

    #store best parameters, scores, and models
    params_list.append(grid_search.best_params_)
    cv_scores.append(grid_search.best_score_)
    test_acc_scores.append(test_score)
    test_fbeta_scores.append(fscore)
    best_models.append(best_model)

    print(f"  Best Parameters for this state: {grid_search.best_params_}")
    print(f"  Best CV Score for this state: {grid_search.best_score_:.4f}")
    print(f" Test Score: {test_score:.4f}")
    print(f"  Test F-Score: {fscore:.4f}")

#select best model
ultimate_best_idx = np.argmax(cv_scores)
ultimate_best_params = params_list[ultimate_best_idx]
ultimate_best_model = best_models[ultimate_best_idx]

#print best parameters & cv score
print("\nUltimate Best Model Across Random States:")
print(f"  Best Parameters: {ultimate_best_params}")
print(f"  Best CV Score: {cv_scores[ultimate_best_idx]:.4f}")


Random State: 0
Test data balance: (array([0, 1]), array([30, 10], dtype=int64))
  Best Parameters for this state: {'classifier__C': 1, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}
  Best CV Score for this state: 0.7901
 Test Score: 0.7250
  Test F-Score: 0.2222

Random State: 42
Test data balance: (array([0, 1]), array([30, 10], dtype=int64))
  Best Parameters for this state: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
  Best CV Score for this state: 0.7833
 Test Score: 0.7250
  Test F-Score: 0.3191

Random State: 123
Test data balance: (array([0, 1]), array([30, 10], dtype=int64))
  Best Parameters for this state: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
  Best CV Score for this state: 0.8147
 Test Score: 0.8000
  Test F-Score: 0.4348

Random State: 2024
Test data balance: (array([0, 1]), array([30, 10], dtype=int64))
  Best Parameters for this state: {'classifier__C': 0.001, 'classifier_

In [7]:
#test scores summary (accuracy and fbeta scores)
mean_test_acc_score = np.mean(test_acc_scores)
std_test_acc_score = np.std(test_acc_scores)
mean_fbeta_score = np.mean(test_fbeta_scores)
std_fbeta_score = np.std(test_fbeta_scores)

print("\nTest Score Summary Across Random States:")
print(f"Mean Test Score: {mean_test_acc_score:.4f}")
print(f"Standard Deviation of Test Scores: {std_test_acc_score:.4f}")
print(f"Mean F-Score: {mean_fbeta_score:.4f}")
print(f"Standard Deviation of F-Score: {std_fbeta_score:.4f}")


Test Score Summary Across Random States:
Mean Test Score: 0.7600
Standard Deviation of Test Scores: 0.0339
Mean F-Score: 0.2634
Standard Deviation of F-Score: 0.1480
