In [1]:
# Fall 2023 COGS 118A Final Project
# Raul Martinez Beltran

In [2]:
# Grab datasets from UCI: https://archive.ics.uci.edu/datasets?skip=0&take=10&sort=desc&orderBy=NumHits&search=&Python=true

In [3]:
# Import UCI datasets
from ucimlrepo import fetch_ucirepo

# Import SVM from scikit-learn https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
from sklearn import svm

# Import Decision Tree from scikit-learn https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
from sklearn import tree

# Import K Nearest Neighbors from scikit-learn https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Import Random Forest from scikit-learn https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

# All other imports
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
# Grab each of the datasets
datasets = {}

# Spambase dataset
data_spambase = fetch_ucirepo(id=94)
datasets["spambase"] = data_spambase.data.original

# Adult income database
data_rice = fetch_ucirepo(id=545)
datasets["rice"] = data_rice.data.original

# Bank marketing database
data_senior = fetch_ucirepo(id=887)
datasets["senior"] = data_senior.data.original

In [5]:
for set in datasets:
    print(
        "Number of missing data in {}: {}".format(
            set, datasets[set].isnull().any(axis=1).sum()
        )
    )

Number of missing data in spambase: 0
Number of missing data in rice: 0
Number of missing data in senior: 0


In [6]:
# Drop any missing data
for set in datasets:
    if datasets[set].isnull().values.any():
        datasets[set] = datasets[set].dropna()

In [7]:
def transform_senior_gender(input):
    if input == 1:
        return 1, 0
    else:
        return 0, 1


def transform_senior_activity(input):
    if input == 1:
        return 1, 0
    else:
        return 0, 1

In [8]:
# Cleaning of Senior dataset
datasets["senior"]["is_male"], datasets["senior"]["is_female"] = zip(
    *datasets["senior"]["RIAGENDR"].apply(transform_senior_gender)
)
datasets["senior"]["is_active"], datasets["senior"]["not_active"] = zip(
    *datasets["senior"]["age_group"].apply(transform_senior_activity)
)
datasets["senior"] = datasets["senior"].drop(columns=["RIAGENDR", "PAQ605"])

In [9]:
# Rerranging senior dataset class to end
datasets["senior"] = datasets["senior"].rename(columns={"age_group": "Class"})
column_to_move = datasets["senior"]["Class"]
datasets["senior"] = datasets["senior"].drop(columns=["Class"])
datasets["senior"].insert(len(datasets["senior"].dtypes), "Class", column_to_move)

# Rearranging rice dataset class to end

In [10]:
# Settings including hyperparameters and number of runs
num_trials = 3
n_jobs = 4
param_grid_svm = {"C": [0.01, 0.1, 1, 10, 100, 1000]}
param_grid_tree = {"max_depth": [3, 5, 10, 15]}
param_grid_knn = {"n_neighbors": [1, 3, 5, 7, 9]}
classifiers = ["svm", "tree", "knn", "rf"]

In [11]:
# Used for 20/80 split
test_size = 0.8

acc_20_80 = {}
f1_20_80 = {}

for data in datasets:
    for clf in classifiers:
        acc_20_80[clf] = 0
        f1_20_80[clf] = 0
    print(data)
    for trial in range(num_trials):
        grid_svm = GridSearchCV(
            svm.SVC(kernel="rbf"), refit=True, param_grid=param_grid_svm, n_jobs=n_jobs
        )
        grid_tree = GridSearchCV(
            tree.DecisionTreeClassifier(criterion="entropy"),
            refit=True,
            param_grid=param_grid_tree,
            n_jobs=n_jobs,
        )
        grid_knn = GridSearchCV(
            KNeighborsClassifier(weights="distance"),
            refit=True,
            param_grid=param_grid_knn,
            n_jobs=n_jobs,
        )
        grid_rf = GridSearchCV(
            RandomForestClassifier(criterion="entropy"),
            refit=True,
            param_grid=param_grid_tree,
            n_jobs=n_jobs,
        )
        X_train, X_test, y_train, y_test = train_test_split(
            datasets[data].iloc[:, 0 : len(datasets[data].dtypes) - 1],
            datasets[data]["Class"],
            test_size=test_size,
            shuffle=True,
        )
        grid_svm.fit(X_train, y_train)
        grid_tree.fit(X_train, y_train)
        grid_knn.fit(X_train, y_train)
        grid_rf.fit(X_train, y_train)

        svm_predictions = grid_svm.predict(X_test)
        tree_predictions = grid_tree.predict(X_test)
        knn_predictions = grid_knn.predict(X_test)
        rf_predictions = grid_rf.predict(X_test)

        report_svm = classification_report(y_test, svm_predictions, output_dict=True)
        report_tree = classification_report(y_test, tree_predictions, output_dict=True)
        report_knn = classification_report(y_test, knn_predictions, output_dict=True)
        report_rf = classification_report(y_test, rf_predictions, output_dict=True)

        acc_20_80["svm"] = acc_20_80["svm"] + report_svm["accuracy"]
        f1_20_80["svm"] = f1_20_80["svm"] + report_svm["macro avg"]["f1-score"]
        acc_20_80["tree"] = acc_20_80["tree"] + report_tree["accuracy"]
        f1_20_80["tree"] = f1_20_80["tree"] + report_tree["macro avg"]["f1-score"]
        acc_20_80["knn"] = acc_20_80["knn"] + report_knn["accuracy"]
        f1_20_80["knn"] = f1_20_80["knn"] + report_knn["macro avg"]["f1-score"]
        acc_20_80['rf'] = acc_20_80['rf'] + report_rf['accuracy']
        f1_20_80['rf'] = f1_20_80['rf'] + report_rf['macro avg']['f1-score']

    print(acc_20_80["svm"] / num_trials)
    print(f1_20_80["svm"] / num_trials)
    print(acc_20_80["tree"] / num_trials)
    print(f1_20_80["tree"] / num_trials)
    print(acc_20_80["knn"] / num_trials)
    print(f1_20_80["knn"] / num_trials)
    print(acc_20_80['rf'] / num_trials)
    print(f1_20_80['rf'] / num_trials)

spambase
0.8669745540161188
0.8586322827779336
0.8868061215249479
0.8807332966114984
0.7671828307525129
0.7555305627959606
0.9309064565788283
0.9266329639635996
rice
0.8905293088363955
0.8858685243412535
0.9230096237970254
0.9217155935653283
0.8757655293088363
0.8715211598542503
0.9243219597550306
0.9227218108536684
senior


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.8368988846224172
0.4556023970591505
1.0
1.0
0.8370817334064728
0.5336306383822046
0.996891570671055
0.9940484180806258


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Used for 50/50 split
test_size = 0.5

acc_50_50 = {}
f1_50_50 = {}

for data in datasets:
    for clf in classifiers:
        acc_50_50[clf] = 0
        f1_50_50[clf] = 0
    print(data)
    for trial in range(num_trials):
        grid_svm = GridSearchCV(
            svm.SVC(kernel="rbf"), refit=True, param_grid=param_grid_svm, n_jobs=n_jobs
        )
        grid_tree = GridSearchCV(
            tree.DecisionTreeClassifier(criterion="entropy"),
            refit=True,
            param_grid=param_grid_tree,
            n_jobs=n_jobs,
        )
        grid_knn = GridSearchCV(
            KNeighborsClassifier(weights="distance"),
            refit=True,
            param_grid=param_grid_knn,
            n_jobs=n_jobs,
        )
        grid_rf = GridSearchCV(
            RandomForestClassifier(criterion="entropy"),
            refit=True,
            param_grid=param_grid_tree,
            n_jobs=n_jobs,
        )
        X_train, X_test, y_train, y_test = train_test_split(
            datasets[data].iloc[:, 0 : len(datasets[data].dtypes) - 1],
            datasets[data]["Class"],
            test_size=test_size,
            shuffle=True,
        )
        grid_svm.fit(X_train, y_train)
        grid_tree.fit(X_train, y_train)
        grid_knn.fit(X_train, y_train)
        grid_rf.fit(X_train, y_train)

        svm_predictions = grid_svm.predict(X_test)
        tree_predictions = grid_tree.predict(X_test)
        knn_predictions = grid_knn.predict(X_test)
        rf_predictions = grid_rf.predict(X_test)

        report_svm = classification_report(y_test, svm_predictions, output_dict=True)
        report_tree = classification_report(y_test, tree_predictions, output_dict=True)
        report_knn = classification_report(y_test, knn_predictions, output_dict=True)
        report_rf = classification_report(y_test, rf_predictions, output_dict=True)

        acc_50_50["svm"] = acc_50_50["svm"] + report_svm["accuracy"]
        f1_50_50["svm"] = f1_50_50["svm"] + report_svm["macro avg"]["f1-score"]
        acc_50_50["tree"] = acc_50_50["tree"] + report_tree["accuracy"]
        f1_50_50["tree"] = f1_50_50["tree"] + report_tree["macro avg"]["f1-score"]
        acc_50_50["knn"] = acc_50_50["knn"] + report_knn["accuracy"]
        f1_50_50["knn"] = f1_50_50["knn"] + report_knn["macro avg"]["f1-score"]
        acc_50_50['rf'] = acc_50_50['rf'] + report_rf['accuracy']
        f1_50_50['rf'] = f1_50_50['rf'] + report_rf['macro avg']['f1-score']

    print(acc_50_50["svm"] / num_trials)
    print(f1_50_50["svm"] / num_trials)
    print(acc_50_50["tree"] / num_trials)
    print(f1_50_50["tree"] / num_trials)
    print(acc_50_50["knn"] / num_trials)
    print(f1_50_50["knn"] / num_trials)
    print(acc_50_50['rf'] / num_trials)
    print(f1_50_50['rf'] / num_trials)

spambase
0.8983050847457626
0.8920817347917213
0.9161234245980009
0.9111865934605651
0.7995074605244096
0.7884373803840389
0.946834709546574
0.9436458848815024
rice
0.9035870516185477
0.9002964515785999
0.9259842519685039
0.9242944026473457
0.8806649168853893
0.8767266696070856
0.9294838145231846
0.9276469695116975
senior


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.8369915130231197
0.45562957011994243
1.0
1.0
0.8390400936494
0.5731202283885719
1.0
1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Used for 80/20 split
test_size = 0.2

acc_80_20 = {}
f1_80_20 = {}

for data in datasets:
    for clf in classifiers:
        acc_80_20[clf] = 0
        f1_80_20[clf] = 0
    print(data)
    for trial in range(num_trials):
        grid_svm = GridSearchCV(
            svm.SVC(kernel="rbf"), refit=True, param_grid=param_grid_svm, n_jobs=n_jobs
        )
        grid_tree = GridSearchCV(
            tree.DecisionTreeClassifier(criterion="entropy"),
            refit=True,
            param_grid=param_grid_tree,
            n_jobs=n_jobs,
        )
        grid_knn = GridSearchCV(
            KNeighborsClassifier(weights="distance"),
            refit=True,
            param_grid=param_grid_knn,
            n_jobs=n_jobs,
        )
        grid_rf = GridSearchCV(
            RandomForestClassifier(criterion="entropy"),
            refit=True,
            param_grid=param_grid_tree,
            n_jobs=n_jobs,
        )
        X_train, X_test, y_train, y_test = train_test_split(
            datasets[data].iloc[:, 0 : len(datasets[data].dtypes) - 1],
            datasets[data]["Class"],
            test_size=test_size,
            shuffle=True,
        )
        grid_svm.fit(X_train, y_train)
        grid_tree.fit(X_train, y_train)
        grid_knn.fit(X_train, y_train)
        grid_rf.fit(X_train, y_train)

        svm_predictions = grid_svm.predict(X_test)
        tree_predictions = grid_tree.predict(X_test)
        knn_predictions = grid_knn.predict(X_test)
        rf_predictions = grid_rf.predict(X_test)

        report_svm = classification_report(y_test, svm_predictions, output_dict=True)
        report_tree = classification_report(y_test, tree_predictions, output_dict=True)
        report_knn = classification_report(y_test, knn_predictions, output_dict=True)
        report_rf = classification_report(y_test, rf_predictions, output_dict=True)

        acc_80_20["svm"] = acc_80_20["svm"] + report_svm["accuracy"]
        f1_80_20["svm"] = f1_80_20["svm"] + report_svm["macro avg"]["f1-score"]
        acc_80_20["tree"] = acc_80_20["tree"] + report_tree["accuracy"]
        f1_80_20["tree"] = f1_80_20["tree"] + report_tree["macro avg"]["f1-score"]
        acc_80_20["knn"] = acc_80_20["knn"] + report_knn["accuracy"]
        f1_80_20["knn"] = f1_80_20["knn"] + report_knn["macro avg"]["f1-score"]
        acc_80_20["rf"] = acc_80_20["rf"] + report_rf["accuracy"]
        f1_80_20["rf"] = f1_80_20["rf"] + report_rf["macro avg"]["f1-score"]

    print(acc_80_20["svm"] / num_trials)
    print(f1_80_20["svm"] / num_trials)
    print(acc_80_20["tree"] / num_trials)
    print(f1_80_20["tree"] / num_trials)
    print(acc_80_20["knn"] / num_trials)
    print(f1_80_20["knn"] / num_trials)
    print(acc_80_20["rf"] / num_trials)
    print(f1_80_20["rf"] / num_trials)

spambase
0.9069851610568223
0.9025402908037617
0.9102424900470503
0.9057315155555967
0.8219326818675353
0.8146394324952135
0.9507781397032211
0.9484475590895359
rice
0.9181977252843394
0.9159703631579008
0.9234470691163604
0.9217642731539687
0.8832020997375327
0.8797364806227274
0.9221347331583551
0.9203342396392687
senior


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.8421052631578947
0.4571290576781311
1.0
1.0
0.8581871345029239
0.6678961092257066
1.0
1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
