## Imports

In [56]:
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.io.arff import loadarff
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Normalizer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import pandas, time, uuid, statistics
import numpy as np
import json
from joblib import dump, load

## Function Definitions

In [57]:
# Seed shared by the K-fold splitter, the train/test split and the classifier
# configurations below, so that repeated runs produce the same results.
RANDOM_STATE = 2

def train_model_holdout(X_train, x_test, y_train, y_test, classifier_class, **kwargs):
    """Fit a fresh classifier on the training split and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for scoring.
    classifier_class
        Estimator class (not an instance); it is instantiated as
        ``classifier_class(**kwargs)``.
    **kwargs
        Constructor arguments forwarded to ``classifier_class``.

    Returns
    -------
    tuple
        ``(output, classifier)`` where ``output`` is a dict with the keys
        ``accuracy_score``, ``balanced_accuracy_score``, ``precision_score``,
        ``recall_score`` and ``zero-one-loss``, and ``classifier`` is the
        fitted estimator.
    """
    classifier = classifier_class(**kwargs)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(x_test)

    # Precision and recall are averaged over every class present in the data.
    # Restricting `labels` to the predicted classes would silently drop classes
    # the model never predicts and inflate both scores; zero_division=0 already
    # scores such classes as 0 without emitting warnings.
    output = {
        "accuracy_score": sklearn.metrics.accuracy_score(y_test, y_pred, normalize=True),
        "balanced_accuracy_score": sklearn.metrics.balanced_accuracy_score(y_test, y_pred, adjusted=True),
        "precision_score": sklearn.metrics.precision_score(y_test, y_pred, average="weighted", zero_division=0),
        "recall_score": sklearn.metrics.recall_score(y_test, y_pred, average="weighted", zero_division=0),
        "zero-one-loss": sklearn.metrics.zero_one_loss(y_test, y_pred, normalize=True),
    }

    return output, classifier

def train_model_cross_val(classifier_class, X, y, splits, shuffle_fold, **kwargs):
    """Evaluate a classifier with k-fold cross-validation.

    Parameters
    ----------
    classifier_class
        Estimator class (not an instance); instantiated as
        ``classifier_class(**kwargs)``.
    X, y
        Full feature matrix and label vector to split into folds.
    splits
        Number of folds.
    shuffle_fold
        Whether the samples are shuffled before being split into folds.
    **kwargs
        Constructor arguments forwarded to ``classifier_class``.

    Returns
    -------
    The per-fold scores exactly as returned by ``cross_val_score``.
    """
    classifier = classifier_class(**kwargs)

    # KFold raises a ValueError when random_state is set while shuffle is off,
    # so the shared seed is only passed when shuffling is requested.
    k_fold = KFold(
        n_splits=splits,
        shuffle=shuffle_fold,
        random_state=RANDOM_STATE if shuffle_fold else None,
    )

    return cross_val_score(classifier, X, y, cv=k_fold)

def preprocessing_convert_to_label(frame, column, label_map = None):
    """Replace a nominal column with codes, modifying ``frame`` in place.

    When ``label_map`` is a non-empty mapping, every value of the column is
    translated through it. Otherwise a ``LabelEncoder`` is fitted on the column
    and its output is stored instead. Nothing is returned.
    """
    if not label_map:
        # No (or an empty) mapping supplied: derive the codes from the values
        # that actually occur in the column.
        frame[column] = LabelEncoder().fit_transform(frame[column])
        return

    original_values = frame[column]
    frame[column] = original_values.map(label_map)


def preprocessing_convert_to_one_hot_encoding(frame, column) -> pandas.DataFrame:
    """Return a copy of ``frame`` in which ``column`` is replaced by one-hot columns.

    The input frame is not modified. The new indicator columns are named by
    ``OneHotEncoder.get_feature_names_out([column])`` and are appended after
    the remaining original columns.
    """
    one_hot_encoder = OneHotEncoder()
    encoded_column = one_hot_encoder.fit_transform(frame[[column]])

    # Reuse the caller's index. With a fresh RangeIndex, concat(axis=1) aligns
    # on index labels and would pad rows with NaN for any frame whose index is
    # not exactly 0..n-1 (e.g. after filtering or a train/test split).
    encoded_df = pandas.DataFrame(
        encoded_column.toarray(),
        columns=one_hot_encoder.get_feature_names_out([column]),
        index=frame.index,
    )

    # Drop the source column and append the indicator columns.
    return pandas.concat([frame.drop(column, axis=1), encoded_df], axis=1)

def preprocessing_apply_normalize_scaling(frame, column):
    """Standardize ``frame[column]`` in place: subtract the mean, divide by the standard deviation.

    A constant column (standard deviation 0) is mapped to all zeros instead of
    NaN, so downstream classifiers do not receive undefined values.

    Original author note: scikit-learn's ``Normalizer`` was tried first but
    only returned an array of ones, hence the manual computation.
    """
    mean = np.mean(frame[column])
    sd = np.std(frame[column])

    # Guard against 0/0 for constant columns; dividing by 1 leaves them at 0.
    if sd == 0:
        sd = 1.0

    frame[column] = (frame[column] - mean) / sd

def preprocessing_apply_minmax_scaling(frame, column):
    """Rescale ``frame[column]`` in place using a ``MinMaxScaler`` with default settings."""
    # Double brackets keep the selection two-dimensional (a one-column frame)
    # when it is handed to the scaler.
    column_as_frame = frame[[column]]
    frame[column] = MinMaxScaler().fit_transform(column_as_frame)


def preprocessing_impute_missing_with_mode(frame: pandas.DataFrame, column: str, missingValue: str | bytes):
    mode = statistics.mode(frame[frame[column] != missingValue][column]) 
    frame.loc[frame[column] == missingValue, column] = mode


## [Choice] Enable/Disable Scaling

### Normalization

In [59]:
# "normal": the mushroom and reviews cells scale their numeric columns with
# preprocessing_apply_normalize_scaling; the Seattle cell only checks that
# SCALING is not None.
SCALING = "normal"

### MinMax Scaling

In [37]:
# "minmax": the mushroom and reviews cells scale their numeric columns with
# preprocessing_apply_minmax_scaling; the Seattle cell only checks that
# SCALING is not None.
SCALING = "minmax"

### No Scaling

In [None]:
# None: the dataset cells skip their scaling step entirely.
SCALING = None

## [Choice] One-Hot Encoding/Labeling

### Labeling

In [27]:
# True: nominal columns are encoded with preprocessing_convert_to_label.
LABELING = True

### One hot encoding

In [60]:
# False: nominal columns are encoded with preprocessing_convert_to_one_hot_encoding.
LABELING = False

## [Choice] Missing Values for nominals

### Impute with mode (German: "Modus")

In [61]:
# True: before encoding, missing entries of a nominal column are replaced by
# that column's mode (preprocessing_impute_missing_with_mode).
IMPUTE = True

### Treat 'missing' as a new value

In [5]:
# False: the missing-value marker is left in place and is encoded like any
# other value of the column.
IMPUTE = False

## [Choice] Dataset Selection + Preprocessing

### Preprocessing Seattle
Report number is dropped completely; Occurred_Time and Reported_Time are scaled by dividing by 2400; Crime_Subcategory, Precinct, Sector, Beat and Neighborhood are converted to labels (or one-hot encoded).

Important values: IMPUTE, LABELING, SCALING

In [62]:
dataset_name = "seattle"

# Forward slashes are accepted on Windows as well as on POSIX systems.
data = loadarff("data/seattle.arff")
df = pandas.DataFrame(data[0])
y = pandas.DataFrame(df["Primary_Offense_Description"])
x = df.drop("Primary_Offense_Description", axis=1)

# x / y keep the raw data; x1 / y1 are the preprocessed versions.
# The report number is removed as an unnecessary column.
x1 = x.drop("Report_Number", axis=1)
y1 = y.copy()

time_columns = ["Occurred_Time", "Reported_Time"]

# Whichever SCALING mode is selected, the two time columns are simply divided
# by 2400 (original author note: normalization makes no sense here).
if SCALING is not None:
    x1[time_columns] = x1[time_columns] / 2400

# Nominal columns, paired position by position with the marker each of them
# uses for a missing entry.
missing_values = [b"?", b'UNKNOWN', b'?', b'?', b"UNKNOWN"]
to_label = ["Crime_Subcategory", "Precinct", "Sector", "Beat", "Neighborhood"]

for missing_val, column in zip(missing_values, to_label):
    if IMPUTE:
        preprocessing_impute_missing_with_mode(x1, column, missing_val)

    if LABELING:
        preprocessing_convert_to_label(x1, column)
    else:
        x1 = preprocessing_convert_to_one_hot_encoding(x1, column)

# Encode the target and flatten it to a 1-D array.
preprocessing_convert_to_label(y1, "Primary_Offense_Description")
y1 = y1['Primary_Offense_Description'].values.ravel()

# Fill missing times with the column mean. Only the two time columns are
# imputed; fitting the imputer on the whole frame (which can be very wide
# after one-hot encoding) added cost without changing the result.
imputer = SimpleImputer(strategy='mean')
x1[time_columns] = imputer.fit_transform(x1[time_columns])

### Second Mushrooms
Preprocessing: cap-diameter, stem-height (mean?), stem-width (mean?) scaling; cap-shape, cap-surface, cap-color, stem-color, gill-attachment, gill-spacing, gill-color, stem-root, stem-surface, veil-type, does-bruise-or-bleed, veil-color, has-ring, ring-type, spore-print-color, habitat, season labeling/one-hot. As missing values occur only in nominal attributes, they are treated as a value of their own.

Important values: IMPUTE, LABELING, SCALING

In [38]:
dataset_name = "mushroom"

# Forward slashes are accepted on Windows as well as on POSIX systems.
# keep_default_na=False leaves empty fields as empty strings, which is the
# missing-value marker passed to the imputation helper below.
df = pandas.read_csv("data/mushroom/secondary_data.csv", sep=";", na_values="nan", keep_default_na=False)
x = df.drop("class", axis=1, inplace=False)
y = df["class"]

# x / y keep the raw data; x1 / y1 are the preprocessed versions.
x1 = x.copy()
y1 = y.copy()

# Nominal columns: optionally impute, then label- or one-hot-encode.
to_label = ['cap-shape', 'cap-surface', 'cap-color', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-color' ,'stem-surface', 'veil-type', 'does-bruise-or-bleed', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']
for column in to_label:
    if IMPUTE:
        preprocessing_impute_missing_with_mode(x1, column, "")
    if LABELING:
        preprocessing_convert_to_label(x1, column)
    else:
        x1 = preprocessing_convert_to_one_hot_encoding(x1, column)

# Numeric columns: scale according to the selected SCALING mode.
to_scale = ["cap-diameter", "stem-height", "stem-width"]

if SCALING is not None:
    for i in to_scale:
        if SCALING == "normal":
            preprocessing_apply_normalize_scaling(x1, i)
        elif SCALING == "minmax":
            preprocessing_apply_minmax_scaling(x1, i)

# Encode the target classes as integers.
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(y1)

### Congressional Voting
Preprocessing: Labeling all attributes and handling missing values

Important values: IMPUTE, LABELING

In [49]:
dataset_name = "congress"

# Forward slashes are accepted on Windows as well as on POSIX systems.
df = pandas.read_csv("data/congress/CongressionalVotingID.shuf.lrn.csv", sep=",")
x = df.drop("class", axis=1, inplace=False).drop("ID", axis=1, inplace=False)
y = df["class"]

# x / y keep the raw data; x1 / y1 are the preprocessed versions.
x1 = x.copy()
y1 = y.copy()

# Every remaining column is nominal. The column list is captured before the
# loop, so replacing x1 during one-hot encoding does not affect the iteration.
to_label = x1.columns

for i in to_label:
    if IMPUTE:
        preprocessing_impute_missing_with_mode(x1, i, "unknown")

    if LABELING:
        # Fixed mapping so that the codes are identical across all columns.
        preprocessing_convert_to_label(x1, i, {"y": 1, "n": 0, "unknown": 2})
    else:
        x1 = preprocessing_convert_to_one_hot_encoding(x1, i) # I am not sure how much one hot encoding makes sense here

# Encode the target classes as integers.
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(y1)

### Reviews
Preprocessing: Mostly scaling, no nominal values (except for target) and no missing values(?)

Important values: SCALING

In [70]:
dataset_name = "reviews"

# Forward slashes are accepted on Windows as well as on POSIX systems.
df = pandas.read_csv("data/reviews/amazon_review_ID.shuf.lrn.csv", sep=",")
x = df.drop("Class", axis=1, inplace=False).drop("ID", axis=1, inplace=False)
y = df["Class"]

# Extra feature: the row-wise sum over all feature columns.
x["Sums"] = x.sum(axis=1)

# x / y keep the raw data; x1 / y1 are the preprocessed versions.
x1 = x.copy()
y1 = y.copy()

# Every column (including "Sums") is scaled according to the SCALING mode.
to_scale = x1.columns

if SCALING is not None:
    for i in to_scale:
        if SCALING == "normal":
            preprocessing_apply_normalize_scaling(x1, i)
        elif SCALING == "minmax":
            preprocessing_apply_minmax_scaling(x1, i)

# Encode the target classes as integers.
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(y1)


 ### [Optional] Display Data Before and After Preprocessing

In [14]:
# Show the raw features/target first, then their preprocessed counterparts.
display(x, y, x1, y1)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9992,V9993,V9994,V9995,V9996,V9997,V9998,V9999,V10000,Sums
0,17,4,8,8,9,4,0,2,3,5,...,0,0,0,0,0,0,0,1,1,4057
1,21,9,5,8,6,2,16,3,12,6,...,0,0,0,2,2,1,0,1,0,5609
2,9,7,6,3,8,2,9,4,4,5,...,0,0,0,0,0,0,0,1,1,3204
3,8,3,5,2,4,3,8,2,4,4,...,0,0,1,0,1,0,0,0,0,3488
4,15,8,8,4,7,8,4,7,1,3,...,0,0,0,0,0,0,0,0,0,4916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,8,1,5,6,0,5,2,2,2,4,...,0,0,0,1,0,0,0,1,0,3210
746,13,7,3,4,7,4,2,3,5,2,...,0,0,0,0,0,1,0,0,0,5548
747,13,8,6,6,11,0,11,2,4,1,...,1,0,0,0,0,0,1,0,0,4690
748,14,16,7,8,11,8,9,3,7,7,...,0,0,0,0,0,0,0,0,0,5698


0           Shea
1          Riley
2        Chachra
3        Agresti
4          Nigam
         ...    
745    Calvinnme
746         Shea
747     Cholette
748      Sherwin
749       Janson
Name: Class, Length: 750, dtype: object

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9992,V9993,V9994,V9995,V9996,V9997,V9998,V9999,V10000,Sums
0,0.53125,0.210526,0.40,0.40,0.428571,0.266667,0.000000,0.142857,0.230769,0.277778,...,0.0,0.0,0.000,0.0,0.000000,0.000000,0.00,0.333333,0.25,0.571818
1,0.65625,0.473684,0.25,0.40,0.285714,0.133333,0.761905,0.214286,0.923077,0.333333,...,0.0,0.0,0.000,0.4,0.666667,0.333333,0.00,0.333333,0.00,0.836258
2,0.28125,0.368421,0.30,0.15,0.380952,0.133333,0.428571,0.285714,0.307692,0.277778,...,0.0,0.0,0.000,0.0,0.000000,0.000000,0.00,0.333333,0.25,0.426478
3,0.25000,0.157895,0.25,0.10,0.190476,0.200000,0.380952,0.142857,0.307692,0.222222,...,0.0,0.0,0.125,0.0,0.333333,0.000000,0.00,0.000000,0.00,0.474868
4,0.46875,0.421053,0.40,0.20,0.333333,0.533333,0.190476,0.500000,0.076923,0.166667,...,0.0,0.0,0.000,0.0,0.000000,0.000000,0.00,0.000000,0.00,0.718180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0.25000,0.052632,0.25,0.30,0.000000,0.333333,0.095238,0.142857,0.153846,0.222222,...,0.0,0.0,0.000,0.2,0.000000,0.000000,0.00,0.333333,0.00,0.427500
746,0.40625,0.368421,0.15,0.20,0.333333,0.266667,0.095238,0.214286,0.384615,0.111111,...,0.0,0.0,0.000,0.0,0.000000,0.333333,0.00,0.000000,0.00,0.825865
747,0.40625,0.421053,0.30,0.30,0.523810,0.000000,0.523810,0.142857,0.307692,0.055556,...,0.2,0.0,0.000,0.0,0.000000,0.000000,0.25,0.000000,0.00,0.679673
748,0.43750,0.842105,0.35,0.40,0.523810,0.533333,0.428571,0.214286,0.538462,0.388889,...,0.0,0.0,0.000,0.0,0.000000,0.000000,0.00,0.000000,0.00,0.851423


array([43, 41,  9,  0, 38, 34, 15, 35,  1,  4, 46, 19,  9, 21, 38, 29, 19,
       14, 18,  8, 11, 43, 13, 26, 39, 15, 19,  8, 17, 17, 49, 31, 20, 34,
       17, 23,  5,  2, 44, 13, 13, 12, 29, 32, 45, 48, 39,  1, 13, 39,  1,
       45, 38,  9, 33,  3, 10, 46,  8, 21, 10, 12, 24, 48, 48, 23, 22, 14,
        3, 11,  2, 12,  9, 22, 44, 25, 23, 25, 49,  7, 12, 27, 14, 33, 38,
       11,  5,  0, 16, 30, 40, 26, 34, 25, 43,  1, 43, 42,  9, 38, 33, 22,
       14, 42, 43, 15, 46, 14,  0, 29, 41,  4,  4, 36, 43, 25, 14, 39, 40,
        9, 28, 41, 49, 23, 10, 11, 36, 11, 10, 31,  7, 37, 45, 32, 36, 36,
        3, 14, 39, 26, 19, 44, 11, 15, 43, 18, 44, 44,  4,  6,  6, 15, 17,
       42, 35, 44, 45, 28, 15, 34,  6, 33, 30, 22, 49, 41, 49, 10, 31, 49,
       24,  2, 40, 49, 49,  7, 41, 46, 13, 42, 46, 12,  3, 44, 34, 47, 28,
       18, 13, 26,  1,  7,  5, 47, 35, 37,  9, 48, 33,  1, 23, 24, 45, 44,
       47,  3, 24,  8, 41, 40, 43, 33,  6, 34, 24,  1,  9, 44,  6, 47,  3,
        3, 20, 31,  8, 37

## [Choice + Config] Classifier + Parameter

### Multi Layer Perceptron

In [71]:
# Estimator class and file-name prefix picked up by the training and saving cells.
selected_classifier = MLPClassifier
name_prefix = "mlp"

# Constructor arguments for the classifier. Each comment lists the accepted
# values and the library default for that parameter.
kwargs = dict(
    hidden_layer_sizes=(100,),  # array-like of shape (n_layers - 2,), default=(100,)
    activation="logistic",  # {"identity", "logistic", "tanh", "relu"}, default="relu"
    alpha=0.0001,  # float, default=0.0001
    batch_size="auto",  # int, default="auto"
    solver="adam",  # {"lbfgs", "sgd", "adam"}, default="adam"
    learning_rate="constant",  # {"constant", "invscaling", "adaptive"}, default="constant"; only matters if solver="sgd"
    learning_rate_init=0.001,  # float, default=0.001; only when solver="sgd" or "adam"
    power_t=0.5,  # float, default=0.5; when solver="sgd"
    max_iter=100,  # int, default=200
    shuffle=True,  # bool, default=True
    random_state=RANDOM_STATE,  # int, RandomState instance, default=None
    tol=0.0001,  # float, default=1e-4
    verbose=True,  # bool, default=False
    warm_start=False,  # bool, default=False
    momentum=0.9,  # float, default=0.9; only when solver="sgd"
    nesterovs_momentum=True,  # bool, default=True; only when momentum > 0 and solver="sgd"
    early_stopping=False,  # bool, default=False; only solver="sgd" or "adam"
    validation_fraction=0.1,  # float, default=0.1
    beta_1=0.9,  # float, default=0.9; only solver="adam"
    beta_2=0.999,  # float, default=0.999; only solver="adam"
    epsilon=1e-08,  # float, default=1e-8; only solver="adam"
    n_iter_no_change=10,  # int, default=10; only solver="sgd" or "adam"
    max_fun=15000,  # int, default=15000; only solver="lbfgs"
)

### k-nn

In [39]:
# Estimator class and file-name prefix picked up by the training and saving cells.
selected_classifier = KNeighborsClassifier
name_prefix = "knn"

# Constructor arguments for the classifier. Each comment lists the accepted
# values and the library default for that parameter.
kwargs = dict(
    n_neighbors=5,  # int, default=5
    weights="distance",  # {"uniform", "distance"}, callable, or None, default="uniform"
    algorithm="ball_tree",  # {"auto", "ball_tree", "kd_tree", "brute"}, default="auto"
    leaf_size=1,  # int, default=30; used by the kd_tree and ball_tree algorithms
    p=2,  # float, default=2
    metric="minkowski",  # str or callable, default="minkowski"
    metric_params=None,  # dict, default=None
    n_jobs=None,  # int, default=None
)

### Random Forest

In [50]:
# Estimator class and file-name prefix picked up by the training and saving cells.
selected_classifier = RandomForestClassifier
name_prefix = "rf"

# Constructor arguments for the classifier. Each comment lists the accepted
# values and the library default for that parameter.
kwargs = dict(
    n_estimators=30,  # int, default=100
    criterion="log_loss",  # {"gini", "entropy", "log_loss"}, default="gini"
    max_depth=50,  # int, default=None; CAUTION (author note): the default None is dangerous for the PC
    min_samples_split=2,  # int or float, default=2
    min_samples_leaf=1,  # int or float, default=1
    min_weight_fraction_leaf=0.0,  # float, default=0.0
    max_features="sqrt",  # "sqrt", "log2", None, int, or float, default="sqrt"
    max_leaf_nodes=None,  # int, default=None
    min_impurity_decrease=0.0,  # float, default=0.0
    bootstrap=True,  # bool, default=True
    oob_score=False,  # bool or callable, default=False
    n_jobs=-1,  # int, default=None
    random_state=RANDOM_STATE,  # int, RandomState instance, or None, default=None
    verbose=0,  # int, default=0
    warm_start=False,  # bool, default=False
    class_weight=None,  # {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
    ccp_alpha=0.0,  # non-negative float, default=0.0
    max_samples=None,  # int or float, default=None
    monotonic_cst=None,  # array-like of int of shape (n_features), default=None
)

## Training

### Holdout Model

In [51]:
# Hold out 20% of the preprocessed data for testing.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.2, random_state=RANDOM_STATE
)

# Time the fit-and-score step of the selected classifier.
start = time.time()
score1, model1 = train_model_holdout(
    x1_train, x1_test, y1_train, y1_test, selected_classifier, **kwargs
)
duration1 = time.time() - start

# Model id: the dataset name followed by a random UUID in hex form.
modelid1 = dataset_name + uuid.uuid4().hex

score1

{'accuracy_score': 0.9772727272727273,
 'balanced_accuracy_score': 0.9615384615384617,
 'precision_score': 0.9784688995215312,
 'recall_score': 0.9772727272727273,
 'zero-one-loss': 0.022727272727272707}

### k-fold cross validation

In [52]:
# Number of cross-validation folds.
splits = 5

# Time the complete cross-validation run with shuffled folds.
start = time.time()
score2 = train_model_cross_val(
    selected_classifier, x1, y1, splits, shuffle_fold=True, **kwargs
)
duration2 = time.time() - start

# Display the per-fold scores together with their mean.
(score2, np.mean(score2))

(array([0.97727273, 0.97727273, 0.93181818, 0.97674419, 0.95348837]),
 0.9633192389006343)

## Save results

### Settings and accuracies

In [53]:
# Forward slashes are accepted on Windows as well as on POSIX systems.
report_path = f"reports/{name_prefix}_{dataset_name}.json"


def add_quotes(val):
    """Wrap ``val`` in double quotes.

    No longer used by this cell (the entry is serialized with ``json.dumps``);
    kept so that any other cell calling it keeps working.
    """
    return '"' + val + '"'


# The report is treated as a line-oriented JSON array: a "[" line, one entry
# per line, and a closing "]" line. A missing or empty file starts a new array
# instead of failing on the first run.
try:
    with open(report_path, "r") as f:
        contents = f.readlines()
except FileNotFoundError:
    contents = []

if not contents:
    contents = ["[\n", "]\n"]

# An existing last entry needs a trailing comma before another one follows it.
if len(contents) > 2:
    contents[-2] = contents[-2].rstrip("\r\n") + ",\n"

# Build the entry as a dict and let json.dumps serialize it. Hand-formatting
# wrote Python's `None` instead of `null` and produced invalid arrays from the
# printed form of the NumPy score array.
entry = {
    "id": modelid1,
    "Settings": kwargs,
    "Scaling": SCALING,
    "Labeling": str(LABELING),  # stored as the strings "True"/"False", as in existing reports
    "Impute": str(IMPUTE),
    "Holdout": {"Duration": duration1, "Score": score1},
    "CV": {
        "Duration": duration2,
        "Accuracy": np.asarray(score2).tolist(),
        "Mean": float(np.mean(score2)),
    },
}

# Insert the new entry just before the closing bracket line.
contents.insert(-1, "\t" + json.dumps(entry) + "\n")

with open(report_path, "w") as f:
    f.write("".join(contents))

### Model

In [55]:
# Persist the fitted holdout model under models/<classifier prefix>/<model id>.joblib.
# Forward slashes are accepted on Windows as well as on POSIX systems.
dump(model1, f"models/{name_prefix}/{modelid1}.joblib")

['models\\rf\\congress2f78414fb1cb4e7b8353d34eb6cf28cb.joblib']

## Load Model