In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Draw every matplotlib axes in this notebook on a white background.
plt.rcParams['axes.facecolor'] = 'white'

## Data loading and visualization

In [None]:
# Read the tab-separated training file (it has no header row), name column 0
# "Class", and show the resulting frame as the cell output.

train_df = (
    pd.read_csv("data/EthanolLevel_TRAIN.tsv", delimiter="\t", header=None)
    .rename(columns={0: "Class"})
)
train_df

In [None]:
# Separate the label column from the spectra, then hold out 20% of the rows as
# a validation set. Labels are shifted down by one relative to the "Class"
# column stored in the file.
train_df_spectrum = train_df.drop(columns="Class")
train_df_class = train_df["Class"] - 1

(train_set_spectrum,
 val_set_spectrum,
 train_set_class,
 val_set_class) = train_test_split(
    train_df_spectrum,
    train_df_class,
    test_size=0.2,
    random_state=42,
)

# Row counts of the two resulting splits.
print(len(train_set_spectrum), len(val_set_spectrum))

In [None]:
# Number of observations per class in the training split.
train_set_class.value_counts()

In [None]:
# Load the held-out test file the same way as the training file: no header
# row, column 0 holds the class label, and labels are shifted down by one so
# they use the same encoding as the training labels.
test_df = pd.read_csv("data/EthanolLevel_TEST.tsv", delimiter="\t", header=None)
test_df.rename(columns={0: "Class"}, inplace=True)
test_set_class = test_df['Class'] - 1
test_set_spectrum = test_df.drop("Class", axis=1)

# Show the class distribution of the test set that was just loaded (the
# training-split distribution is already displayed by the preceding cell).
test_set_class.value_counts()

In [None]:
# Donut chart of the training-split class balance, saved to pie_chart.html.
# (plotly.express is imported as `px` in the notebook's top import cell.)

label_names = {0: "Class 0 (E35)", 1: "Class 1 (E38)", 2: "Class 2 (E40)", 3: "Class 3 (E45)"}

# Name both columns explicitly: the column names that a bare
# value_counts().reset_index() produces differ between pandas releases, and the
# 'Class' / 'count' lookups below must not depend on the installed version.
value_counts = (
    train_set_class.value_counts()
    .rename_axis("Class")
    .reset_index(name="count")
)
value_counts['label'] = value_counts['Class'].map(label_names)

fig = px.pie(value_counts, values='count', names='label', hole=0.5, width=600, height=400)
fig.write_html("pie_chart.html")

In [None]:
def filter_class(spectra, classes, class_to_filter):
    """Drop every observation of one class and re-encode the remaining labels.

    Parameters
    ----------
    spectra
        Observations to filter; indexed with a boolean mask built from
        ``classes``.
    classes
        Label of each row in ``spectra``.
    class_to_filter
        Label value whose rows are removed.

    Returns
    -------
    tuple
        ``(kept_spectra, encoded_labels)``, where ``encoded_labels`` is the
        output of a ``LabelEncoder`` fitted on the labels that were kept.
    """
    keep_mask = classes != class_to_filter
    kept_labels = classes[keep_mask]
    return spectra[keep_mask], LabelEncoder().fit_transform(kept_labels)

def plot_series(class_n, color, linewidth, title):
    """Overlay all training spectra of one class on a single figure and show it.

    Reads the notebook-level ``train_set_spectrum`` and ``train_set_class``.

    Parameters
    ----------
    class_n
        Label value selecting which rows of the training split are drawn.
    color
        Line colour passed to ``plt.plot``.
    linewidth
        Line width passed to ``plt.plot``.
    title
        Figure title.
    """
    class_rows = train_set_spectrum[train_set_class == class_n]
    # One line per observation; the row index is not needed.
    for _, spectrum in class_rows.iterrows():
        plt.plot(spectrum, c=color, linewidth=linewidth)
    plt.title(title)
    plt.xlabel("Wavelength (conventional units)")
    plt.ylabel("Intensity (conventional units)")
    plt.show()

In [None]:
# Draw one figure per class, each with its own colour and title.
for series_class, series_color, series_title in (
    (0, "#636EFA", "Class 0 (E35)"),
    (1, "#EF553B", "Class 1 (E38)"),
    (2, "#00CC96", "Class 2 (E40)"),
    (3, "#AB63FA", "Class 3 (E45)"),
):
    plot_series(series_class, series_color, 0.15, series_title)

## Dimensionality reduction using PCA

In [None]:
# Fit PCA on the training split only. A fractional n_components asks
# scikit-learn to keep as many components as needed to explain that share of
# the variance (here 99.9%).
pca = PCA(n_components=0.999, random_state=42)
pca.fit(train_set_spectrum)

In [None]:
# Cumulative share of variance explained as components are added one by one.
cumulative_explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Plot against 1-based component numbers: entry i of the cumulative sum is the
# variance explained by the first i + 1 components, so a default 0-based x-axis
# would shift the curve by one component.
component_numbers = np.arange(1, len(cumulative_explained_variance_ratio) + 1)
plt.plot(component_numbers, cumulative_explained_variance_ratio, color='red')
plt.xlabel("Component")
plt.ylabel("Cumulative explained variance ratio")
# Render the figure explicitly so the cell does not echo the Text object
# returned by plt.ylabel.
plt.show()

In [None]:
# Number of components the fitted PCA actually kept.
pca.n_components_

In [None]:
# Project all three splits with the PCA that was fitted on the training split.
train_set_spectrum_pca, val_set_spectrum_pca, test_set_spectrum_pca = (
    pca.transform(split)
    for split in (train_set_spectrum, val_set_spectrum, test_set_spectrum)
)

## K Nearest Neighbors classifier

In [None]:
# KNN estimator plus the grid explored by a 5-fold cross-validated search:
# neighbourhood size, vote weighting and the Minkowski power `p`.

knn_clf = KNeighborsClassifier(n_jobs=-1)
knn_param_grid = dict(
    n_neighbors=[1, 2, 3, 4, 5, 10, 15, 20],
    weights=['uniform', 'distance'],
    p=[1, 2],
)
grid_search_cv = GridSearchCV(
    knn_clf,
    knn_param_grid,
    cv=5,
    verbose=3,
    n_jobs=-1,
)

In [None]:
# Run the KNN grid search on the raw training spectra (no dimensionality
# reduction).
grid_search_cv.fit(train_set_spectrum, train_set_class)

In [None]:
# Best hyperparameter combination found by the most recent grid-search fit.
grid_search_cv.best_params_

In [None]:
# Accuracy of the best KNN model from the grid search on each split of the raw
# spectra.
best_knn = grid_search_cv.best_estimator_
for caption, spectra, labels in (
    ("Train accuracy: ", train_set_spectrum, train_set_class),
    ("Val accuracy: ", val_set_spectrum, val_set_class),
    ("Test accuracy: ", test_set_spectrum, test_set_class),
):
    print(caption, best_knn.score(spectra, labels))

In [None]:
# Test-set predictions of the best KNN model; reused by the confusion matrix
# and classification report cells that follow.
test_predicts = grid_search_cv.best_estimator_.predict(test_set_spectrum)

In [None]:
# Confusion matrix of the KNN test-set predictions.
ConfusionMatrixDisplay.from_predictions(y_true=test_set_class, y_pred=test_predicts)
# Render explicitly, as the PCA/XGBoost confusion-matrix cell does, so the cell
# shows only the figure and not the display object's repr.
plt.show()

In [None]:
# Text classification report for the KNN test-set predictions.
print(classification_report(test_set_class, test_predicts))

In [None]:
# Repeat the same grid search on the PCA-projected training spectra. This
# refits the existing `grid_search_cv` object, so from here on its results
# refer to the PCA features rather than the raw spectra.
grid_search_cv.fit(train_set_spectrum_pca, train_set_class)

In [None]:
# Best hyperparameter combination found by the most recent grid-search fit.
grid_search_cv.best_params_

In [None]:
# Accuracy of the best KNN model from the grid search on each split of the
# PCA-projected spectra.
best_knn_pca = grid_search_cv.best_estimator_
for caption, spectra_pca, labels in (
    ("Train accuracy: ", train_set_spectrum_pca, train_set_class),
    ("Val accuracy: ", val_set_spectrum_pca, val_set_class),
    ("Test accuracy: ", test_set_spectrum_pca, test_set_class),
):
    print(caption, best_knn_pca.score(spectra_pca, labels))

## XGBoost classifier

In [None]:
def objective(trial):
    """Optuna objective: validation score of an XGBoost model on raw spectra.

    Samples one hyperparameter configuration from ``trial``, trains an
    ``XGBClassifier`` on the notebook-level ``train_set_spectrum`` /
    ``train_set_class`` and returns its ``score`` on ``val_set_spectrum`` /
    ``val_set_class``.
    """
    # Each suggestion name is also the keyword passed on to XGBClassifier.
    sampled = {}
    sampled["n_estimators"] = trial.suggest_categorical("n_estimators", [50, 100, 200, 250])
    sampled["max_depth"] = trial.suggest_categorical("max_depth", [2, 4, 5, 6, 8, 10])
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.5)
    # Both regularisation terms share the same log-uniform range.
    for reg_name in ("lambda", "alpha"):
        sampled[reg_name] = trial.suggest_float(reg_name, 1e-8, 1.0, log=True)

    model = XGBClassifier(
        tree_method='hist',
        verbosity=2,
        n_jobs=-1,
        eval_metric="merror",
        random_state=42,
        **sampled,
    )
    model.fit(train_set_spectrum, train_set_class)

    return model.score(val_set_spectrum, val_set_class)

In [None]:
# Maximise the objective over 100 trials. The sampler is seeded so that a
# fresh "Restart & Run All" explores the same sequence of trials and arrives at
# the same best parameters; TPESampler is the algorithm Optuna uses by default
# for single-objective studies, so only the seeding changes.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

In [None]:
# Hyperparameters of the best trial in the most recent study.
study.best_params

In [None]:
# Rebuild the XGBoost model with the best hyperparameters from the study and
# train it on the raw training spectra.
xgb_classifier = XGBClassifier(
    **study.best_params,
    tree_method='hist',
    verbosity=2,
    n_jobs=-1,
    eval_metric="merror",
    random_state=42,
)
xgb_classifier.fit(train_set_spectrum, train_set_class)

In [None]:
# Accuracy of the tuned XGBoost model on each split of the raw spectra.
for caption, spectra, labels in (
    ("Train accuracy: ", train_set_spectrum, train_set_class),
    ("Val accuracy: ", val_set_spectrum, val_set_class),
    ("Test accuracy: ", test_set_spectrum, test_set_class),
):
    print(caption, xgb_classifier.score(spectra, labels))

In [None]:
# Confusion matrix of the tuned XGBoost model on the raw test spectra.
ConfusionMatrixDisplay.from_predictions(y_true=test_set_class, y_pred=xgb_classifier.predict(test_set_spectrum))
# Render explicitly, as the PCA/XGBoost confusion-matrix cell does, so the cell
# shows only the figure and not the display object's repr.
plt.show()

In [None]:
# Classification report (three decimals) of the tuned XGBoost model on the raw
# test spectra.
print(classification_report(y_true=test_set_class, y_pred=xgb_classifier.predict(test_set_spectrum), digits=3))

In [None]:
def objective(trial):
    """Optuna objective: validation score of an XGBoost model on PCA features.

    Same search space as the earlier ``objective`` in this notebook, which this
    definition replaces; the model is trained on ``train_set_spectrum_pca`` /
    ``train_set_class`` and its ``score`` on ``val_set_spectrum_pca`` /
    ``val_set_class`` is returned.
    """
    # Each suggestion name is also the keyword passed on to XGBClassifier.
    sampled = {}
    sampled["n_estimators"] = trial.suggest_categorical("n_estimators", [50, 100, 200, 250])
    sampled["max_depth"] = trial.suggest_categorical("max_depth", [2, 4, 5, 6, 8, 10])
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.5)
    # Both regularisation terms share the same log-uniform range.
    for reg_name in ("lambda", "alpha"):
        sampled[reg_name] = trial.suggest_float(reg_name, 1e-8, 1.0, log=True)

    model = XGBClassifier(
        tree_method='hist',
        verbosity=2,
        n_jobs=-1,
        eval_metric="merror",
        random_state=42,
        **sampled,
    )
    model.fit(train_set_spectrum_pca, train_set_class)

    return model.score(val_set_spectrum_pca, val_set_class)

In [None]:
# Maximise the objective over 100 trials. The sampler is seeded so that a
# fresh "Restart & Run All" explores the same sequence of trials and arrives at
# the same best parameters; TPESampler is the algorithm Optuna uses by default
# for single-objective studies, so only the seeding changes.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

In [None]:
# Hyperparameters of the best trial in the most recent study.
study.best_params

In [None]:
# Rebuild the XGBoost model with the best hyperparameters from the study and
# train it on the PCA-projected training spectra.
xgb_classifier = XGBClassifier(
    **study.best_params,
    tree_method='hist',
    verbosity=2,
    n_jobs=-1,
    eval_metric="merror",
    random_state=42,
)
xgb_classifier.fit(train_set_spectrum_pca, train_set_class)

In [None]:
# Accuracy of the tuned XGBoost model on each split of the PCA-projected
# spectra.
for caption, spectra_pca, labels in (
    ("Train accuracy: ", train_set_spectrum_pca, train_set_class),
    ("Val accuracy: ", val_set_spectrum_pca, val_set_class),
    ("Test accuracy: ", test_set_spectrum_pca, test_set_class),
):
    print(caption, xgb_classifier.score(spectra_pca, labels))

In [None]:
# Confusion matrix of the tuned XGBoost model on the PCA-projected test
# spectra; plt.show() renders the figure explicitly.
ConfusionMatrixDisplay.from_predictions(y_true=test_set_class, y_pred=xgb_classifier.predict(test_set_spectrum_pca))
plt.show()

In [None]:
# Classification report (three decimals) of the tuned XGBoost model on the
# PCA-projected test spectra.
print(classification_report(y_true=test_set_class, y_pred=xgb_classifier.predict(test_set_spectrum_pca), digits=3))