In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the shuttle train/test files without a header row. The default comma
# separator is used here; the next cell treats column 0 of each frame as one
# space-separated string per row and splits it into columns.
data_train = pd.read_csv('statlog+shuttle/shuttle.trn', header=None)
data_test = pd.read_csv('statlog+shuttle/shuttle.tst', header=None)


Each row was read as a single space-separated string (dtype `object`), so we first split it into columns and convert the values to integers

In [None]:
# Column 0 of each raw frame holds one space-separated string per row:
# split it into separate columns and cast every value to int.
new_test, new_train = (
    raw_frame[0].str.split(' ', expand=True).astype(int)
    for raw_frame in (data_test, data_train)
)


The parsed data now looks like this:

In [None]:
# Render the parsed (integer) test frame as the cell output.
new_test


In [None]:
# Render the parsed (integer) training frame as the cell output.
new_train


In [None]:
# Frequency of each distinct value in column 9 of the training frame
# (presumably the class label: the X/y split below uses the last column as y).
f_train = new_train[9].value_counts()
print(f_train)


In [None]:
# Same frequency table for column 9 of the test frame, for comparison with
# the training distribution.
f_test = new_test[9].value_counts()
print(f_test)


We see that the dataset is heavily skewed towards class 1. Even if we shuffled and re-split the data, it probably would not make much of a difference, so we keep the provided train/test files and do the X and y split right here.

In [None]:
# The last column is the target; every column before it is a feature.
X_train, y_train = new_train.iloc[:, :-1].values, new_train.iloc[:, -1].values
X_test, y_test = new_test.iloc[:, :-1].values, new_test.iloc[:, -1].values

# Extra names bound to the same arrays (no copy is made), kept for the BDR
# classifier built later in the notebook.
bdr_X_train, bdr_X_test = X_train, X_test
bdr_y_train, bdr_y_test = y_train, y_test


Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)


In [None]:
# Show the training labels (unchanged by the feature scaling above).
y_train


Now, we can try our classifiers

Since most of our classes are 1's, we can remove some of them from the training data to see if it improves performance

In [None]:
# Collect every training row whose label is 1. A boolean mask selects the rows
# in one vectorised step; list() keeps X_1s a plain list of row arrays so the
# later cells that consume it see the same type as before.
X_1s = list(X_train[y_train == 1])
print(len(X_1s))


In [None]:
# Collect every training row whose label is NOT 1, together with its label.
# One shared mask keeps X_rest and y_rest aligned row-for-row; list() keeps
# both as plain lists, matching what the later cells expect.
not_class_1 = y_train != 1
X_rest = list(X_train[not_class_1])
y_rest = list(y_train[not_class_1])
print(len(X_rest))


We now have all the class-1 samples in `X_1s`, and all remaining samples (with their labels) in `X_rest` / `y_rest`

In [None]:
# Ratio of the class-1 sample count to the size of the largest other class.
# The denominator is derived from y_rest (labels of all non-1 samples) rather
# than hard-coded; np.ravel lets this work whether y_rest is still a list or
# has already been turned into a single-column DataFrame.
next_largest_class_size = int(pd.Series(np.ravel(y_rest)).value_counts().max())
times = len(X_1s) / next_largest_class_size
times


Since class 1 has about 5x as many samples as the next largest class, we drop a random half of the class-1 samples from the TRAINING SET only; the test set is left untouched

In [None]:
from sklearn.model_selection import train_test_split

# One label per class-1 sample; the length is taken from X_1s so the two
# always agree instead of relying on a hard-coded count.
y_1s = [1] * len(X_1s)

# Randomly keep half of the class-1 samples (the "train" half of this split).
# A fixed random_state makes the selection, and every result derived from it,
# reproducible across kernel restarts.
X_train_1s, X_test_1s, y_train_1s, y_test_1s = train_test_split(
    X_1s, y_1s, test_size=0.5, random_state=42)
print(len(X_train_1s))


In [None]:
X_train_1s  # Randomly selected 50% of X_1s (the half kept for training)


In [None]:
# Wrap the kept class-1 rows in a DataFrame so they can be concatenated with
# the other classes further down.
X_train_1s = pd.DataFrame(X_train_1s)


In [None]:
# Show the down-sampled class-1 rows, now as a DataFrame.
X_train_1s


In [None]:
# Convert the non-class-1 rows and their labels to DataFrames so they can be
# concatenated with the class-1 frames further down.
X_rest = pd.DataFrame(X_rest)
y_rest = pd.DataFrame(y_rest)


In [None]:
# Show the feature rows of all samples whose label is not 1.
X_rest


In [None]:
# Per-class sample counts among the non-class-1 training labels.
y_rest.value_counts()


In [None]:
# Build a single-column label frame of 1s with one row per kept class-1
# sample, then display it.
y_1s = pd.DataFrame([1] * len(X_train_1s))
y_1s


So now we have a set of class-1 samples that no longer dominates the data as heavily, while all samples of the other classes remain.
We can now merge them to create our final X_train dataset

In [None]:
# Stack the down-sampled class-1 rows on top of the rows of every other class.
# The labels are stacked in the same order so rows and labels stay aligned.
# Note: this rebinds X_train / y_train to the re-balanced DataFrames.
train_feature_parts = [X_train_1s, X_rest]
train_label_parts = [y_1s, y_rest]
X_train = pd.concat(train_feature_parts, axis=0, ignore_index=True)
y_train = pd.concat(train_label_parts, axis=0, ignore_index=True)


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit LDA on the re-balanced training set. y_train is a single-column
# DataFrame at this point, so it is flattened to the 1-D label vector that
# scikit-learn expects instead of relying on an implicit conversion.
lda = LDA()
lda.fit(X_train, np.ravel(y_train))


In [None]:
from sklearn.metrics import accuracy_score

# Evaluate LDA on the scaled test set, which was not down-sampled.
# accuracy_score takes (y_true, y_pred) in that order.
y_pred = lda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


We got accuracy of 96.9 ~ 97% by doing the right preprocessing of our data for LDA

Next we try the same analysis on QDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between the (re-balanced) training features.
correlation_matrix = X_train.corr()

# Draw the matrix as an annotated heatmap on an explicit 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Project the features onto 4 linear discriminants before fitting QDA.
# A dedicated LDA instance is used for the projection so the classifier `lda`
# fitted earlier is left untouched.
# NOTE: this cell rebinds X_train / X_test to the reduced arrays, so run it
# only once per kernel session (restart and run all to reproduce results).
lda_dim_red = LDA(n_components=4)
X_train = lda_dim_red.fit_transform(X_train, np.ravel(y_train))
X_test = lda_dim_red.transform(X_test)


In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Fit QDA on the current X_train. The labels are flattened to 1-D because
# y_train is a single-column DataFrame at this point.
qda = QDA()
qda.fit(X_train, np.ravel(y_train))


In [None]:
from sklearn.metrics import accuracy_score

# Evaluate QDA on the test set; accuracy_score takes (y_true, y_pred).
y_pred_qda = qda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_qda)
print(accuracy)


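Note: Before it is re-run, the saved notebook sometimes shows a weird output with an accuracy of roughly 0.00. I am not exactly sure why this happens; most likely it is stale output from an out-of-order run in which X_train / X_test had already been overwritten by the dimensionality-reduction cell. Once you run the code from top to bottom it resolves and shows the correct accuracy of about 95%.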

Now finally we attempt to create a BDR classifier:

In [None]:
# Labels of the re-balanced training set (a single-column DataFrame by now).
y_train


In [None]:
# Labels of the full training set, as bound right after the X/y split
# (not affected by the class-1 down-sampling).
bdr_y_train


In [None]:
# Tally how many training samples fall into each class. Slot k-1 holds the
# count for class label k; seven slots are allocated.
prior_probs = [0 for _ in range(7)]
for class_label in bdr_y_train:
    prior_probs[class_label - 1] += 1
prior_probs


In [None]:
# Turn the per-class counts into prior probabilities P(class = k).
# The counts are recomputed from bdr_y_train instead of dividing prior_probs
# in place, so re-running this cell cannot divide already-normalised values a
# second time. Slot k-1 holds the prior of class label k, as before.
n_classes = len(prior_probs)
class_counts = np.bincount(bdr_y_train, minlength=n_classes + 1)[1:n_classes + 1]
prior_probs = (class_counts / len(bdr_y_train)).tolist()
prior_probs

In [None]:
# We need to normalize the data for bdr sets also
from sklearn.preprocessing import StandardScaler

# Use the dedicated scaler sc2 for the BDR data so the scaler `sc`, fitted
# earlier for the LDA/QDA features, is not re-fitted as a side effect.
sc2 = StandardScaler()
bdr_X_train = sc2.fit_transform(bdr_X_train)
bdr_X_test = sc2.transform(bdr_X_test)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

# Bayes decision rule (BDR) classifier with kernel density estimates:
#   predict  argmax_k  P(class = k) * p(x | class = k)
# where p(x | class = k) comes from a KDE fitted on the class-k training rows.

grid_size = 0.2  # not used by the KDE-based estimate below; kept for reference

# Distinct class labels present in the training data
class_labels = np.unique(bdr_y_train)

kde_models = {}   # label -> fitted KernelDensity for that class
log_priors = {}   # label -> log P(class = label)
for label in class_labels:
    in_class = bdr_y_train == label

    # Class-conditional density estimate for this class
    kde = KernelDensity(bandwidth=0.5)  # Adjust bandwidth as needed
    kde.fit(bdr_X_train[in_class])
    kde_models[label] = kde

    # Prior = fraction of training samples in this class. The mean of the
    # boolean mask counts samples; it does not sum the label values.
    log_priors[label] = np.log(in_class.mean())

# Convert bdr_X_test back to a Pandas DataFrame
bdr_X_test = pd.DataFrame(bdr_X_test)
test_points = bdr_X_test.to_numpy()

# score_samples returns log-densities, so the posterior is compared in the log
# domain (log prior + log density). This avoids underflow for tiny densities.
# Each class scores the whole test set in one call rather than one call per
# sample, which removes most of the Python-level overhead.
log_posteriors = np.vstack([
    log_priors[label] + kde_models[label].score_samples(test_points)
    for label in class_labels
])

# For every test sample pick the class with the largest log-posterior.
# np.argmax returns the first maximum, so ties go to the lowest label.
y_pred = class_labels[np.argmax(log_posteriors, axis=0)].tolist()


In [None]:
# Calculate accuracy of the BDR predictions against the true test labels
# (accuracy_score is imported in an earlier cell).
accuracy = accuracy_score(y_test, y_pred)
print(f"BDR Accuracy: {accuracy:.4f}")


To think about: What is the complexity of building the BDR classifier? What will happen if the
number of attributes and the number of feature vectors in the learning set are large?

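Building and evaluating the BDR classifier takes a very long time, even on this small dataset: the density of every test sample has to be estimated against the stored training samples of every class, so the cost grows with both the number of feature vectors and the number of attributes. If the learning set is huge, BDR is not feasible for real-time applications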