<a href="https://colab.research.google.com/github/sharmashubham240496/KNN/blob/main/KNN_Real_world_Dataset_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#4. Processing a Real-world classification dataset by implementing Feature Search using Nearest Neighbor Backward Elimination and Forward Selection.

I’m using the water quality dataset obtained from Kaggle. Below are the Feature Information for the dataset-
The dataset downloaded includes the following parameters:
1. pH: Continuous values
2. Temperature: Continuous values
3. Turbidity (NTU): Continuous values
4. Dissolved Oxygen (DO): Continuous values
5. Conductivity (µS/cm): Continuous values

Please find the below link to check and download the dataset from Kaggle
https://www.kaggle.com/datasets/shreyanshverma27/water-quality-testing/download?datasetVersionNumber=2


In [2]:
# Dataset link - https://www.kaggle.com/datasets/shreyanshverma27/water-quality-testing/download?datasetVersionNumber=2
# Kaggle site link - https://www.kaggle.com/datasets/shreyanshverma27/water-quality-testing

from google.colab import drive
import csv
import pandas as pd
import numpy as np

drive.mount('/content/drive')
water_quality_file_path = '/content/drive/MyDrive/Assignment_Knn/KNN_dataset_files/Water Quality Testing.csv'

water_quality_df = pd.read_csv(water_quality_file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
water_quality_df.shape

(500, 6)

In [4]:
water_quality_df['Water Label'] = np.where(water_quality_df['pH']<7, 1,
                                     np.where(water_quality_df['pH']>7.02, 3, 2))

water_quality_df = water_quality_df.drop(['Sample ID', 'pH'], axis = 1)

water_quality_df = water_quality_df[['Water Label', 'Temperature (°C)', 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)', 'Conductivity (µS/cm)']]

In [5]:
water_quality_df.columns

Index(['Water Label', 'Temperature (°C)', 'Turbidity (NTU)',
       'Dissolved Oxygen (mg/L)', 'Conductivity (µS/cm)'],
      dtype='object')

In [6]:
print(water_quality_df.dtypes)

Water Label                  int64
Temperature (°C)           float64
Turbidity (NTU)            float64
Dissolved Oxygen (mg/L)    float64
Conductivity (µS/cm)         int64
dtype: object


In [7]:
def dataset_normalization(input_dataset):
  # Z normalization on all the columns of the dataset except 1st column as it is the lable column
  input_data_array = input_dataset.values
  columns_to_normalize_except_label = np.arange(1, input_data_array.shape[1])
  #calculating the mean and standard deviation for each column except lable column
  columns_respective_means = np.mean(input_data_array[:, columns_to_normalize_except_label], axis=0)
  columns_respective_stds = np.std(input_data_array[:, columns_to_normalize_except_label], axis=0)

  input_data_array[:, columns_to_normalize_except_label] = (input_data_array[:, columns_to_normalize_except_label] - columns_respective_means) / columns_respective_stds

  input_dataset_normalized = pd.DataFrame(input_data_array, columns=input_dataset.columns)

  return input_dataset_normalized

## **Forward Selection**

In [None]:
def evaluating_accuracy(normalized_input_dataset):
    number_of_correctly_classified_labels = 0
    X = normalized_input_dataset[:, 1:]
    y = normalized_input_dataset[:, 0]

    for i in range(len(normalized_input_dataset)):
        current_row_label = y[i]
        current_row_classification = X[i]

        current_row_nearest_neighbor_distance = float('inf')
        nearest_neighbor_encountered_label = None

        for j in range(len(normalized_input_dataset)):
            if i != j:
                euclidean_distance = np.sqrt(np.sum((current_row_classification - X[j])**2))

                if euclidean_distance < current_row_nearest_neighbor_distance:
                    current_row_nearest_neighbor_distance = euclidean_distance
                    nearest_neighbor_encountered_label = y[j]

        if nearest_neighbor_encountered_label == current_row_label:
            number_of_correctly_classified_labels += 1

    classifier_accuracy_evaluation_metric = number_of_correctly_classified_labels / len(normalized_input_dataset)

    return classifier_accuracy_evaluation_metric * 100

In [None]:
def forward_selection(normalized_input_dataset):
    print("Beginning Search")
    selected_feature_set = []
    remaining_feature_set = list(range(1, normalized_input_dataset.shape[1]))
    feature_set = {}

    X = normalized_input_dataset.values

    while len(remaining_feature_set) != 0:
        feature_dict = {}

        for c in remaining_feature_set:
            columns_to_include = [0] + [c] + selected_feature_set
            evaluating_performance_dataset = X[:, columns_to_include]
            accuracy_value = evaluating_accuracy(evaluating_performance_dataset)
            columns_to_include.remove(0)

            print()
            print("Using feature(s)", tuple(columns_to_include), "accuracy is", accuracy_value)
            feature_dict[tuple(columns_to_include)] = accuracy_value

        print()
        max_key = max(feature_dict, key=feature_dict.get)
        print("Feature set", tuple(max_key), "was best, accuracy is", feature_dict[max_key])
        print()

        selected_feature_set.insert(0, max_key[0])
        remaining_feature_set.remove(max_key[0])
        feature_set[max_key] = feature_dict[max_key]

    best_feature_list = max(feature_set, key=feature_set.get)

    print()
    print("Finished search!! The best feature subset is", tuple(best_feature_list), "which has an accuracy of", feature_set[best_feature_list])
    return None

In [None]:
water_quality_normalized_df = dataset_normalization(water_quality_df)

In [None]:
forward_selection(water_quality_normalized_df)

Beginning Search

Using feature(s) (1,) accuracy is 61.8

Using feature(s) (2,) accuracy is 70.8

Using feature(s) (3,) accuracy is 79.80000000000001

Using feature(s) (4,) accuracy is 72.39999999999999

Feature set (3,) was best, accuracy is 79.80000000000001


Using feature(s) (1, 3) accuracy is 88.0

Using feature(s) (2, 3) accuracy is 87.8

Using feature(s) (4, 3) accuracy is 84.2

Feature set (1, 3) was best, accuracy is 88.0


Using feature(s) (2, 1, 3) accuracy is 88.8

Using feature(s) (4, 1, 3) accuracy is 92.2

Feature set (4, 1, 3) was best, accuracy is 92.2


Using feature(s) (2, 4, 1, 3) accuracy is 91.8

Feature set (2, 4, 1, 3) was best, accuracy is 91.8


Finished search!! The best feature subset is (4, 1, 3) which has an accuracy of 92.2


##Backward Elimination

In [8]:
import numpy as np

def evaluating_accuracy(normalized_input_dataset):
    number_of_correctly_classified_labels = 0
    X = normalized_input_dataset[:, 1:]
    y = normalized_input_dataset[:, 0]

    for i in range(len(normalized_input_dataset)):
        current_row_label = y[i]
        current_row_classification = X[i]

        current_row_nearest_neighbor_distance = float('inf')
        nearest_neighbor_encountered_label = None

        for j in range(len(normalized_input_dataset)):
            if i != j:
                euclidean_distance = np.sqrt(np.sum((current_row_classification - X[j])**2))

                if euclidean_distance < current_row_nearest_neighbor_distance:
                    current_row_nearest_neighbor_distance = euclidean_distance
                    nearest_neighbor_encountered_label = y[j]

        if nearest_neighbor_encountered_label == current_row_label:
            number_of_correctly_classified_labels += 1

    classifier_accuracy_evaluation_metric = number_of_correctly_classified_labels / len(normalized_input_dataset)

    return classifier_accuracy_evaluation_metric * 100

In [9]:
def backward_elimination(normalized_input_dataset):
    print("Beginning Search")
    all_feature_set = list(range(1, normalized_input_dataset.shape[1]))
    feature_set = {}

    X = normalized_input_dataset.values

    accuracy_value = evaluating_accuracy(X)
    print("all features", tuple(all_feature_set),"accuracy is", accuracy_value)


    while len(all_feature_set) != 1:
        feature_dict = {}
        for c in all_feature_set:
            temp_ls = all_feature_set.copy()
            temp_ls.remove(c)
            columns_to_include = [0] + temp_ls
            evaluating_performance_dataset = X[:, columns_to_include]
            accuracy_value = evaluating_accuracy(evaluating_performance_dataset)
            columns_to_include.remove(0)

            print()
            print("Using feature(s)", tuple(columns_to_include), "accuracy is", accuracy_value)
            feature_dict[tuple(columns_to_include)] = accuracy_value

        print()
        max_key = max(feature_dict, key=feature_dict.get)
        feature_value = list(set(all_feature_set) - set(max_key))[0]
        print("Feature set", tuple(max_key), "was best, accuracy is", feature_dict[max_key], "eliminating feature is", feature_value)
        print()

        all_feature_set.remove(feature_value)
        feature_set[max_key] = feature_dict[max_key]

    best_feature_list = max(feature_set, key=feature_set.get)

    print()
    print("Finished search!! The best feature subset is", tuple(best_feature_list), "which has an accuracy of", feature_set[best_feature_list])
    return None

In [10]:
%%time

water_quality_df = dataset_normalization(water_quality_df)
backward_elimination(water_quality_df)

Beginning Search
all features (1, 2, 3, 4) accuracy is 91.8

Using feature(s) (2, 3, 4) accuracy is 89.4

Using feature(s) (1, 3, 4) accuracy is 92.2

Using feature(s) (1, 2, 4) accuracy is 89.8

Using feature(s) (1, 2, 3) accuracy is 88.8

Feature set (1, 3, 4) was best, accuracy is 92.2 eliminating feature is 2


Using feature(s) (3, 4) accuracy is 84.2

Using feature(s) (1, 4) accuracy is 89.60000000000001

Using feature(s) (1, 3) accuracy is 88.0

Feature set (1, 4) was best, accuracy is 89.60000000000001 eliminating feature is 3


Using feature(s) (4,) accuracy is 72.39999999999999

Using feature(s) (1,) accuracy is 61.8

Feature set (4,) was best, accuracy is 72.39999999999999 eliminating feature is 1


Finished search!! The best feature subset is (1, 3, 4) which has an accuracy of 92.2
CPU times: user 25.1 s, sys: 0 ns, total: 25.1 s
Wall time: 25.6 s
