In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('large_data.csv')
df['TYPE'] = df['TYPE'].astype('category')

In [3]:
input_names = list(df.columns)
input_names.remove('TYPE')

all_inputs = df[input_names].values
all_labels = df['TYPE'].values

In [4]:
from sklearn.model_selection import train_test_split

(training_inputs,
 testing_inputs,
 training_classes,
 testing_classes) = train_test_split(all_inputs, all_labels, test_size=0.25, random_state=1)

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Create the classifier
clf = KNeighborsClassifier()

# Train the classifier on the training set
clf.fit(training_inputs, training_classes)

# Validate the classifier on the testing set using classification accuracy
clf.score(testing_inputs, testing_classes)

0.9096634874932518

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = clf.predict(testing_inputs)
print(classification_report(testing_classes, predictions))

              precision    recall  f1-score   support

     ALLERGY       0.97      0.98      0.97      4118
        COLD       0.43      0.39      0.41       246
       COVID       0.31      0.25      0.28       523
         FLU       0.93      0.94      0.94      6227

    accuracy                           0.91     11114
   macro avg       0.66      0.64      0.65     11114
weighted avg       0.90      0.91      0.91     11114



In [7]:
improved_clf = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=9, weights='uniform')

# Train the classifier on the training set
improved_clf.fit(training_inputs, training_classes)
new_predictions = improved_clf.predict(testing_inputs)
print(classification_report(testing_classes, new_predictions))

              precision    recall  f1-score   support

     ALLERGY       0.97      0.98      0.98      4118
        COLD       0.49      0.41      0.44       246
       COVID       0.35      0.26      0.30       523
         FLU       0.93      0.95      0.94      6227

    accuracy                           0.92     11114
   macro avg       0.68      0.65      0.66     11114
weighted avg       0.91      0.92      0.91     11114



In [9]:
from sklearn.model_selection import GridSearchCV

parameter_grid = {'n_neighbors': [4, 5, 6],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

grid_search = GridSearchCV(KNeighborsClassifier(),
                           param_grid=parameter_grid,
                           cv=10)

grid_search.fit(all_inputs, all_labels)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.9220749380011709
Best parameters: {'algorithm': 'kd_tree', 'n_neighbors': 6, 'weights': 'uniform'}


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

parameter_grid = {'n_neighbors': [8, 9, 10, 11],
                  'weights': ['uniform'],
                  'algorithm': ['kd_tree']}

grid_search = GridSearchCV(KNeighborsClassifier(),
                           param_grid=parameter_grid,
                           cv=10)

grid_search.fit(all_inputs, all_labels)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.9247293734032234
Best parameters: {'algorithm': 'kd_tree', 'n_neighbors': 9, 'weights': 'uniform'}
