In [1]:
#### This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

/kaggle/input/classify-fruits-fallinter-b1/sample_submission.csv
/kaggle/input/classify-fruits-fallinter-b1/fruits_test.csv
/kaggle/input/classify-fruits-fallinter-b1/fruits_train.csv




In [2]:
# Load the training and test data
train_data = pd.read_csv('/kaggle/input/classify-fruits-fallinter-b1/fruits_train.csv')
test_data = pd.read_csv('/kaggle/input/classify-fruits-fallinter-b1/fruits_test.csv')

In [3]:
train_data.head()

Unnamed: 0,Id,mass,width,height,label
0,1,160,7.1,7.6,2
1,2,194,7.2,10.3,3
2,3,154,7.2,7.2,2
3,4,154,7.0,7.1,1
4,5,162,7.4,7.2,1


In [4]:
test_data.head()

Unnamed: 0,Id,mass,width,height
0,1,118,6.1,8.1
1,2,158,7.2,7.8
2,3,120,6.0,8.4
3,4,210,7.8,8.0
4,5,156,7.6,7.5


In [5]:
# Extract the features and labels from the training data
X_train = train_data.iloc[:, 1:-1].values
y_train = train_data.iloc[:, -1].values

# Extract the features from the test data
X_test = test_data.iloc[:, 1:].values

In [6]:
X_train

array([[160. ,   7.1,   7.6],
       [194. ,   7.2,  10.3],
       [154. ,   7.2,   7.2],
       [154. ,   7. ,   7.1],
       [162. ,   7.4,   7.2],
       [164. ,   7.2,   7. ],
       [154. ,   7.1,   7.5],
       [116. ,   6.1,   8.5],
       [170. ,   7.6,   7.9],
       [116. ,   5.9,   8.1],
       [144. ,   6.8,   7.4],
       [160. ,   7.5,   7.5],
       [166. ,   6.9,   7.3],
       [142. ,   7.6,   7.8],
       [156. ,   7.4,   7.4],
       [116. ,   6. ,   7.5],
       [356. ,   9.2,   9.2],
       [152. ,   6.5,   8.5],
       [164. ,   7.3,   7.7],
       [162. ,   7.5,   7.1],
       [158. ,   7.1,   7.5],
       [140. ,   7.3,   7.1],
       [186. ,   7.2,   9.2],
       [174. ,   7.3,  10.1],
       [180. ,   8. ,   6.8],
       [168. ,   7.5,   7.6],
       [216. ,   7.3,  10.2],
       [160. ,   7. ,   7.4],
       [172. ,   7.1,   7.6],
       [140. ,   6.7,   7.1],
       [180. ,   7.6,   8.2],
       [362. ,   9.6,   9.2],
       [342. ,   9. ,   9.4],
       [15

In [7]:
# Normalize the features using Min-Max scaling
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Define a function to calculate the Euclidean distance between two points
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# Define the K Nearest Neighbors function
def k_nearest_neighbors(train_data, test_data, k):
    predictions = []
    for test_point in test_data:
        distances = []
        for train_point, train_label in train_data:
            distance = euclidean_distance(test_point, train_point)
            distances.append((train_point, train_label, distance))
        
        distances.sort(key=lambda x: x[2])  # Sort distances in ascending order
        k_nearest = distances[:k]  # Select the k nearest neighbors
        
        labels = [neighbor[1] for neighbor in k_nearest]  # Get the labels of the k nearest neighbors
        prediction = max(set(labels), key=labels.count)  # Perform majority voting
        predictions.append(prediction)
    
    return predictions

To obtain the highest accuracy, splittling the train data

In [9]:
# Split the training data into a training set and a validation set
val_split = int(0.8 * len(X_train))
X_val = X_train[val_split:]
y_val = y_train[val_split:]
X_train = X_train[:val_split]
y_train = y_train[:val_split]

In [16]:
# Initialize a list to store the accuracy values
accuracy_values = []

# Iterate over a range of k values
for k in range(1, 7):
    # Train the KNN model using the training set
    train_data = list(zip(X_train, y_train))
    
    # Make predictions on the validation set
    predictions = k_nearest_neighbors(train_data, X_val, k)
    
    # Calculate the accuracy of the predictions
    accuracy = accuracy_score(y_val, predictions)
    accuracy_values.append((k, accuracy))

# Identify the k value that resulted in the highest accuracy on the validation set
best_k = max(accuracy_values, key=lambda x: x[1])[0]
best_k

1

In [11]:
# Train the final KNN model using the combined training and validation sets with the selected k value
train_data = list(zip(X_train, y_train))
test_predictions = k_nearest_neighbors(train_data, X_test, best_k)

In [12]:
# Create a submission dataframe with ID and Category columns
submission_df = pd.DataFrame({'ID': test_data['Id'], 'Category': test_predictions})

# Save the submission dataframe to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [13]:
submission_df

Unnamed: 0,ID,Category
0,1,3
1,2,1
2,3,3
3,4,2
4,5,1
5,6,1
6,7,2
7,8,3
8,9,3
9,10,2
