In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/classify-fruits-fallinter-b1/sample_submission.csv
/kaggle/input/classify-fruits-fallinter-b1/fruits_test.csv
/kaggle/input/classify-fruits-fallinter-b1/fruits_train.csv


In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import minkowski
from sklearn.preprocessing import StandardScaler
# function to calculate the Minkowski distance between two data points
def minkowski_distance(x1, x2, p):
    """Return the Minkowski distance of order ``p`` between two points.

    Computes ``(sum(|x1 - x2| ** p)) ** (1 / p)``; ``p=1`` gives the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Parameters
    ----------
    x1, x2 : array-like operands that support ``x1 - x2`` (e.g. NumPy arrays)
    p : order of the norm; must be non-zero because ``1 / p`` is evaluated
    """
    abs_diff = np.abs(x1 - x2)
    powered_total = np.sum(abs_diff ** p)
    return powered_total ** (1 / p)



In [3]:
# function to find K nearest neighbors
def knn(X_train, y_train, X_test, K, p=2):
    """Classify a single sample by majority vote among its K nearest neighbours.

    Parameters
    ----------
    X_train : indexable sequence of training points (``X_train[i]`` is one point)
    y_train : indexable sequence of labels aligned with ``X_train``
    X_test : the single point to classify
    K : number of neighbours that vote; if it exceeds the number of training
        points, every training point votes
    p : order of the Minkowski distance (default 2)

    Returns
    -------
    The label with the most votes. On a tie, the tied label whose first
    voter is closest to ``X_test`` wins.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 or the training set is empty.
    """
    if K < 1:
        raise ValueError("K must be at least 1, got {}".format(K))
    n_train = len(X_train)
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")

    # pair every training point's distance to the query with its label
    distances = [
        (minkowski_distance(X_test, X_train[i], p), y_train[i])
        for i in range(n_train)
    ]

    # closest first; the sort is stable, so equal distances keep training order
    distances.sort(key=lambda pair: pair[0])

    # slicing never over-runs, so K > n_train simply uses every point
    neighbors = distances[:K]

    # tally votes; dict insertion order records which label was seen first
    class_votes = {}
    for _, label in neighbors:
        class_votes[label] = class_votes.get(label, 0) + 1

    # max() returns the first key holding the highest count
    return max(class_votes, key=class_votes.get)

In [4]:
# Load the training and test splits from the competition's input directory
TRAIN_CSV = '/kaggle/input/classify-fruits-fallinter-b1/fruits_train.csv'
TEST_CSV = '/kaggle/input/classify-fruits-fallinter-b1/fruits_test.csv'
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [5]:
# Preview the first rows of the training frame (feature columns plus `label`)
train_df.head()

Unnamed: 0,Id,mass,width,height,label
0,1,160,7.1,7.6,2
1,2,194,7.2,10.3,3
2,3,154,7.2,7.2,2
3,4,154,7.0,7.1,1
4,5,162,7.4,7.2,1


In [6]:
# Preview the first rows of the test frame
test_df.head()

Unnamed: 0,Id,mass,width,height
0,1,118,6.1,8.1
1,2,158,7.2,7.8
2,3,120,6.0,8.4
3,4,210,7.8,8.0
4,5,156,7.6,7.5


In [7]:
# Column names for the two size features; mass is pulled out separately
size_columns = ['width', 'height']

# Training split: size features, mass, and the target label
X_train = train_df[size_columns].values
mass_train = train_df['mass'].values
y_train = train_df['label'].values

# Test split: same features, no label available
X_test = test_df[size_columns].values
mass_test = test_df['mass'].values

# Append mass as a third column so each row is (width, height, mass)
X_train_combined = np.hstack((X_train, mass_train[:, np.newaxis]))
X_test_combined = np.hstack((X_test, mass_test[:, np.newaxis]))

# Report the shapes of the combined feature matrices
print("X_train shape:", X_train_combined.shape)
print("X_test shape:", X_test_combined.shape)


X_train shape: (40, 3)
X_test shape: (14, 3)


In [8]:
# Standardize the features: the scaler is fitted on the training matrix only,
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train_combined)
X_train = scaler.transform(X_train_combined)
X_test = scaler.transform(X_test_combined)

In [9]:
# Display the training matrix after scaling
X_train

array([[-0.17372883, -0.41582033, -0.23463339],
       [-0.04963681,  2.30591272,  0.36972534],
       [-0.04963681, -0.81904004, -0.34128493],
       [-0.29782085, -0.91984497, -0.34128493],
       [ 0.19854723, -0.81904004, -0.19908288],
       [-0.04963681, -1.02064989, -0.16353236],
       [-0.17372883, -0.51662526, -0.34128493],
       [-1.41464904,  0.49142402, -1.0167447 ],
       [ 0.44673128, -0.11340554, -0.05688082],
       [-1.66283308,  0.08820431, -1.0167447 ],
       [-0.54600489, -0.61743018, -0.5190375 ],
       [ 0.32263925, -0.51662526, -0.23463339],
       [-0.42191287, -0.71823511, -0.12798185],
       [ 0.44673128, -0.21421047, -0.55458802],
       [ 0.19854723, -0.61743018, -0.30573442],
       [-1.53874106, -0.51662526, -1.0167447 ],
       [ 2.43220361,  1.19705852,  3.24931697],
       [-0.91828096,  0.49142402, -0.37683545],
       [ 0.07445521, -0.3150154 , -0.16353236],
       [ 0.32263925, -0.91984497, -0.19908288],
       [-0.17372883, -0.51662526, -0.270

In [10]:
# Sanity check: shape of the training label array
y_train.shape

(40,)

In [11]:
# Sanity check: shape of the scaled test matrix
X_test.shape

(14, 3)

In [12]:
# Inspect the training labels
y_train

array([2, 3, 2, 1, 1, 2, 2, 3, 1, 3, 2, 1, 1, 2, 1, 3, 2, 3, 1, 1, 2, 1,
       3, 3, 1, 1, 3, 2, 1, 2, 2, 2, 2, 1, 3, 3, 1, 1, 3, 3])

In [13]:
# K-nearest-neighbours hyperparameters: number of voting neighbours and the
# Minkowski order (p=2 is the Euclidean distance)
K = 5
p = 2
# Classify every test row against the *standardized* matrices (X_train / X_test
# from the scaling cell). The raw *_combined matrices must not be used here:
# unscaled mass is far larger in magnitude than width/height and would
# dominate the distance, making the scaling step pointless.
predictions = []
for i in range(len(X_test)):
    pred = knn(X_train, y_train, X_test[i], K, p)
    predictions.append(pred)

In [14]:

# Pair each test Id with its predicted class and write the submission file
submission_columns = {'Id': test_df['Id'], 'Category': predictions}
submission_df = pd.DataFrame(submission_columns)
output_path = 'submission.csv'
submission_df.to_csv(output_path, index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
