<a href="https://colab.research.google.com/github/sanyagupta31/LLM-based-clone/blob/main/knn_movie_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score

In [None]:
# Movie training set. Variables are prefixed with the author's name (san_gup).
# Each row of san_gup_train holds the two numeric features of one movie.
san_gup_train = np.array([
    [100, 0],
    [0, 100],
    [15, 90],
    [85, 20],
])
# Genre label of each training row, in the same order: 1 = Comedy, 0 = Action.
san_gup_classes = np.array([1, 0, 0, 1])


In [None]:
# Validation set: feature rows used to compare candidate values of k.
VAL_val = np.array([
    [95, 6],
    [85, 15],
])
# Expected genre of each validation row: 0 = Action, 1 = Comedy.
# NOTE(review): [95, 6] is labelled Action although its closest training row
# ([100, 0]) is Comedy -- confirm this label is intended.
VAL_classes = np.array([0, 1])


In [None]:
# Test set: unlabelled feature rows whose genre is to be predicted.
TEST_test = np.array([
    [70, 6],
    [93, 23],
    [50, 50],
])


In [None]:
# Function to compute Euclidean distance
def distance_ecu(x_train, x_test_point):
    """Return the Euclidean distance from every training sample to one point.

    Parameters
    ----------
    x_train : array-like
        Training samples, one sample per row.
    x_test_point : array-like
        A single query point with the same number of features as a row of
        ``x_train``.

    Returns
    -------
    pandas.DataFrame
        One row per training sample (default integer index, so the index
        is the row position in ``x_train``) with a single float column
        ``'dist'``.
    """
    # Work in float64: doing the subtraction/squaring in the input dtype
    # silently wraps around for small or unsigned integer arrays (e.g. uint8)
    # and yields wrong distances.
    train = np.asarray(x_train, dtype=np.float64)
    point = np.asarray(x_test_point, dtype=np.float64)

    n_samples = train.shape[0]
    if n_samples == 0:
        # No training samples: return an empty frame with the expected column.
        return pd.DataFrame({'dist': np.empty(0, dtype=np.float64)})

    # Broadcasting subtracts the query point from every row at once; the
    # reshape lets 1-D training data (one scalar feature per sample) share
    # the same reduction as the usual 2-D case.
    squared_diff = (train - point) ** 2
    dist_values = np.sqrt(squared_diff.reshape(n_samples, -1).sum(axis=1))
    return pd.DataFrame({'dist': dist_values})

In [None]:
# Function to find the nearest K neighbors
def nearest_neighbors(distance_point, K):
    """Select the K rows with the smallest distance.

    Parameters
    ----------
    distance_point : pandas.DataFrame
        Frame with a ``'dist'`` column; its index labels are preserved in
        the result.
    K : int
        Number of neighbours to keep; must be at least 1. If K exceeds the
        number of rows, all rows are returned.

    Returns
    -------
    pandas.DataFrame
        The K closest rows, ordered by ascending distance. Rows with equal
        distance keep their original relative order.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """
    if K < 1:
        # A negative K would otherwise slice "all but the last |K| rows",
        # and K == 0 would produce an empty neighbourhood.
        raise ValueError(f"K must be a positive integer, got {K!r}")

    # The default quicksort is not stable, so equidistant neighbours could
    # come out in arbitrary order; mergesort makes tie-breaking deterministic.
    ordered = distance_point.sort_values(by='dist', axis=0, kind='mergesort')
    return ordered.iloc[:K]


In [None]:
# Majority vote among the selected neighbours
def voting(df_nearest, y_train):
    """Return the most frequent class among the given neighbours.

    The index labels of ``df_nearest`` are used to look up the matching
    entries of ``y_train``. When several classes share the highest count,
    the one encountered first in ``df_nearest`` order is returned.
    """
    neighbour_labels = y_train[df_nearest.index]
    tally = Counter(neighbour_labels)
    ranked = tally.most_common()
    winner, _votes = ranked[0]
    return winner

In [None]:
# KNN from scratch
def KNN_from_scratch(x_train, y_train, x_test, K, verbose=True):
    """Classify each row of ``x_test`` with a K-nearest-neighbours vote.

    For every test point this computes distances with ``distance_ecu``,
    keeps the K closest rows via ``nearest_neighbors`` and picks a class
    with ``voting``.

    Parameters
    ----------
    x_train : array-like
        Training samples, one per row.
    y_train : numpy.ndarray
        Class label of each training sample.
    x_test : iterable
        Points to classify.
    K : int
        Number of neighbours that vote; must be at least 1.
    verbose : bool, optional
        When True (the default, matching the previous behaviour) the
        distances and the selected neighbours of every test point are
        printed.

    Returns
    -------
    numpy.ndarray
        Predicted class for each test point, in input order.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """
    if K < 1:
        # Fail early: K == 0 leaves nothing to vote on and a negative K
        # would silently select the wrong rows when slicing.
        raise ValueError(f"K must be a positive integer, got {K!r}")

    y_pred = []
    for x_test_point in x_test:
        distance_point = distance_ecu(x_train, x_test_point)
        if verbose:
            print(f"Distances for test point {x_test_point}: {distance_point['dist'].tolist()}")
        df_nearest_point = nearest_neighbors(distance_point, K)
        if verbose:
            print(f"Nearest neighbors for test point {x_test_point}:")
            print(df_nearest_point)
        y_pred.append(voting(df_nearest_point, y_train))
    return np.array(y_pred)


In [None]:
# Translate numeric class predictions into genre names
def map_labels(y_pred):
    """Return a list of genre names for the given predictions.

    A prediction equal to 1 becomes "Comedy"; every other value becomes
    "Action".
    """
    genre_names = []
    for pred in y_pred:
        if pred == 1:
            genre_names.append("Comedy")
        else:
            genre_names.append("Action")
    return genre_names


In [None]:
# Score the classifier on the validation set for each candidate k.
k_values = [1, 3]
for k in k_values:
    y_pred_validation = KNN_from_scratch(
        x_train=san_gup_train,
        y_train=san_gup_classes,
        x_test=VAL_val,
        K=k,
    )
    # Fraction of validation rows whose predicted class matches the label.
    accuracy = accuracy_score(y_true=VAL_classes, y_pred=y_pred_validation)
    print(f'Accuracy for k={k}: {accuracy}')


Distances for test point [95  6]: [7.810249675906654, 133.64505228402584, 116.0, 17.204650534085253]
Nearest neighbors for test point [95  6]:
      dist
0  7.81025
Distances for test point [85 15]: [21.213203435596427, 120.20815280171308, 102.59142264341595, 5.0]
Nearest neighbors for test point [85 15]:
   dist
3   5.0
Accuracy for k=1: 0.5
Distances for test point [95  6]: [7.810249675906654, 133.64505228402584, 116.0, 17.204650534085253]
Nearest neighbors for test point [95  6]:
         dist
0    7.810250
3   17.204651
2  116.000000
Distances for test point [85 15]: [21.213203435596427, 120.20815280171308, 102.59142264341595, 5.0]
Nearest neighbors for test point [85 15]:
         dist
3    5.000000
0   21.213203
2  102.591423
Accuracy for k=3: 0.5


In [None]:
# Predict the test set with the chosen k.
# k=1 is assumed for this example rather than derived from the validation scores.
best_k = 1
y_pred_test = KNN_from_scratch(
    x_train=san_gup_train,
    y_train=san_gup_classes,
    x_test=TEST_test,
    K=best_k,
)


Distances for test point [70  6]: [30.59411708155671, 117.2006825918689, 100.40418317978589, 20.518284528683193]
Nearest neighbors for test point [70  6]:
        dist
3  20.518285
Distances for test point [93 23]: [24.041630560342615, 120.7393887677091, 102.82509421342633, 8.54400374531753]
Nearest neighbors for test point [93 23]:
       dist
3  8.544004
Distances for test point [50 50]: [70.71067811865476, 70.71067811865476, 53.150729063673246, 46.09772228646444]
Nearest neighbors for test point [50 50]:
        dist
3  46.097722


In [None]:
# Convert the numeric test predictions to genre names and report them.
y_pred_test_labels = map_labels(y_pred=y_pred_test)
print(f'Predicted classes for test data with k={best_k}: {y_pred_test_labels}')


Predicted classes for test data with k=1: ['Comedy', 'Comedy', 'Comedy']
