In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Step 1: Load data
# Reading the CSV file into a pandas DataFrame
data = pd.read_csv("C:\\Users\\HP\\Downloads\\Placement Data.xls")

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,grade,cgpa,iq,communication,technical_knowledge,placement
0,0,B,7.34,124,1,7,0
1,1,D,8.88,112,1,5,0
2,2,C,7.95,85,8,8,0
3,3,B,6.42,103,8,8,1
4,4,A,6.1,98,6,9,1


In [5]:
data = data.iloc[:,1:]

In [6]:
data

Unnamed: 0,grade,cgpa,iq,communication,technical_knowledge,placement
0,B,7.34,124,1,7,0
1,D,8.88,112,1,5,0
2,C,7.95,85,8,8,0
3,B,6.42,103,8,8,1
4,A,6.10,98,6,9,1
...,...,...,...,...,...,...
495,B,6.47,116,7,10,1
496,B,6.51,126,8,9,1
497,A,8.04,116,3,6,1
498,D,7.68,91,5,5,0


In [7]:
# Check for missing values
print(data.isnull().sum())

grade                  0
cgpa                   0
iq                     0
communication          0
technical_knowledge    0
placement              0
dtype: int64


In [8]:
data.duplicated().sum()

np.int64(0)

In [9]:
data.shape

(500, 6)

In [10]:
# Encode categorical variable 'grade'
le = LabelEncoder()

data['grade'] = le.fit_transform(data['grade'])

In [11]:


data.head()

Unnamed: 0,grade,cgpa,iq,communication,technical_knowledge,placement
0,1,7.34,124,1,7,0
1,3,8.88,112,1,5,0
2,2,7.95,85,8,8,0
3,1,6.42,103,8,8,1
4,0,6.1,98,6,9,1


In [12]:
# Step 4: Transform data
# Define features (X) and target (y)
X = data.drop('placement', axis=1)
y = data['placement']

In [13]:
X.shape

(500, 5)

In [14]:
y.shape

(500,)

In [15]:
# Step 5: Split data into training and testing sets
# Using 80-20 split for training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42, stratify=None
)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((400, 5), (100, 5), (400,), (100,))

In [17]:
scaler = StandardScaler()

In [18]:
x_train_scl = scaler.fit_transform(X_train)
x_test_scl = scaler.transform(X_test)

In [19]:
x_train_scl.shape

(400, 5)

In [20]:
x_train_scl

array([[ 0.91303697, -1.64276431,  1.18743074, -1.51953767,  1.19880545],
       [-1.14451114,  1.07436486, -1.66683005,  0.57397003,  0.48997739],
       [-1.14451114,  0.62483981,  0.31367744, -1.51953767, -0.92767872],
       ...,
       [-0.11573708,  1.03440708, -0.09407411,  0.92288798,  0.84439142],
       [ 0.91303697,  1.41400601,  0.37192766, -0.12386587,  1.19880545],
       [-0.11573708,  1.69371048, -1.08432785, -1.51953767,  0.84439142]])

In [21]:
knn = KNeighborsClassifier(n_neighbors=3)

In [22]:
knn.fit(x_train_scl,y_train)

In [23]:
knn.score(x_test_scl,y_test)

0.84

In [24]:
y_pred = knn.predict(x_test_scl)

In [25]:
y_pred

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

In [26]:
y_test

361    0
73     1
374    1
155    1
104    0
      ..
347    0
86     0
75     0
438    0
15     1
Name: placement, Length: 100, dtype: int64

In [27]:
 # Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

In [28]:
accuracy

0.84

In [29]:
# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84        51
           1       0.82      0.86      0.84        49

    accuracy                           0.84       100
   macro avg       0.84      0.84      0.84       100
weighted avg       0.84      0.84      0.84       100



In [30]:
mean_squared_error(y_test,y_pred)

0.16

In [31]:
 mean_absolute_error(y_test,y_pred)

0.16

In [32]:
r2_score(y_test,y_pred)

0.3597438975590235

In [33]:
pd.DataFrame({"y_true": y_test, "y_prde":y_pred})

Unnamed: 0,y_true,y_prde
361,0,0
73,1,1
374,1,1
155,1,1
104,0,0
...,...,...
347,0,0
86,0,0
75,0,0
438,0,0


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

In [37]:
pipe = Pipeline([
    ("scaler",StandardScaler()),
    ("knn",KNeighborsClassifier())
])

In [38]:
param_grid = {
    'knn__n_neighbors': [3],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'],
    'knn__p': [1,2],              # only used when metric='minkowski'
    'knn__algorithm': ['auto','kd_tree','ball_tree','brute']
}

In [39]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [41]:
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv,
    scoring='f1_macro',   # or 'accuracy', 'f1_weighted', etc.
    n_jobs=-1
)

In [42]:
grid.fit(x_train_scl,y_train)

In [43]:
print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

Best params: {'knn__algorithm': 'auto', 'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV score: 0.8597867683604642
