In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.metrics import accuracy_score,confusion_matrix

In [3]:
data = pd.read_csv('../datasets/income(1).csv',na_values=[" ?"]) 


In [4]:
data.isnull().sum()

age                 0
JobType          1809
EdType              0
maritalstatus       0
occupation       1816
relationship        0
race                0
gender              0
capitalgain         0
capitalloss         0
hoursperweek        0
nativecountry       0
SalStat             0
dtype: int64

In [5]:
missing = data[data.isnull().any(axis=1)]

In [6]:
data2 = data.dropna(axis=0)



In [7]:
# Reindexing the salary status names to 0,1
data2['SalStat']=data2['SalStat'].map({' less than or equal to 50,000':0,' greater than 50,000':1})
print(data2['SalStat'])

0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['SalStat']=data2['SalStat'].map({' less than or equal to 50,000':0,' greater than 50,000':1})


In [8]:
new_data=pd.get_dummies(data2, drop_first=True)

In [9]:
# Storing the column names 
columns_list=list(new_data.columns)
print(columns_list)

['age', 'capitalgain', 'capitalloss', 'hoursperweek', 'SalStat', 'JobType_ Local-gov', 'JobType_ Private', 'JobType_ Self-emp-inc', 'JobType_ Self-emp-not-inc', 'JobType_ State-gov', 'JobType_ Without-pay', 'EdType_ 11th', 'EdType_ 12th', 'EdType_ 1st-4th', 'EdType_ 5th-6th', 'EdType_ 7th-8th', 'EdType_ 9th', 'EdType_ Assoc-acdm', 'EdType_ Assoc-voc', 'EdType_ Bachelors', 'EdType_ Doctorate', 'EdType_ HS-grad', 'EdType_ Masters', 'EdType_ Preschool', 'EdType_ Prof-school', 'EdType_ Some-college', 'maritalstatus_ Married-AF-spouse', 'maritalstatus_ Married-civ-spouse', 'maritalstatus_ Married-spouse-absent', 'maritalstatus_ Never-married', 'maritalstatus_ Separated', 'maritalstatus_ Widowed', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-s

In [10]:
# Separating the input names from data
features=list(set(columns_list)-set(['SalStat']))
print(features)

['nativecountry_ Germany', 'age', 'occupation_ Protective-serv', 'nativecountry_ Holand-Netherlands', 'occupation_ Armed-Forces', 'maritalstatus_ Never-married', 'occupation_ Transport-moving', 'nativecountry_ Vietnam', 'occupation_ Craft-repair', 'maritalstatus_ Married-civ-spouse', 'occupation_ Prof-specialty', 'nativecountry_ Ireland', 'EdType_ 1st-4th', 'occupation_ Exec-managerial', 'nativecountry_ Iran', 'nativecountry_ Mexico', 'JobType_ State-gov', 'nativecountry_ Scotland', 'race_ Other', 'maritalstatus_ Separated', 'relationship_ Unmarried', 'nativecountry_ Haiti', 'EdType_ 7th-8th', 'nativecountry_ France', 'relationship_ Wife', 'nativecountry_ Hong', 'relationship_ Own-child', 'relationship_ Not-in-family', 'nativecountry_ Ecuador', 'nativecountry_ Taiwan', 'occupation_ Farming-fishing', 'race_ Asian-Pac-Islander', 'hoursperweek', 'maritalstatus_ Married-AF-spouse', 'nativecountry_ Thailand', 'EdType_ Masters', 'race_ Black', 'EdType_ Preschool', 'EdType_ 12th', 'JobType_ P

In [11]:
# Storing the output values in y
y=new_data['SalStat'].values
print(y)


[0 0 1 ... 0 0 0]


In [12]:
# Storing the values from input features
x = new_data[features].values
print(x)

[[False 45 False ... False False True]
 [False 24 False ... False False True]
 [False 44 False ... False False True]
 ...
 [False 23 False ... False False True]
 [False 42 False ... False False True]
 [False 29 False ... False False True]]


In [13]:
# Splitting the data into train and test
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3, random_state=0)

<h2 style="
    color: #1e3c72;
    background: #e3f2fd;
    padding: 10px 18px;
    border-left: 6px solid #1e3c72;
    border-radius: 8px;
    font-size: 24px;
    font-family:'Inter', Arial, sans-serif;
">
KNN(K-NEAREST NEIGHBOURS)
</h2>

In [14]:
# Storing the K nearest neighbors classifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 5)  

In [15]:
# Fitting the values for X and Y
KNN_classifier.fit(train_x, train_y) 


In [16]:
# Predicting the test values with model
prediction = KNN_classifier.predict(test_x)


In [17]:
# Performance metric check
confusionMmatrix = confusion_matrix(test_y, prediction)
print(confusionMmatrix)

[[6176  647]
 [ 808 1418]]


In [18]:
# Calculating the accuracy
accuracy_score=accuracy_score(test_y, prediction)
print(accuracy_score)

0.8392087523483258


In [19]:
import pickle
import os

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the KNN model as pickle file
with open('models/knn_model.pkl', 'wb') as f:
    pickle.dump(KNN_classifier, f)

print("KNN model saved successfully as 'models/knn'")

KNN model saved successfully as 'models/knn'


In [20]:
print('Misclassified samples: %d' % (test_y != prediction).sum())

Misclassified samples: 1455


In [23]:
"""
Effect of K value on classifier
"""
Misclassified_sample = []
# Calculating error for K values between 1 and 20
for i in range(1, 20):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_x, train_y)
    pred_i = knn.predict(test_x)
    Misclassified_sample.append((test_y != pred_i).sum())


In [24]:
print(Misclassified_sample)

[np.int64(1766), np.int64(1516), np.int64(1522), np.int64(1437), np.int64(1455), np.int64(1455), np.int64(1456), np.int64(1452), np.int64(1481), np.int64(1432), np.int64(1465), np.int64(1446), np.int64(1451), np.int64(1435), np.int64(1421), np.int64(1415), np.int64(1435), np.int64(1423), np.int64(1443)]
