In [1]:
# To work with dataframes
import pandas as pd 

# To perform numerical operations
import numpy as np

# To visualize data
import seaborn as sns

# To partition the data
from sklearn.model_selection import train_test_split

# importing the library of KNN
from sklearn.neighbors import KNeighborsClassifier  

# Importing performance metrics - accuracy score & confusion matrix
from sklearn.metrics import accuracy_score,confusion_matrix

In [2]:
data = pd.read_csv('income.csv',na_values=[" ?"]) 

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31978 entries, 0 to 31977
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            31978 non-null  int64 
 1   JobType        30169 non-null  object
 2   EdType         31978 non-null  object
 3   maritalstatus  31978 non-null  object
 4   occupation     30162 non-null  object
 5   relationship   31978 non-null  object
 6   race           31978 non-null  object
 7   gender         31978 non-null  object
 8   capitalgain    31978 non-null  int64 
 9   capitalloss    31978 non-null  int64 
 10  hoursperweek   31978 non-null  int64 
 11  nativecountry  31978 non-null  object
 12  SalStat        31978 non-null  object
dtypes: int64(4), object(9)
memory usage: 3.2+ MB


In [8]:
data.describe()

Unnamed: 0,age,capitalgain,capitalloss,hoursperweek
count,31978.0,31978.0,31978.0,31978.0
mean,38.579023,1064.360623,86.739352,40.41785
std,13.662085,7298.596271,401.594301,12.345285
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [13]:
data.describe(include = "O") #this is a O not a zero(0)

Unnamed: 0,JobType,EdType,maritalstatus,occupation,relationship,race,gender,nativecountry,SalStat
count,30169,31978,31978,30162,31978,31978,31978,31978,31978
unique,8,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,"less than or equal to 50,000"
freq,22286,10368,14692,4038,12947,27430,21370,29170,24283


In [5]:
print(data.isnull().sum())

age                 0
JobType          1809
EdType              0
maritalstatus       0
occupation       1816
relationship        0
race                0
gender              0
capitalgain         0
capitalloss         0
hoursperweek        0
nativecountry       0
SalStat             0
dtype: int64


In [3]:
missing = data[data.isnull().any(axis=1)]

In [4]:
missing

Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
8,17,,11th,Never-married,,Own-child,White,Female,0,0,5,United-States,"less than or equal to 50,000"
17,32,,Some-college,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,"less than or equal to 50,000"
29,22,,Some-college,Never-married,,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
42,52,,12th,Never-married,,Other-relative,Black,Male,594,0,40,United-States,"less than or equal to 50,000"
44,63,,1st-4th,Married-civ-spouse,,Husband,White,Male,0,0,35,United-States,"less than or equal to 50,000"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31892,59,,Bachelors,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
31934,20,,HS-grad,Never-married,,Other-relative,White,Female,0,0,35,United-States,"less than or equal to 50,000"
31945,28,,Some-college,Married-civ-spouse,,Wife,White,Female,0,1887,40,United-States,"greater than 50,000"
31967,80,,HS-grad,Widowed,,Not-in-family,White,Male,0,0,24,United-States,"less than or equal to 50,000"


In [6]:
data2 = data.dropna(axis=0)

In [14]:
data2.describe()

Unnamed: 0,age,capitalgain,capitalloss,hoursperweek
count,30162.0,30162.0,30162.0,30162.0
mean,38.437902,1092.007858,88.372489,40.931238
std,13.134665,7406.346497,404.29837,11.979984
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,47.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [15]:
data2['SalStat']=data2['SalStat'].map({' less than or equal to 50,000':0,' greater than 50,000':1})
print(data2['SalStat'])

0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [16]:
data2['SalStat']

0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64

In [17]:
new_data=pd.get_dummies(data2, drop_first=True)

In [18]:
columns_list=list(new_data.columns)
print(columns_list)

['age', 'capitalgain', 'capitalloss', 'hoursperweek', 'SalStat', 'JobType_ Local-gov', 'JobType_ Private', 'JobType_ Self-emp-inc', 'JobType_ Self-emp-not-inc', 'JobType_ State-gov', 'JobType_ Without-pay', 'EdType_ 11th', 'EdType_ 12th', 'EdType_ 1st-4th', 'EdType_ 5th-6th', 'EdType_ 7th-8th', 'EdType_ 9th', 'EdType_ Assoc-acdm', 'EdType_ Assoc-voc', 'EdType_ Bachelors', 'EdType_ Doctorate', 'EdType_ HS-grad', 'EdType_ Masters', 'EdType_ Preschool', 'EdType_ Prof-school', 'EdType_ Some-college', 'maritalstatus_ Married-AF-spouse', 'maritalstatus_ Married-civ-spouse', 'maritalstatus_ Married-spouse-absent', 'maritalstatus_ Never-married', 'maritalstatus_ Separated', 'maritalstatus_ Widowed', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-s

In [19]:
features=list(set(columns_list)-set(['SalStat']))
print(features)

['EdType_ Bachelors', 'nativecountry_ United-States', 'nativecountry_ Canada', 'capitalgain', 'nativecountry_ Philippines', 'EdType_ Doctorate', 'maritalstatus_ Separated', 'capitalloss', 'hoursperweek', 'relationship_ Own-child', 'occupation_ Handlers-cleaners', 'EdType_ 11th', 'nativecountry_ Jamaica', 'gender_ Male', 'occupation_ Tech-support', 'relationship_ Not-in-family', 'occupation_ Armed-Forces', 'JobType_ Self-emp-not-inc', 'nativecountry_ Italy', 'EdType_ Assoc-acdm', 'nativecountry_ England', 'JobType_ State-gov', 'relationship_ Other-relative', 'nativecountry_ Ireland', 'maritalstatus_ Never-married', 'nativecountry_ Yugoslavia', 'JobType_ Without-pay', 'occupation_ Priv-house-serv', 'nativecountry_ Trinadad&Tobago', 'EdType_ 7th-8th', 'nativecountry_ Hungary', 'EdType_ Masters', 'nativecountry_ Puerto-Rico', 'nativecountry_ Laos', 'nativecountry_ Peru', 'nativecountry_ Hong', 'race_ Black', 'nativecountry_ Mexico', 'race_ Asian-Pac-Islander', 'JobType_ Self-emp-inc', 'EdT

In [20]:
y=new_data['SalStat'].values
print(y)

[0 0 1 ... 0 0 0]


In [21]:
x = new_data[features].values
print(x)

[[0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]]


In [22]:
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3, random_state=0)

In [23]:
print(train_x,test_x,train_y,test_y)

[[0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]] [[0 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]] [0 0 0 ... 1 0 0] [0 0 0 ... 0 0 0]


# KNN

In [24]:
# Storing the K nearest neighbors classifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 5)  

In [25]:
KNN_classifier.fit(train_x, train_y)

KNeighborsClassifier()

In [26]:
prediction = KNN_classifier.predict(test_x)

In [27]:
confusionMmatrix = confusion_matrix(test_y, prediction)
print(confusionMmatrix)

[[6173  650]
 [ 843 1383]]


In [28]:
accuracy_score=accuracy_score(test_y, prediction)
print(accuracy_score)

print('Misclassified samples: %d' % (test_y != prediction).sum())

0.8350093933031274
Misclassified samples: 1493


In [29]:
"""
Effect of K value on classifier
"""
Misclassified_sample = []
# Calculating error for K values between 1 and 20
for i in range(1, 20):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_x, train_y)
    pred_i = knn.predict(test_x)
    Misclassified_sample.append((test_y != pred_i).sum())

print(Misclassified_sample)

[1766, 1516, 1515, 1436, 1493, 1438, 1451, 1432, 1458, 1436, 1441, 1447, 1451, 1423, 1413, 1390, 1424, 1396, 1434]
