In [1]:
import pandas as pd  #importing pandas to create and work on dataframes
import numpy as np   #for creating numpy arrays
import seaborn as sns #for visualising the data

from sklearn import metrics  #metrics are used to evaluate the model
from sklearn.preprocessing import LabelEncoder   # labelencoder is used to convert the data type from object to categorical

import warnings 
from sklearn.metrics import classification_report,confusion_matrix,r2_score,mean_squared_error #various evaluation methods

warnings.filterwarnings("ignore")  # to ignore the warnings

In [2]:
df = pd.read_csv('adult-all.csv')  # reading the data as a dataframe using pandas

In [3]:
df #to have an overview of how the data looks

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48837,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
48838,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48839,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [4]:
df = df.drop('77516',axis = 1)  #dropping unnecessary columns

In [5]:
df = df.drop('13',axis = 1)


In [6]:
df = df.drop('39',axis = 1)


In [7]:
df = df.drop('2174',axis = 1)


In [8]:
df = df.drop('0',axis = 1)

In [9]:
df = df.drop('40',axis = 1)

In [10]:
#renaming the cols with appropriate nomencleature

df = df.rename(columns={"United-States": "country", "White": "race","Not-in-family": "family status","State-gov": "job category","Bachelors": "education","Never-married": "marital status","Male": "gender","<=50K": "salary"})


In [11]:
df.head(50)  #to overview the dataframe

Unnamed: 0,job category,education,marital status,Adm-clerical,family status,race,gender,country,salary
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
1,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
4,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K
5,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica,<=50K
6,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
7,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,>50K
8,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
9,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,United-States,>50K


In [12]:
df = df.replace({'?': np.nan}).dropna()  #dropping the rows with the question mark in them
df.shape # to know the shape of the data

(45221, 9)

In [13]:
df.head(50) #data without any question mark

Unnamed: 0,job category,education,marital status,Adm-clerical,family status,race,gender,country,salary
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
1,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
4,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K
5,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica,<=50K
6,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
7,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,>50K
8,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
9,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,United-States,>50K


In [14]:
df.isna().sum() #to check the count of nan values in each column

job category      0
education         0
marital status    0
Adm-clerical      0
family status     0
race              0
gender            0
country           0
salary            0
dtype: int64

In [15]:
df.info() #to check the datatype of cols

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45221 entries, 0 to 48840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   job category    45221 non-null  object
 1   education       45221 non-null  object
 2   marital status  45221 non-null  object
 3   Adm-clerical    45221 non-null  object
 4   family status   45221 non-null  object
 5   race            45221 non-null  object
 6   gender          45221 non-null  object
 7   country         45221 non-null  object
 8   salary          45221 non-null  object
dtypes: object(9)
memory usage: 3.5+ MB


In [16]:
le = LabelEncoder() #to convert the data type from object to category

In [17]:
df["job category"] = le.fit_transform(df["job category"])
df["education"] = le.fit_transform(df["education"])
df["Adm-clerical"] = le.fit_transform(df["Adm-clerical"])
df["family status"] = le.fit_transform(df["family status"])
df["race"] = le.fit_transform(df["race"])
df["marital status"] = le.fit_transform(df["marital status"])
df["gender"] = le.fit_transform(df["gender"])
df["country"] = le.fit_transform(df["country"])
df["salary"] = le.fit_transform(df["salary"])


In [18]:
df["job category"] = df["job category"].astype("category")

In [19]:
df["education"] = df["education"].astype("category")

In [20]:
df["Adm-clerical"] = df["Adm-clerical"].astype("category")

In [21]:
df["family status"] = df["family status"].astype("category")

In [22]:
df["race"] = df["race"].astype("category")

In [23]:
df["marital status"] = df["marital status"].astype("category")

In [24]:
df["gender"] = df["gender"].astype("category")

In [25]:
df["country"] = df["country"].astype("category")

In [26]:
df["salary"] = df["salary"].astype("category")

In [27]:
df.info() #to check the conversion of data type

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45221 entries, 0 to 48840
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   job category    45221 non-null  category
 1   education       45221 non-null  category
 2   marital status  45221 non-null  category
 3   Adm-clerical    45221 non-null  category
 4   family status   45221 non-null  category
 5   race            45221 non-null  category
 6   gender          45221 non-null  category
 7   country         45221 non-null  category
 8   salary          45221 non-null  category
dtypes: category(9)
memory usage: 754.8 KB


In [28]:
df_feat = df.drop('salary',axis = 1) #for the feature matrix

In [29]:
from sklearn.model_selection import train_test_split #to split the data into test and train data

In [30]:
X = df_feat 
y = df['salary'] #target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101) #taking 70% as train data

In [31]:
from sklearn.neighbors import KNeighborsClassifier #importing algorithm

In [32]:
knn = KNeighborsClassifier(n_neighbors=20) #randomly initialized k-value of 20

In [33]:
knn.fit(X_train,y_train) #fitting the algorithm with training data

KNeighborsClassifier(n_neighbors=20)

In [34]:
y_pred = knn.predict(X_test)  #predicting how the algorithm works with test data

In [35]:
mse = mean_squared_error(y_pred,y_test) #testing the error between predicted data and the actual data
mse

0.18279649148669566

In [36]:
print(confusion_matrix(y_pred,y_test)) #making a confusion matrix of the predicted data and actual data

[[9413 1757]
 [ 723 1674]]


In [37]:
print(classification_report(y_pred,y_test)) #making a classification report of the prediction

              precision    recall  f1-score   support

           0       0.93      0.84      0.88     11170
           1       0.49      0.70      0.57      2397

    accuracy                           0.82     13567
   macro avg       0.71      0.77      0.73     13567
weighted avg       0.85      0.82      0.83     13567



In [38]:
#accuracy of our model is found to be 82 %  