In [1]:
import numpy as np
import pandas as pd 


In [2]:
data = pd.read_csv("adult.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
print(data.info())
print(data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
(32561, 15)


In [4]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

***There were no empty cells in the datset when i checked but when i saw the dataset i saw there are some values marked as "?", so i have to change it to NaN first and then i handled it like empty cells ***

In [5]:
data.isin(['?']).sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [6]:
data.replace('?', np.nan , inplace=True)

In [7]:
data.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [8]:
data['workclass'] = data['workclass'].fillna(data['workclass'].mode()[0])
data['occupation'] = data['occupation'].fillna(data['occupation'].mode()[0])
data['native.country'] = data['native.country'].fillna(data['native.country'].mode()[0])

In [9]:
data.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

***There are some vlaues other than integer also hence we use label encoder to convert it to integer, so that it can be fitted to the model***

In [10]:
from sklearn.preprocessing import LabelEncoder
for col in data.columns:
    if data[col].dtypes == 'object':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
        
data.dtypes

age               int64
workclass         int64
fnlwgt            int64
education         int64
education.num     int64
marital.status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital.gain      int64
capital.loss      int64
hours.per.week    int64
native.country    int64
income            int64
dtype: object

In [11]:
X = data.drop(['income'],axis = 1)
y = data['income']
print(X)
print(y)

       age  workclass  fnlwgt  education  education.num  marital.status  \
0       90          3   77053         11              9               6   
1       82          3  132870         11              9               6   
2       66          3  186061         15             10               6   
3       54          3  140359          5              4               0   
4       41          3  264663         15             10               5   
...    ...        ...     ...        ...            ...             ...   
32556   22          3  310152         15             10               4   
32557   27          3  257302          7             12               2   
32558   40          3  154374         11              9               2   
32559   58          3  151910         11              9               6   
32560   22          3  201490         11              9               4   

       occupation  relationship  race  sex  capital.gain  capital.loss  \
0               9        

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# ***Logistic Regression***

In [14]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 0)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# ***SGD Classifier***

In [15]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(max_iter = 11, tol=None)
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)

# ***Decision Tree Classifier***

In [16]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(max_depth=2)
dct.fit(X_train,y_train)
dct_pred = dct.predict(X_test)

# ***Random Forest Classifier***

In [17]:
from sklearn.ensemble import RandomForestClassifier
rd = RandomForestClassifier(n_estimators= 101, n_jobs=-1)
rd.fit(X_train,y_train)
rd_pred = rd.predict(X_test)

# ***K Neighbors Classifier***

In [18]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train,y_train)
knn_pred = knn.predict(X_test)

> > # ***Calculating the accuracy score for the above models***

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
ac_lr = round(accuracy_score(y_test, lr_pred)*100,2)
ac_sgd = round(accuracy_score(y_test, sgd_pred)*100,2)
ac_dct = round(accuracy_score(y_test, dct_pred)*100,2)
ac_rd = round(accuracy_score(y_test, rd_pred)*100,2)
ac_knn = round(accuracy_score(y_test, knn_pred)*100,2)
########################################################################

cm_lr = confusion_matrix(y_test, lr_pred)
cm_sgd = confusion_matrix(y_test, sgd_pred)
cm_dct = confusion_matrix(y_test, dct_pred)
cm_rd = confusion_matrix(y_test, rd_pred)
cm_knn = confusion_matrix(y_test, knn_pred)

In [20]:
results = pd.DataFrame({
    'Model': ['Logistic Regression','Stochastic Gradient Decent','Decision Tree','Random Forest',  
              'KNN'],
    'Accuracy': [ac_lr,ac_sgd,ac_dct,ac_rd,ac_knn],
    'Confusion_Matrix':[cm_lr,cm_sgd,cm_dct,cm_rd,cm_knn]})                
result_df = results.sort_values(by='Accuracy', ascending=False)
result_df = result_df.set_index('Model')
result_df.head(5)

Unnamed: 0_level_0,Accuracy,Confusion_Matrix
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Random Forest,85.48,"[[4622, 354], [592, 945]]"
Stochastic Gradient Decent,82.45,"[[4673, 303], [840, 697]]"
Logistic Regression,82.39,"[[4686, 290], [857, 680]]"
Decision Tree,82.39,"[[4754, 222], [925, 612]]"
KNN,82.2,"[[4461, 515], [644, 893]]"
