#**IMPORTING THE DATASET**

In [1]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


#**Pre-processing X**

In [2]:
from sklearn import preprocessing

# Label encoding columns
le = preprocessing.LabelEncoder()
X.loc[:, 'workclass'] = le.fit_transform(X.loc[:, 'workclass'])
X.loc[:, 'education'] = le.fit_transform(X.loc[:, 'education'])
X.loc[:, 'marital-status'] = le.fit_transform(X.loc[:, 'marital-status'])
X.loc[:, 'occupation'] = le.fit_transform(X.loc[:, 'occupation'])
X.loc[:, 'relationship'] = le.fit_transform(X.loc[:, 'relationship'])
X.loc[:, 'race'] = le.fit_transform(X.loc[:, 'race'])
X.loc[:, 'sex'] = le.fit_transform(X.loc[:, 'sex'])

# Dropping unnecessary columns
X = X.drop(['fnlwgt', 'capital-gain', 'capital-loss', 'native-country'], axis=1)
X = X.dropna()

X

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week
0,39,7,9,13,4,1,1,4,1,40
1,50,6,9,13,2,4,0,4,1,13
2,38,4,11,9,0,6,1,4,1,40
3,53,4,1,7,2,6,0,2,1,40
4,28,4,9,13,2,10,5,2,0,40
...,...,...,...,...,...,...,...,...,...,...
48837,39,4,9,13,0,10,1,4,0,36
48838,64,9,11,9,6,15,2,2,1,40
48839,38,4,9,13,2,10,0,4,1,50
48840,44,4,9,13,0,1,3,1,1,40


#**Pre-processing Y**

In [3]:
y = y.dropna()

# Replacing unwanted data with the real one
y = y.replace('>50K.', '>50K')
y = y.replace('<=50K.', '<=50K')

# Label encode the output
y = le.fit_transform(y.values.ravel())
y

array([0, 0, 0, ..., 0, 0, 1])

#**SPLITTING THE DATA INTO TRAIN AND TEST**

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#**Building and training the model**

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

logreg = LogisticRegression(random_state=16, solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train)
print(logreg)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
print(dtree)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(rf)

sv = svm.SVC(kernel='linear')
sv.fit(X_train, y_train)
print(sv)

naiveBayes = GaussianNB()
naiveBayes.fit(X_train, y_train)
print(naiveBayes)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print(knn)

LogisticRegression(max_iter=1000, random_state=16)
DecisionTreeClassifier()
RandomForestClassifier()
SVC(kernel='linear')
GaussianNB()
KNeighborsClassifier(n_neighbors=3)


#**Accuracy of every other Algorithm**

In [6]:
from sklearn import metrics

y_pred = logreg.predict(X_test)
print("Accuracy (Logistic Regression):",metrics.accuracy_score(y_test, y_pred))

y_pred = dtree.predict(X_test)
print("Accuracy (Decision Tree):",metrics.accuracy_score(y_test, y_pred))

y_pred = rf.predict(X_test)
print("Accuracy (Random Forest):",metrics.accuracy_score(y_test, y_pred))

y_pred = sv.predict(X_test)
print("Accuracy (SVM):",metrics.accuracy_score(y_test, y_pred))

y_pred = naiveBayes.predict(X_test)
print("Accuracy (Naive Bayes):",metrics.accuracy_score(y_test, y_pred))

y_pred = knn.predict(X_test)
print("Accuracy (KNN):",metrics.accuracy_score(y_test, y_pred))

Accuracy (Logistic Regression): 0.8078616030299929
Accuracy (Decision Tree): 0.7854437506397789
Accuracy (Random Forest): 0.8195311700276384
Accuracy (SVM): 0.8119561879414474
Accuracy (Naive Bayes): 0.775104923738356
Accuracy (KNN): 0.7960896714095609
