In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Dataset location (relative to the notebook's working directory).
filename = 'adult-all.csv'

# The CSV has no header row; '?' entries are read as missing values.
dataframe = pd.read_csv(filename, header=None, na_values='?')
dataframe.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label']

# Drop every row that contains a missing value. The explicit copy makes the
# column assignments below write to an independent frame (this avoids
# SettingWithCopyWarning on some pandas versions).
dataframe = dataframe.dropna().copy()

# Replace each categorical column (including the target) with its integer
# category code.
# NOTE(review): the codes carry an arbitrary ordering; confirm that is
# acceptable for the linear models trained later.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'label']
for column in categorical_columns:
    dataframe[column] = dataframe[column].astype('category').cat.codes

# 'education-num' is removed before modelling
# (presumably redundant with the encoded 'education' column -- TODO confirm).
colstoremove = ['education-num']
dataframe = dataframe.drop(columns=colstoremove)

# Preview only the first rows instead of rendering the whole frame.
dataframe.head(10)




Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,5,77516,9,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,2,9,5,2,0,0,0,40,4,0
5,37,2,284582,12,2,3,5,4,0,0,0,40,38,0
6,49,2,160187,6,3,7,1,2,0,0,0,16,22,0
7,52,4,209642,11,2,3,0,4,1,0,0,45,38,1
8,31,2,45781,12,4,9,1,4,0,14084,0,50,38,1
9,42,2,159449,9,2,3,0,4,1,5178,0,40,38,1


In [2]:
#splitting train and test data
def splitandnormalizedata(dataframe, test_size=0.20, random_state=1):
    """Split a frame into train/test sets and standardize the features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'label' column (the target); every other column is
        treated as a feature.
    test_size : float, default 0.20
        Passed to ``train_test_split``.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. Both feature frames are
        scaled with a ``StandardScaler`` fitted on the training part only,
        and both keep the feature column names and the original row index.
    """
    x = dataframe.drop('label', axis=1)
    y = dataframe['label']
    x_train_raw, x_test_raw, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    #normalizing dataset
    scaler = StandardScaler()
    # Fit on the training data only so no test-set statistics leak into the scaler.
    x_train = pd.DataFrame(
        scaler.fit_transform(x_train_raw),
        columns=x.columns,
        index=x_train_raw.index,
    )
    # Keep the same column names as x_train: estimators fitted on named
    # features would otherwise see unnamed columns at predict time.
    x_test = pd.DataFrame(
        scaler.transform(x_test_raw),
        columns=x.columns,
        index=x_test_raw.index,
    )
    return x_train, x_test, y_train, y_test





In [3]:
def trainmodel(model, x_train, y_train, x_test, y_test=None):
    """Fit ``model`` and return its test accuracy as a percentage.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place as a side effect.
    x_train, y_train : training features and labels.
    x_test : features to predict on.
    y_test : true labels for ``x_test``, optional.
        When omitted, the module-level ``y_test`` variable is used. That
        preserves the behavior of existing four-argument calls, which relied
        on that global implicitly.

    Returns
    -------
    float
        ``accuracy_score`` of the predictions, multiplied by 100 and rounded
        to two decimals.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    if y_test is None:
        # The parameter shadows the global name, so look it up explicitly.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError(
                "y_test was not passed and no global 'y_test' is defined"
            ) from None
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    return round(accuracy_score(y_test, y_pred) * 100, 2)

In [4]:
# Build the scaled train/test splits used by the model cells that follow.
x_train, x_test, y_train, y_test = splitandnormalizedata(dataframe)

In [5]:
# Logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
lr_accuracy = trainmodel(lr, x_train, y_train, x_test)
print(lr_accuracy)


79.35




In [6]:
# Random forest classifier.
# The forest is randomized, so it is seeded (same seed as the train/test
# split) to make the reported accuracy reproducible across runs.
rf = RandomForestClassifier(random_state=1)
print(trainmodel(rf, x_train, y_train, x_test))




84.68


In [7]:
# Gaussian naive Bayes with default settings.
nb = GaussianNB()
nb_accuracy = trainmodel(nb, x_train, y_train, x_test)
print(nb_accuracy)




79.02


In [8]:
# Linear support vector classifier.
# Seeded so repeated runs of this cell report the same accuracy.
svc = LinearSVC(random_state=1)
print(trainmodel(svc, x_train, y_train, x_test))


79.64




In [9]:
# Decision tree classifier.
# Seeded so repeated runs of this cell report the same accuracy.
dt = DecisionTreeClassifier(random_state=1)
print(trainmodel(dt, x_train, y_train, x_test))


80.92


In [10]:
#importance of feature in random forest
def calculateimportanceoffeatures(model, features=None):
    """Return a table of feature importances, most important first.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    features : sequence of str, optional
        Names matching ``model.feature_importances_`` position by position.
        When omitted, the names come from ``model.feature_names_in_`` if the
        model has that attribute, and otherwise from the columns of the
        module-level ``x_train`` (the original behavior).

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'importance score' (rounded to two decimals),
        sorted by score in descending order. The index holds each feature's
        original position.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    if features is None:
        # Prefer the names stored on the model: the global x_train may have
        # been rebuilt with different columns since the model was fitted.
        features = getattr(model, 'feature_names_in_', None)
    if features is None:
        features = x_train.columns
    features = list(features)

    importance = np.round(model.feature_importances_, 2)
    if len(features) != len(importance):
        # zip() would silently truncate and mislabel the scores.
        raise ValueError(
            'got %d feature names for %d importance scores'
            % (len(features), len(importance))
        )

    df = pd.DataFrame({'feature': features, 'importance score': importance})
    return df.sort_values(by=['importance score'], ascending=False)

    



In [11]:
# Feature importance table for the fitted random forest, highest score first.
rf_feature_importance = calculateimportanceoffeatures(rf)
rf_feature_importance

Unnamed: 0,feature,importance score
2,fnlwgt,0.18
0,age,0.15
9,capital-gain,0.13
6,relationship,0.11
3,education,0.09
5,occupation,0.08
11,hours-per-week,0.08
4,marital-status,0.07
1,workclass,0.04
10,capital-loss,0.04


In [12]:
# Removing less important features.
# NOTE(review): the importance table displayed for the first random forest
# lists 'marital-status' at 0.07, above 'workclass' and 'capital-loss'
# (0.04 each), which are kept -- confirm this column should be dropped.
colstoremove = ['sex', 'race', 'native-country', 'marital-status']
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
dataframe = dataframe.drop(columns=colstoremove, errors='ignore')

In [13]:
# Re-split on the reduced feature set and retrain the random forest.
# This rebinds x_train, x_test, y_train, y_test and rf from the earlier cells.
x_train, x_test, y_train, y_test = splitandnormalizedata(dataframe)
# Seeded so the accuracy is reproducible; an unseeded forest varies from run
# to run, which can mask or exaggerate the effect of dropping features.
rf = RandomForestClassifier(random_state=1)
print(trainmodel(rf, x_train, y_train, x_test))



85.26
