In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [4]:
# Read the census CSV (path is relative to the working directory) into
# `dataset` and preview the first five rows as the cell output.
csv_path = 'Adult Census Income.csv'
dataset = pd.read_csv(csv_path)
dataset.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
# Print the value frequencies of every column in turn, so placeholder values
# (such as '?') and rare categories are easy to spot.
separator = "------------------------------------------------------"
for column_name in dataset.columns:
    print(column_name)
    print(dataset[column_name].value_counts())
    print(separator)

age
36    898
31    888
34    886
23    877
35    876
     ... 
83      6
85      3
88      3
86      1
87      1
Name: age, Length: 73, dtype: int64
------------------------------------------------------
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
------------------------------------------------------
fnlwgt
164190    13
123011    13
203488    13
113364    12
148995    12
          ..
201155     1
127384     1
270522     1
315804     1
145522     1
Name: fnlwgt, Length: 21648, dtype: int64
------------------------------------------------------
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school

In [4]:
# 'education' is removed and 'education.num' is kept: the original author's
# note describes the two columns as correlated attributes.
# The drop mutates `dataset` in place, so running this cell a second time
# raises KeyError because the column is already gone.
dataset.drop(columns='education', inplace=True)

In [5]:
# Replace the literal '?' placeholder in each listed column with a fixed
# category. Only the string '?' is replaced; real nulls (NaN) are not touched.
# 'Private' is the most frequent workclass in the value counts printed above;
# the other two fill values are presumably the modes of their columns as well
# (per the original author's note) — TODO confirm.
placeholder_fill = {
    'workclass': 'Private',
    'occupation': 'Prof-specialty',
    'native.country': 'United-States',
}
for column_name, fill_value in placeholder_fill.items():
    dataset[column_name] = dataset[column_name].replace('?', fill_value)
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [6]:
# Collapse the marital-status categories into three coarse groups:
# 'Married', 'Not Married' and 'Other'.
dataset['marital.status'] = dataset['marital.status'].replace(
    ['Married-AF-spouse', 'Married-civ-spouse'], 'Married'
)
# The other category names in this column are lower-case after the hyphen
# (e.g. 'Married-civ-spouse'), so the never-married label is presumably
# spelled 'Never-married'; the earlier 'Never-Married' pattern would then
# match nothing and leave the category unrenamed. Both spellings are accepted.
dataset['marital.status'] = dataset['marital.status'].replace(
    ['Never-married', 'Never-Married'], 'Not Married'
)
dataset['marital.status'] = dataset['marital.status'].replace(
    ['Divorced', 'Separated', 'Married-spouse-absent', 'Widowed'], 'Other'
)
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,9,Other,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,9,Other,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,10,Other,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,4,Other,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,10,Other,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [7]:
# String-valued columns that are converted to integer codes below.
# The target column 'income' is included, so it is encoded here as well.
categorical_columns = [
    'workclass', 'marital.status', 'occupation', 'relationship',
    'race', 'sex', 'native.country', 'income',
]

# One LabelEncoder per column, kept in `label_encoders` (keyed by column name)
# so the fitted encoders remain available after the columns are overwritten.
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    dataset[name] = encoder.fit_transform(dataset[name])


In [8]:
# Separate the target ('income') from the remaining feature columns.
y = dataset['income']
X = dataset.drop(columns='income')

In [9]:

# Score every feature against the target with the chi-squared statistic and
# keep the highest-scoring ones.
# NOTE(review): the selector is fitted on the full X / y, i.e. before the
# train/test split performed further down — confirm this is acceptable.
num_features_to_select = 10  # how many top-scoring features to keep
selector = SelectKBest(score_func=chi2, k=num_features_to_select)

# Fitting computes one score per column; the transform keeps only the
# selected columns.
X_new = selector.fit_transform(X, y)

chi2_scores = selector.scores_
selected_feature_indices = selector.get_support(indices=True)

# Report one score per feature, identified by its position in X.
for idx in range(len(chi2_scores)):
    print(f"Feature {idx}: Chi-squared score = {chi2_scores[idx]:.4f}")

print("\n\n Selected Feature Indices:", selected_feature_indices)


print("\n\n")

# Translate the selected positions back into column names.
features = list(X.columns)
for i, feature_name in enumerate(features):
    if i in selected_feature_indices:
        print("Features ", i, ":", feature_name)

Feature 0: Chi-squared score = 8600.6118
Feature 1: Chi-squared score = 0.0935
Feature 2: Chi-squared score = 171147.6829
Feature 3: Chi-squared score = 2401.4218
Feature 4: Chi-squared score = 3759.5273
Feature 5: Chi-squared score = 100.3564
Feature 6: Chi-squared score = 3659.1431
Feature 7: Chi-squared score = 33.0313
Feature 8: Chi-squared score = 502.4394
Feature 9: Chi-squared score = 82192467.1415
Feature 10: Chi-squared score = 1372145.8902
Feature 11: Chi-squared score = 6476.4090
Feature 12: Chi-squared score = 17.4342


 Selected Feature Indices: [ 0  2  3  4  5  6  8  9 10 11]



Features  0 : age
Features  2 : fnlwgt
Features  3 : education.num
Features  4 : marital.status
Features  5 : occupation
Features  6 : relationship
Features  8 : sex
Features  9 : capital.gain
Features  10 : capital.loss
Features  11 : hours.per.week


In [10]:
# Take the selected column names straight from the fitted selector instead of
# repeating them by hand, so this list cannot drift from the selection when
# `num_features_to_select` or the data changes. The boolean support mask is
# aligned with X's columns, so the names keep X's column order.
selected_columns = list(X.columns[selector.get_support()])
x = X[selected_columns]

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [12]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same fitted
# scaling to both splits. X_train / X_test are overwritten with the scaled
# output.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


# Train five classifiers on the scaled training split. Each fit() call returns
# the fitted estimator, so construction and training are chained.
# `model` is the XGBoost classifier; it is the object pickled at the end of
# the notebook.
model = XGBClassifier().fit(X_train, Y_train)
logreg_model = LogisticRegression().fit(X_train, Y_train)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
nb_model = GaussianNB().fit(X_train, Y_train)
svm_model = SVC(kernel='linear').fit(X_train, Y_train)

# Predict on the held-out split, in the same estimator order as above.
y_pred, y_pred_logreg, y_pred_knn, y_pred_nb, y_pred_svm = (
    estimator.predict(X_test)
    for estimator in (model, logreg_model, knn_model, nb_model, svm_model)
)

# Accuracy of each set of predictions against the true test labels.
accuracy_xgb, accuracy_logreg, accuracy_knn, accuracy_nb, accuracy_svm = (
    accuracy_score(Y_test, predictions)
    for predictions in (y_pred, y_pred_logreg, y_pred_knn, y_pred_nb, y_pred_svm)
)

# Print one aligned line per model.
for label, accuracy in (
    ("XGBoosting Accuracy             :   ", accuracy_xgb),
    ("Logistic Regression Accuracy    :   ", accuracy_logreg),
    ("k-Nearest Neighbors Accuracy    :   ", accuracy_knn),
    ("Naive Bayes Accuracy            :   ", accuracy_nb),
    ("Support Vector Machine Accuracy :   ", accuracy_svm),
):
    print(label, accuracy)


XGBoosting Accuracy             :    0.8681099339781975
Logistic Regression Accuracy    :    0.8380162751420236
k-Nearest Neighbors Accuracy    :    0.8340242591739597
Naive Bayes Accuracy            :    0.7994779671426377
Support Vector Machine Accuracy :    0.8404729003531399


In [14]:
import pickle 
# Serialize the trained XGBoost classifier to disk. The `with` block closes
# the file handle (and flushes the data) even if pickling raises; the earlier
# bare open() call never closed it.
# NOTE(review): only `model` is saved. It was trained on features encoded by
# `label_encoders` and scaled by `sc`, and neither is persisted here —
# confirm the consumer of this file reproduces that preprocessing.
with open("income_pred.pkl", "wb") as model_file:
    pickle.dump(model, model_file)