## Importing all necessary libraries

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,roc_curve,confusion_matrix
from sklearn.preprocessing import binarize

In [None]:
# Column names handed to read_csv via `names`, in file order.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

In [None]:
# Locations of the two census CSV files under the Kaggle input directory.
test_dir = '/kaggle/input/us-census-data/adult-test.csv'
training_dir = '/kaggle/input/us-census-data/adult-training.csv'

In [None]:
# Load the training CSV, supplying the column names explicitly via `names`.
# NOTE(review): later cells compare against values such as ' ?' with a leading space,
# so the strings are presumably loaded unstripped — confirm before adding skipinitialspace.
data=pd.read_csv(training_dir,names=columns)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Summary statistics of the frame's columns.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

## Checking for any null values

In [None]:
# Number of missing (NaN) entries per column.
# Placeholder strings such as ' ?' are ordinary values and are not counted here.
data.isnull().sum()

## Distributing 'age' into different categories



In [None]:
# Bucket each age into a named band; the first matching upper bound wins,
# and anything that matches none of them is labelled 'old'.
ages = data['age']
age_categ = np.select(
    [ages < 13, ages < 19, ages < 35, ages < 50],
    ['kid', 'teen', 'young', 'adult'],
    default='old',
).tolist()
# Insert the new column at position 1, directly after 'age'.
data.insert(1, 'age_categ', age_categ)

In [None]:
# Number of rows in each age band.
# The column is passed as x= because seaborn does not reliably treat the first
# positional argument as the x variable across versions.
sns.countplot(x=data['age_categ'])


### Dropping 'age' column



In [None]:
# Remove the raw 'age' column; 'age_categ' now carries the age information.
data = data.drop(columns=['age'])

In [None]:
# Distinct values present in the workclass column.
data['workclass'].unique()


'?' is not a valid workclass; it is a placeholder for a missing value and needs to be handled.

In [None]:
# Percentage of rows whose workclass is the ' ?' placeholder.
is_unknown = data['workclass'] == ' ?'
is_unknown.sum() / len(data) * 100
# 5 percent of workclass is filled with ?


Filling '?' in workclass with mode

In [None]:
# Replace the ' ?' placeholder with the most frequent workclass value.
# The result is assigned back to the column: an in-place replace on `data.workclass`
# acts on an intermediate Series and is not guaranteed to update `data` itself.
data['workclass'] = data['workclass'].replace(' ?', data['workclass'].mode()[0])


Never-worked and Without-pay can be treated as the same category



In [None]:
# Fold ' Never-worked' into ' Without-pay'.
# Assigned back to the column so the change lands on `data` rather than on an
# intermediate Series.
data['workclass'] = data['workclass'].replace(' Never-worked', ' Without-pay')

In [None]:
# Number of rows per workclass; the column is passed explicitly as x=.
sns.countplot(x=data['workclass'])
# Rotate the category labels once the bars exist so long names stay readable.
plt.xticks(rotation=90);

In [None]:
# Box plot of fnlwgt to look for extreme values.
data['fnlwgt'].plot.box()

There are some outliers which need to be handled.



In [None]:
# Keep only rows whose fnlwgt is below the 600000 cut-off.
# .copy() makes the result an independent frame, so the column assignments and
# in-place edits in later cells do not operate on a slice of the unfiltered frame.
data = data.loc[data['fnlwgt'] < 600000].copy()

In [None]:
# Box plot of fnlwgt again, after the outlier filter.
data['fnlwgt'].plot.box()


In [None]:
# Number of rows per education level on a 10x10 figure; the column is passed as x=.
plt.figure(figsize=(10,10))
sns.countplot(x=data['education'])
# Rotate the category labels after plotting so long names stay readable.
plt.xticks(rotation=90);

In [None]:
# Number of rows per education-num value; the column is passed explicitly as x=.
sns.countplot(x=data['education-num'])
# education-num is an alternative representation of the education column, so we can drop one of them

In [None]:
# Drop the textual 'education' column; 'education-num' is kept.
data = data.drop(columns=['education'])

Different categories in 'marital_status'



In [None]:
# Distinct values present in the marital_status column.
data['marital_status'].unique()

In [None]:
# Number of rows per marital status; the column is passed explicitly as x=.
sns.countplot(x=data['marital_status'])
# Rotate the category labels after plotting so long names stay readable.
plt.xticks(rotation=90);

In [None]:
# Distinct values present in the occupation column.
data['occupation'].unique()

Filling '?' in occupation with mode



In [None]:
# Replace the ' ?' placeholder with the most frequent occupation value.
# Assigned back to the column: an in-place replace on `data.occupation` acts on an
# intermediate Series and is not guaranteed to update `data` itself.
data['occupation'] = data['occupation'].replace(' ?', data['occupation'].mode()[0])

In [None]:

# Number of rows per occupation; the column is passed explicitly as x=.
sns.countplot(x=data['occupation'])
# Rotate the category labels after plotting so long names stay readable.
plt.xticks(rotation=90);

In [None]:
# Distinct values present in the relationship column.
data['relationship'].unique()


In [None]:
# Number of rows per relationship value; the column is passed explicitly as x=.
sns.countplot(x=data['relationship'])
# Rotate the category labels after plotting so long names stay readable.
plt.xticks(rotation=90);

In [None]:
# Number of rows per race value; the column is passed explicitly as x=.
sns.countplot(x=data['race'])
# Rotate the category labels after plotting so long names stay readable.
plt.xticks(rotation=90);

In [None]:
# Distinct values present in the race column.
data['race'].unique()


Asian-Pac-Islander and Amer-Indian-Eskimo can be merged into the Other category because they have very low counts



In [None]:
# Keep the two most frequent race values and fold every other value into ' Other'.
# The values to merge are chosen by count rather than by unique()[2:], whose
# contents depend on the order in which values first appear in the rows.
keep = data['race'].value_counts().index[:2]
replace = [value for value in data['race'].unique() if value not in keep]
for to_replace in replace:
    print(to_replace)
# Assign back to the column so the change is applied to `data` itself.
data['race'] = data['race'].replace(replace, ' Other')

In [None]:
# Number of rows per race value after merging; the column is passed explicitly as x=.
sns.countplot(x=data['race'])

In [None]:
# Number of rows per sex value; the column is passed explicitly as x=.
sns.countplot(x=data['sex'])


We can split capital gain into two categories: one with no capital gain and another with some capital gain



In [None]:
# Flag rows with a non-zero capital gain: 0 when capital_gain equals 0, otherwise 1.
data['is_capital'] = (data['capital_gain'] != 0).astype('int64')


In [None]:
# Number of rows with and without a capital gain; the column is passed explicitly as x=.
sns.countplot(x=data['is_capital'])


We can split capital loss into two categories: one with no capital loss and another with some capital loss



In [None]:
# Flag rows with a non-zero capital loss: 0 when capital-loss equals 0, otherwise 1.
data['is_loss'] = (data['capital-loss'] != 0).astype('int64')


In [None]:
# Number of rows with and without a capital loss; the column is passed explicitly as x=.
sns.countplot(x=data['is_loss'])


In [None]:
# Drop the raw capital_gain and capital-loss columns; the is_capital / is_loss flags replace them.
data = data.drop(columns=['capital_gain', 'capital-loss'])

In [None]:
# Histogram of hours-per-week using 15 bins.
data['hours-per-week'].hist(bins=15)


Dividing 'hours-per-week' into categories



In [None]:

# Labels for the four weekly-hours bands, from longest to shortest.
diff_hours_categ = ['>=60', '>40&<60', '<=40&>30', '<=30']
weekly_hours = data['hours-per-week']
# The first matching lower bound wins; anything matching none falls in the last band.
hours_categ = np.select(
    [weekly_hours >= 60, weekly_hours > 40, weekly_hours > 30],
    diff_hours_categ[:3],
    default=diff_hours_categ[3],
).tolist()
data['hours_categ_week'] = hours_categ

In [None]:
# Number of rows per weekly-hours band; the column is passed explicitly as x=.
sns.countplot(x=data['hours_categ_week'])


In [None]:
# Drop the raw hours-per-week column; hours_categ_week replaces it.
data = data.drop(columns=['hours-per-week'])

In [None]:
# Number of rows for each native-country value.
data['native-country'].value_counts()


Since every country except United-States has a low count, they can all be placed in a single category (Other)



In [None]:
# Reduce native-country to two values: ' United-States' and ' Other'.
is_us = data['native-country'] == ' United-States'
data['native-country'] = np.where(is_us, ' United-States', ' Other')


In [None]:
# Number of rows per native-country value; the column is passed explicitly as x=.
sns.countplot(x=data['native-country'])


In [None]:
# Number of rows per income class; the column is passed explicitly as x=.
sns.countplot(x=data['income'])


Finally, after performing data cleaning and feature engineering, let's take a look at our dataset


In [None]:
# Preview the first rows of the frame after the cleaning steps.
data.head()


## Now let's start the bivariate analysis


In [None]:

# Share of each income class within every age_categ group, as a stacked bar chart.
diff_categ_count = data['age_categ'].value_counts()
group_table = data.groupby(['age_categ', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Since the adult and old distributions are similar in income, we can merge them into one category.
# Assigned back to the column so the change is applied to `data` itself rather than
# to an intermediate Series.
data['age_categ'] = data['age_categ'].replace('old', 'adult')

In [None]:
# Share of each income class within every workclass group, as a stacked bar chart.
diff_categ_count = data['workclass'].value_counts()
group_table = data.groupby(['workclass', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Compare the fnlwgt distribution between the income classes, with quartile lines drawn inside.
# x and y are passed by keyword; seaborn does not reliably accept them positionally.
sns.violinplot(x=data['income'], y=data['fnlwgt'], inner='quart')


As we can see, the fnlwgt distribution is the same for both income types, so we can drop it



In [None]:
# Drop the fnlwgt column.
data = data.drop(columns=['fnlwgt'])


In [None]:
# Share of each income class within every education-num group, as a stacked bar chart.
diff_categ_count = data['education-num'].value_counts()
group_table = data.groupby(['education-num', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# 15 and 16  , 11 and 12 , 2 and 3 ,4 to 7 can be combined
# Mapping of education-num values to the value they are merged into.
replace_dict = {15: 16, 11: 12, 3: 2, 5: 4, 6: 4, 7: 4}
# Apply the mapping to 'education-num' only; a frame-wide replace would also rewrite
# any other column that happens to hold one of these numbers.
data['education-num'] = data['education-num'].replace(replace_dict)

In [None]:
# Share of each income class within every education-num group, as a stacked bar chart.
diff_categ_count = data['education-num'].value_counts()
group_table = data.groupby(['education-num', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Share of each income class within every marital_status group, as a stacked bar chart.
diff_categ_count = data['marital_status'].value_counts()
group_table = data.groupby(['marital_status', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Reduce the number of marital_status categories.
# The replacement is limited to 'marital_status'; a frame-wide replace would also
# rewrite the same strings if they appeared in any other column.
data['marital_status'] = data['marital_status'].replace({
    ' Married-civ-spouse': ' Married-AF-spouse',
    ' Married-spouse-absent': ' Widowed',
})

In [None]:

# Share of each income class within every occupation group, as a stacked bar chart.
diff_categ_count = data['occupation'].value_counts()
group_table = data.groupby(['occupation', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True, figsize=(10,10))
plt.ylabel('percentage of income categ')

In [None]:

# Share of each income class within every relationship group, as a stacked bar chart.
diff_categ_count = data['relationship'].value_counts()
group_table = data.groupby(['relationship', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Share of each income class within every race group, as a stacked bar chart.
diff_categ_count = data['race'].value_counts()
group_table = data.groupby(['race', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Share of each income class within every sex group, as a stacked bar chart.
diff_categ_count = data['sex'].value_counts()
group_table = data.groupby(['sex', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:

# Share of each income class within every native-country group, as a stacked bar chart.
diff_categ_count = data['native-country'].value_counts()
group_table = data.groupby(['native-country', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:

# Share of each income class within every is_capital group, as a stacked bar chart.
diff_categ_count = data['is_capital'].value_counts()
group_table = data.groupby(['is_capital', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Share of each income class within every is_loss group, as a stacked bar chart.
diff_categ_count = data['is_loss'].value_counts()
group_table = data.groupby(['is_loss', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Share of each income class within every hours_categ_week group, as a stacked bar chart.
diff_categ_count = data['hours_categ_week'].value_counts()
group_table = data.groupby(['hours_categ_week', 'income']).size().astype(float)
# Divide each (group, income) count by its group total, matched on the first index level.
# This replaces element-wise chained assignment (group_table[a][b] = ...), which pandas
# does not guarantee to write through to group_table.
group_table = group_table.div(diff_categ_count, level=0) * 100
group_table.unstack().plot(kind='bar', stacked=True)
plt.ylabel('percentage of income categ')

In [None]:
# Merge the two bands above 40 hours into a single '>40' band.
# The replacement is limited to 'hours_categ_week'; a frame-wide replace would also
# rewrite the same strings if they appeared in any other column.
data['hours_categ_week'] = data['hours_categ_week'].replace({
    '>=60': '>40',
    '>40&<60': '>40',
})

In [None]:
# Preview the first rows of the frame before building the feature matrix.
data.head()


In [None]:
# Print every column name, then use all of them except 'income' as model features.
all_columns = list(data.columns)
print(all_columns)

features = [name for name in all_columns if name != 'income']
# X is an independent copy of the feature columns; Y is the income column.
X = data.loc[:, features].copy()
Y = data['income']

In [None]:
# Preview the first rows of the feature matrix.
X.head()


## Label Encoding
In order to pass the features to the model for prediction, they must be numeric. So, we will use LabelEncoder to encode every categorical variable as integer codes.

In [None]:
# Refit one LabelEncoder per feature column and overwrite the column with its integer codes.
le = LabelEncoder()
encoded = {name: le.fit_transform(X[name]) for name in features}
for name, codes in encoded.items():
    X[name] = codes
# Target: 0 for ' <=50K', 1 for every other value.
Y = [int(val != ' <=50K') for val in Y]

## One-Hot Encoding
In some categorical features, where the number of unique values is large, label encoding alone would bias the model towards the integer values assigned to the different categories, which is not good. So, we will use one-hot encoding to solve this issue.

In [None]:
# Expand every feature column into one indicator column per distinct value.
X=pd.get_dummies(X,columns=features)
X.head()

## Splitting Data Into Train And Test Set


In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is reproducible.
train_x,test_x,train_y,test_y=train_test_split(X,Y,test_size=0.20,random_state=9)


## Building Model


In [None]:
# Logistic-regression classifier created with scikit-learn's default settings.
lr=LogisticRegression()


In [None]:
# Fit on the training split and report the accuracy on that same split.
lr.fit(train_x, train_y)
train_accuracy = lr.score(train_x, train_y)
print('accuracy on training data:', train_accuracy)

In [None]:
# Predict on the held-out split, then print the per-class report and the overall accuracy.
predicted_y = lr.predict(test_x)
report = classification_report(test_y, predicted_y)
print(report)
test_accuracy = accuracy_score(test_y, predicted_y)
print('accuracy_score is on test data: ', test_accuracy)

In [None]:
# Annotated confusion-matrix heatmap of the test predictions (rows: actual, columns: predicted).
fig, ax = plt.subplots(figsize=(5,5))
matrix = confusion_matrix(test_y, predicted_y)
sns.heatmap(matrix, annot=True, fmt='.5g', ax=ax)
ax.set_ylabel('actual class')
ax.set_xlabel('predicted class')