In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read adult.csv into a DataFrame. Spaces that follow a delimiter are
# skipped, and the '?' placeholder is parsed as a missing value (NaN).
income_data = pd.read_csv(
    'adult.csv',
    na_values='?',
    skipinitialspace=True,
)
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Discard every row that holds at least one missing value, modifying the frame in place.
income_data.dropna(axis=0, how='any', inplace=True)

In [4]:
# Print a summary of the frame after the missing-value drop: row count,
# per-column non-null counts and dtypes, and memory usage.
income_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
age                45222 non-null int64
workclass          45222 non-null object
fnlwgt             45222 non-null int64
education          45222 non-null object
educational-num    45222 non-null int64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
gender             45222 non-null object
capital-gain       45222 non-null int64
capital-loss       45222 non-null int64
hours-per-week     45222 non-null int64
native-country     45222 non-null object
income             45222 non-null object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [5]:
# Find the object-dtype columns among the input attributes.
# The label slice 'age':'native-country' stops before the 'income' column.
predictor_columns = income_data.loc[:, 'age':'native-country']
categorical_dataframe = predictor_columns.select_dtypes(include=object)
categorical_attributes = list(categorical_dataframe.columns)
# Positions are looked up against the full frame's column index.
categorical_indices = [
    income_data.columns.get_loc(column_name)
    for column_name in categorical_attributes
]
categorical_indices

[1, 3, 5, 6, 7, 8, 9, 13]

In [6]:
# Find the int64 columns among the input attributes ('age' through
# 'native-country') and record their positions in the full frame.
numerical_dataframe = (
    income_data
    .loc[:, 'age':'native-country']
    .select_dtypes(include=['int64'])
)
numerical_attributes = [*numerical_dataframe.columns]
numerical_indices = list(map(income_data.columns.get_loc, numerical_attributes))
numerical_indices

[0, 2, 4, 10, 11, 12]

In [7]:
#label encoding categorical attributes:
from sklearn.preprocessing import LabelEncoder
encoder_object=LabelEncoder()
# Replace each categorical column with integer codes. The same encoder is
# re-fitted for every column, so after the loop it only holds the classes
# of the last column processed.
for index in categorical_indices:
    column_name=income_data.columns[index]
    # Assign by column label rather than .iloc[:, index]: since pandas 2.0 an
    # .iloc column assignment writes into the existing object-dtype column,
    # which would leave the integer codes stored with dtype=object. Label
    # assignment swaps in the new integer array on every pandas version.
    income_data[column_name]=encoder_object.fit_transform(income_data[column_name])
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,38,<=50K
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,38,<=50K
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,38,>50K
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,38,>50K
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,38,<=50K


In [8]:
#one hot encoding the categorical features:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22.
# ColumnTransformer is the supported way to one-hot encode a subset of
# columns while passing the remaining ones through untouched.
column_encoder=ColumnTransformer(
    transformers=[('onehot',OneHotEncoder(),categorical_indices)],
    remainder='passthrough',  # non-categorical columns are appended after the one-hot block
    sparse_threshold=0,       # always return a dense array (replaces the old .toarray() call)
)
X=income_data.loc[:,'age':'native-country']
X=column_encoder.fit_transform(X)
# ColumnTransformer fits a clone of the encoder it was given; keep a handle on
# the fitted clone so `hot_encoder` still refers to a fitted OneHotEncoder.
hot_encoder=column_encoder.named_transformers_['onehot']
X=pd.DataFrame(data=X)

In [9]:
#checking the splits rendered to each categorical feature by OneHotEncoding:
# `n_values_` was removed from OneHotEncoder in scikit-learn 0.22. Count the
# fitted categories per feature instead, and fall back to `n_values_` only on
# old releases that do not expose `categories_`.
if hasattr(hot_encoder,'categories_'):
    split_counts=[len(categories) for categories in hot_encoder.categories_]
else:
    split_counts=hot_encoder.n_values_
# One row per categorical column: its position, its name, and how many
# one-hot columns it was expanded into.
splits_rendered=pd.DataFrame(data={'indices':categorical_indices,'names':income_data.columns[categorical_indices],'split_count':split_counts})
splits_rendered

Unnamed: 0,indices,names,split_count
0,1,workclass,7
1,3,education,16
2,5,marital-status,7
3,6,occupation,14
4,7,relationship,6
5,8,race,5
6,9,gender,2
7,13,native-country,41


In [10]:
# Turn the 'income' column into integer class labels, reusing the
# LabelEncoder instance created earlier (it is re-fitted here).
income_labels = income_data['income']
Y = encoder_object.fit(income_labels).transform(income_labels)

X = the expanded (one-hot encoded) dataset of input attributes

Y = the label-encoded target attribute (income)

In [11]:
# Hold out 10000 rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
from sklearn.model_selection import train_test_split
split_options = {'test_size': 10000, 'random_state': 0}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [12]:
# Scale the input attributes. The scaler's statistics are learned from the
# training rows only and then applied to both the training and test rows.
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler().fit(X_train)
X_train = standardizer.transform(X_train)
X_test = standardizer.transform(X_test)

In [13]:
# Project the data onto principal components. A fractional n_components asks
# PCA to keep as many components as needed to explain that share (85%) of
# the variance. The projection is fitted on the training rows only.
from sklearn.decomposition import PCA
pca_object = PCA(n_components=0.85)
X_train = pca_object.fit_transform(X_train)
X_test = pca_object.transform(X_test)
print('Number of Components:', pca_object.n_components_)
print('Explained Variance Ratio :', pca_object.explained_variance_ratio_)

Number of Components: 73
Explained Variance Ratio : [0.04387756 0.02992763 0.02504624 0.02249955 0.01871125 0.01712441
 0.01638694 0.01523568 0.01382803 0.01346406 0.01270211 0.01249658
 0.01195047 0.01175642 0.01169999 0.01147317 0.01133051 0.01122957
 0.01099752 0.01098778 0.0107934  0.01076328 0.01055065 0.01050035
 0.01037386 0.0103613  0.01031977 0.01029354 0.01021702 0.01018767
 0.01012602 0.0101182  0.01007452 0.01003686 0.00997547 0.00992417
 0.00989656 0.00986286 0.0098221  0.00980293 0.00972718 0.00968932
 0.00967204 0.00967093 0.00964536 0.00964114 0.00963644 0.00963152
 0.00962847 0.00962738 0.00961766 0.00961271 0.00960649 0.00959824
 0.00959122 0.00958118 0.00956155 0.00954488 0.00953421 0.00950746
 0.00948925 0.00947022 0.00943834 0.00941605 0.00937729 0.00934363
 0.00932189 0.00927715 0.00922861 0.00919858 0.00917523 0.00915159
 0.00911709]
