In [3]:
# Importing dependencies
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# For Hypothesis testing
import statsmodels.formula.api as smf

### Loading the Dataset

In [4]:
# Read the gzipped census file into a DataFrame. Column names are supplied
# here, so the first line of the file is read as data rather than as a header.
# NOTE(review): in the head() preview 'FEDERALTAX' holds filer-status strings
# and 'TaxFilerStat' holds region strings -- the names look shifted by one;
# confirm against the dataset documentation before relying on them.
# NOTE(review): .info() reports 874 nulls in 'HispanicOrigin'; presumably a
# literal "NA" category is parsed as NaN by read_csv's default NA handling --
# confirm (keep_default_na=False would preserve it as a string).
path = "../data/census-income.data.gz"
censusColnames = [
    'Age', 'ClassOfWorker', 'Industry', 'Occupation', 'Education',
    'WagePerHr', 'EducationalInst', 'MaritalStatus', 'IndustryCode',
    'OccupationCode', 'Race', 'HispanicOrigin', 'Sex', 'MemLabourUnion',
    'UnemploymentReason', 'EmploymentStatus', 'CapitalGain', 'CapitalLoss',
    'Dividends', 'FEDERALTAX', 'TaxFilerStat', 'PrevState',
    'HouseholdStatus', 'HouseholdSummary', 'INSTANCEWEIGHT',
    'MigrationCode (MSA)', 'MigrationCode (REG)',
    'MigrationCode (WITHIN REG)', 'LiveInHouse',
    'MigrationPrevResInSunbelt', 'NumOfPersonForEmployer', 'Parent',
    'BirthCountryFather', 'BirthCountryMother', 'BirthCountrySelf',
    'Citizenship', 'OwnBusiness', 'VeteranQA', 'VeteranBenefits',
    'WeeksWorked', 'Year', 'Income',
]
censusDf = pd.read_csv(
    path,
    sep=',',
    skipinitialspace=True,   # values in the file are written as ", value"
    names=censusColnames,
    header=None,             # same as 'infer' when `names` is passed
)

# Report the size of the loaded frame
nRows, nCols = censusDf.shape
print(nRows, "rows,", nCols, "columns")

# Preview the first five rows without truncating the column list
with pd.option_context('display.max_columns', None):
    display(censusDf.head())

199523 rows, 42 columns


Unnamed: 0,Age,ClassOfWorker,Industry,Occupation,Education,WagePerHr,EducationalInst,MaritalStatus,IndustryCode,OccupationCode,Race,HispanicOrigin,Sex,MemLabourUnion,UnemploymentReason,EmploymentStatus,CapitalGain,CapitalLoss,Dividends,FEDERALTAX,TaxFilerStat,PrevState,HouseholdStatus,HouseholdSummary,INSTANCEWEIGHT,MigrationCode (MSA),MigrationCode (REG),MigrationCode (WITHIN REG),LiveInHouse,MigrationPrevResInSunbelt,NumOfPersonForEmployer,Parent,BirthCountryFather,BirthCountryMother,BirthCountrySelf,Citizenship,OwnBusiness,VeteranQA,VeteranBenefits,WeeksWorked,Year,Income
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Other Rel 18+ ever marr not in subfamily,Other relative of householder,1700.09,?,?,?,Not in universe under 1 year old,?,0,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Head of household,South,Arkansas,Householder,Householder,1053.55,MSA to MSA,Same county,Same county,No,Yes,1,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,Asian or Pacific Islander,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Child 18+ never marr Not in a subfamily,Child 18 or older,991.95,?,?,?,Not in universe under 1 year old,?,0,Not in universe,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,1758.14,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,1069.16,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
censusDf.describe()

Unnamed: 0,Age,Industry,Occupation,WagePerHr,CapitalGain,CapitalLoss,Dividends,INSTANCEWEIGHT,NumOfPersonForEmployer,OwnBusiness,VeteranBenefits,WeeksWorked,Year
count,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0
mean,34.494199,15.35232,11.306556,55.426908,434.71899,37.313788,197.529533,1740.380269,1.95618,0.175438,1.514833,23.174897,94.499672
std,22.310895,18.067129,14.454204,274.896454,4697.53128,271.896428,1984.163658,993.768156,2.365126,0.553694,0.851473,24.411488,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.87,0.0,0.0,0.0,0.0,94.0
25%,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1061.615,0.0,0.0,2.0,0.0,94.0
50%,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1618.31,1.0,0.0,2.0,8.0,94.0
75%,50.0,33.0,26.0,0.0,0.0,0.0,0.0,2188.61,4.0,0.0,2.0,52.0,95.0
max,90.0,51.0,46.0,9999.0,99999.0,4608.0,99999.0,18656.3,6.0,2.0,2.0,52.0,95.0


## Wrangling

In [None]:
# Numeric (continuous / count) features; the list is named `ordinalFeatures`
ordinalFeatures = ['Age', 'WagePerHr', 'CapitalGain', 'CapitalLoss', 'Dividends',
                   'INSTANCEWEIGHT', 'NumOfPersonForEmployer', 'WeeksWorked']

# Nominal features. This includes integer-coded categories such as
# 'Industry', 'Occupation', 'OwnBusiness', 'VeteranBenefits' and 'Year'.
nominalFeatures = ['ClassOfWorker', 'Industry', 'Occupation', 'Education',
                   'EducationalInst', 'MaritalStatus', 'IndustryCode', 'OccupationCode',
                   'Race', 'HispanicOrigin', 'Sex', 'MemLabourUnion',
                   'UnemploymentReason', 'EmploymentStatus', 'FEDERALTAX',
                   'TaxFilerStat', 'PrevState', 'HouseholdStatus',
                   'HouseholdSummary', 'MigrationCode (MSA)', 'MigrationCode (REG)',
                   'MigrationCode (WITHIN REG)', 'LiveInHouse', 'MigrationPrevResInSunbelt',
                   'Parent', 'BirthCountryFather', 'BirthCountryMother',
                   'BirthCountrySelf', 'Citizenship', 'OwnBusiness', 'VeteranQA', 'VeteranBenefits',
                   'Year', 'Income']

# Check that the two lists together cover every column exactly once.
# Comparing the sorted names (rather than only the lengths) also catches
# misspelled, duplicated or missing column names.
print(sorted(ordinalFeatures + nominalFeatures) == sorted(censusColnames))

### Missing Values

In [6]:
# Column dtypes and non-null counts, followed by summary statistics of the numeric columns
censusDf.info()
censusDf.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 42 columns):
Age                           199523 non-null int64
ClassOfWorker                 199523 non-null object
Industry                      199523 non-null int64
Occupation                    199523 non-null int64
Education                     199523 non-null object
WagePerHr                     199523 non-null int64
EducationalInst               199523 non-null object
MaritalStatus                 199523 non-null object
IndustryCode                  199523 non-null object
OccupationCode                199523 non-null object
Race                          199523 non-null object
HispanicOrigin                198649 non-null object
Sex                           199523 non-null object
MemLabourUnion                199523 non-null object
UnemploymentReason            199523 non-null object
EmploymentStatus              199523 non-null object
CapitalGain                   199523 non-null

Unnamed: 0,Age,Industry,Occupation,WagePerHr,CapitalGain,CapitalLoss,Dividends,INSTANCEWEIGHT,NumOfPersonForEmployer,OwnBusiness,VeteranBenefits,WeeksWorked,Year
count,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0
mean,34.494199,15.35232,11.306556,55.426908,434.71899,37.313788,197.529533,1740.380269,1.95618,0.175438,1.514833,23.174897,94.499672
std,22.310895,18.067129,14.454204,274.896454,4697.53128,271.896428,1984.163658,993.768156,2.365126,0.553694,0.851473,24.411488,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.87,0.0,0.0,0.0,0.0,94.0
25%,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1061.615,0.0,0.0,2.0,0.0,94.0
50%,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1618.31,1.0,0.0,2.0,8.0,94.0
75%,50.0,33.0,26.0,0.0,0.0,0.0,0.0,2188.61,4.0,0.0,2.0,52.0,95.0
max,90.0,51.0,46.0,9999.0,99999.0,4608.0,99999.0,18656.3,6.0,2.0,2.0,52.0,95.0


* We can observe from the above statistics that there are no missing values in the numerical columns of the dataset.
* The `info()` output also shows that the nominal column `HispanicOrigin` has 198649 non-null entries out of 199523, i.e. 874 nulls.
* From the first five rows of the dataframe displayed above, we saw that there are some garbage/missing values labelled as '?'; let's try to track them.

In [7]:
# Report, per column, how many entries are the '?' placeholder
for colName in censusDf.columns:
    isPlaceholder = censusDf[colName].isin(['?'])
    if isPlaceholder.any():
        print(censusDf.loc[isPlaceholder, colName].value_counts())

?    708
Name: PrevState, dtype: int64
?    99696
Name: MigrationCode (MSA), dtype: int64
?    99696
Name: MigrationCode (REG), dtype: int64
?    99696
Name: MigrationCode (WITHIN REG), dtype: int64
?    99696
Name: MigrationPrevResInSunbelt, dtype: int64
?    6713
Name: BirthCountryFather, dtype: int64
?    6119
Name: BirthCountryMother, dtype: int64
?    3393
Name: BirthCountrySelf, dtype: int64


Imputing the above missing values does not make much sense, as they are nominal values. Let us label all of them as 'Unavailable' instead. Also, there are four columns in which almost 50% of the values are '?'; it is better to drop those columns, as a high proportion of missing values can be misleading.

In [8]:
# The four migration columns are '?' in roughly half of the rows
# (99696 of 199523), so they are left out of the cleaned frame.
highMissingCols = [
    'MigrationCode (MSA)',
    'MigrationCode (REG)',
    'MigrationCode (WITHIN REG)',
    'MigrationPrevResInSunbelt',
]

# Build the cleaned frame from the raw one: drop those columns, then relabel
# every remaining '?' entry as 'Unavailable'.
censusDf_cleaned = (
    censusDf
    .drop(highMissingCols, axis=1)
    .replace('?', 'Unavailable')
)

In [10]:
# Confirm the relabelling: count the 'Unavailable' entries per column
for colName in censusDf_cleaned.columns:
    isUnavailable = censusDf_cleaned[colName].isin(['Unavailable'])
    if isUnavailable.any():
        print(censusDf_cleaned.loc[isUnavailable, colName].value_counts())

Unavailable    708
Name: PrevState, dtype: int64
Unavailable    6713
Name: BirthCountryFather, dtype: int64
Unavailable    6119
Name: BirthCountryMother, dtype: int64
Unavailable    3393
Name: BirthCountrySelf, dtype: int64


### Feature Engineering

In [8]:
# Encode the response as a 0/1 integer column.
# '- 50000.' is the baseline: 0 for '- 50000.' and 1 for '50000+.'.
# get_dummies orders its columns by label, so position 1 is the '50000+.'
# indicator. Selecting it as a Series and casting to int keeps the response
# numeric regardless of pandas version (get_dummies yields bool on pandas >= 2.0,
# which the formula-based OLS below would not treat as a numeric response).
censusDf_cleaned['Income'] = (
    pd.get_dummies(censusDf_cleaned['Income']).iloc[:, 1].astype(int)
)

## Problem Statement

>From the various features in the census data set our aim is to build a predictive model to determine whether the income level for the people in United States exceeds the bracket of $50,000.

## Hypothesis Generation

From our problem statement it is clear that this is a binary classification problem.

Let us generate some hypotheses which will help us in building the models more efficiently. We need to figure out some hypotheses which might influence our final outcome, hence we need to answer a simple question.

**Is There a Relationship Between the Response and Predictors?**

To test this we use the test between the Null Hypothesis $H_0$ versus the Alternate Hypothesis $H_a$.
* $H_0$ : There is no relationship between the response Income and the predictors.
    * To test the Null Hypothesis we test whether all the regression coefficients are zero.
* $H_a$ : There is some relationship between the response and the predictors.
    * To test the Alternate Hypothesis we find at least one coefficient that is non-zero.
    
*To perform the hypothesis test we will fit a multiple linear regression on the numerically typed columns of the dataset using the **statsmodels** library.*


In [9]:
# Linear model of Income on the numerically typed columns, used for the
# initial hypothesis test.
# NOTE(review): Industry, Occupation, OwnBusiness, VeteranBenefits and Year
# are listed as nominal in the Wrangling section but enter here as plain
# numeric terms (no C(...) wrapper) -- confirm this is intended.
incomeFormula = ("Income ~ Age + Industry + Occupation + "
                 "WagePerHr + CapitalGain + CapitalLoss + Dividends + "
                 "INSTANCEWEIGHT + NumOfPersonForEmployer + OwnBusiness +"
                 "VeteranBenefits + WeeksWorked + Year")
hypothesis_test_model = smf.ols(formula=incomeFormula, data=censusDf_cleaned).fit()

# Show the regression summary (coefficients, t-tests, overall F-test)
hypothesis_test_model.summary()

0,1,2,3
Dep. Variable:,Income,R-squared:,0.195
Model:,OLS,Adj. R-squared:,0.195
Method:,Least Squares,F-statistic:,3710.0
Date:,"Mon, 09 Oct 2017",Prob (F-statistic):,0.0
Time:,02:22:46,Log-Likelihood:,22185.0
No. Observations:,199523,AIC:,-44340.0
Df Residuals:,199509,BIC:,-44200.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4296,0.092,-4.688,0.000,-0.609,-0.250
Age,0.0010,3.15e-05,32.642,0.000,0.001,0.001
Industry,0.0004,4.23e-05,9.753,0.000,0.000,0.000
Occupation,-0.0040,4.61e-05,-86.916,0.000,-0.004,-0.004
WagePerHr,-8.225e-06,1.81e-06,-4.545,0.000,-1.18e-05,-4.68e-06
CapitalGain,9.776e-06,1.05e-07,93.318,0.000,9.57e-06,9.98e-06
CapitalLoss,9.952e-05,1.8e-06,55.394,0.000,9.6e-05,0.000
Dividends,1.552e-05,2.48e-07,62.566,0.000,1.5e-05,1.6e-05
INSTANCEWEIGHT,2.085e-06,4.89e-07,4.266,0.000,1.13e-06,3.04e-06

0,1,2,3
Omnibus:,119596.189,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,995579.191
Skew:,2.882,Prob(JB):,0.0
Kurtosis:,12.302,Cond. No.,895000.0


We can see from the above result that the overall F-statistic is large (3710) with a p-value of effectively zero, that none of the estimated coefficients are zero, and that some of the features have significant p-values. This indicates that there is a significant relationship between the predictors and the response. 

* Hence we reject our Null Hypothesis $H_0$.

We should keep in mind that we have not considered all the features for our hypothesis generation; we will explore the nominal features further in the coming sections.

## Baseline

In order to evaluate our model we should define some baseline. Let us generate some statistics about our response variable so that we can set our baseline.

In [10]:
# Class counts of the encoded response (0 = '- 50000.', 1 = '50000+.')
incomeCount = censusDf_cleaned['Income'].value_counts()
print(incomeCount)

# Share of rows in class 0, i.e. the accuracy of always predicting '- 50000.'
totalRows = len(censusDf_cleaned['Income'])
belowThresholdShare = incomeCount[0] / totalRows
print(float(belowThresholdShare))

0    187141
1     12382
Name: Income, dtype: int64
0.9379419916500854


Most of the values in the response variable, Income, are 0, which means that the dataset is heavily skewed towards incomes below \$50,000. Consequently, even a model that always predicts "below \$50,000" would reach an accuracy of **93.79%**; this is the baseline our model has to beat.

---

## Rough work