In [1]:
# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

## Step 1: Get the data Set
### Import the dataset

In [20]:
data = pd.read_csv("Data sets/adult.csv")

## Step 2: Understanding the data set

In [21]:
print("Total number of records: ", data.shape[0])
print("Total number of features: ",data.shape[1])

Total number of records:  48842
Total number of features:  15


#### Columns in the dataset
#### Target / Depedent variable = Income

In [14]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [15]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [16]:
data.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object

### Distinct value of the depedent variable / Target variable

In [22]:
print(data['income'].value_counts())

<=50K    37155
>50K     11687
Name: income, dtype: int64


#### So we have only 2 categories of Data <=50 and >=50

## Step 3: Clean the dataset
#### Has we can see most of the data in the dataset are categorical data.
We need to convert this categorical data into numerical data.
Even our Depedent var also seems to be categorical data. 

List of columns that needs to be converted to int / float data type

workclass          

education          

marital-status     

occupation         

relationship       

race               

gender             

native-country     

income             

In [23]:
data['income'] = data['income'].map(lambda x: 0 if x == '<=50K' else 1)

In [24]:
print(data['income'].value_counts())

0    37155
1    11687
Name: income, dtype: int64


In [46]:
def check_columns_for_categorical_data(data):
    cat_name = []
    for col_name in data.columns:
        if data[col_name].dtype == 'object':
            cat_name.append(col_name)
            print(col_name+" has "+ str(len(data[col_name].unique())) + " values" )
    if(len(cat_name) == 0):
        print("The data contains only numerical data")
        return(0)
    else:
        return (cat_name)
        

In [45]:
cat_name = check_columns_for_categorical_data(data)

The data contains only numerical data


Most of the columns seems to have very few unique observations except native country. Let's check the unique value of native-country

In [26]:
print(data['native-country'].value_counts())

United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Nicaragua                        49
Greece                           49
Peru                        

Looks like the majority of the data consists of US.

Let's map the data into two value 1 for US and 0 for non-US.

In [27]:
data['native-country'] = data['native-country'].map(lambda x: 1 if x == 'United-States' else 0)

In [28]:
data['native-country'].value_counts()

1    43832
0     5010
Name: native-country, dtype: int64

Now Let's convert all the category data into dummy variable data

In [29]:
print(cat_name)

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']


In [30]:
# Function to dummy all the categorical variables used for modeling
def dummy_df(df, todummy_list):
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        df = df.drop(x, 1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [33]:
data = dummy_df(data, cat_name)

In [34]:
data.columns

Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'income', 'workclass_?', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay', 'education_10th',
       'education_11th', 'education_12th', 'education_1st-4th',
       'education_5th-6th', 'education_7th-8th', 'education_9th',
       'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'marital-status_Divorced',
       'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed', 'occupation_?',
       

Lets split the Depedent variable and the independent variable

In [34]:
x = data.drop('income',1)

In [35]:
y = data['income']

In [38]:
print(x.shape,y.shape)

(48842, 68) (48842,)


#### Note: that the x is a two dimensional array where as y is a one dimentional array

In [40]:
x.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Female,gender_Male,native-country_0,native-country_1
0,25,226802,7,0,0,40,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
1,38,89814,9,0,0,50,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,28,336951,12,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1
3,44,160323,10,7688,0,40,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
4,18,103497,10,0,0,30,1,0,0,0,...,0,0,0,0,0,1,1,0,0,1


In [48]:
# Confirm that x contains only numerical data
cat_name = check_columns_for_categorical_data(data)

The data contains only numerical data


#### Handle Missing data

In [53]:
def check_null_values(data):
    null_cols = []
    for col_name in x.columns:
        null_values = x[col_name].isnull().sum()
        if (null_values != 0):
            null_cols.append(col_name)
            print(col_name + " contains "+ str(null_values))
    if len(null_cols):
        return(null_cols)
    else:
        print("Data contains no null values")
        return(0)

In [55]:
null_cols_in_x = check_null_values(x)

Data contains no null values


#### Let's train the model with out performing Feature selection to see how it impacts our model's results

In [60]:
from sklearn.model_selection import train_test_split

In [62]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state=True)

In [63]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(39073, 68) (9769, 68) (39073,) (9769,)


In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [71]:
regressor = LinearRegression()
l_regressor = LogisticRegression()

In [72]:
regressor.fit(x_train,y_train)
l_regressor.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:
print("Linear Regression model score = ", regressor.score(x_train,y_train)*100)
print("Logistic Regression model score = ",l_regressor.score(x_train,y_train)*100)

Linear Regression model score =  36.539834374432445
Logistic Regression model score =  79.7788754382822


#### So, as we can see without performing any feature selection our model has a very low score for Linear-Regression model.
Another reason why our Linear model score is so low is due to the regression line that we get for this model
When we have only 2 possible outcomes to predict it becomes a classification problem rather than a regression problem.

#### Hence we need to use a classification model.
#### But the Logistic Regression model is not so bad with a score above >75%
Classification model can also be used for data set where we would need to predict more than one class.
But for our example we have only 2 possible outcome so we go for Binary Regression model.


#### So, the score we got for our model is ~80% without performing feature selection
Now, let's try to see various methods where we can impove the model's score.

In [76]:
import sklearn.feature_selection

select = sklearn.feature_selection.SelectKBest(k=20)
selected_features = select.fit(x_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [x.columns[i] for i in indices_selected]

X_train_selected = x_train[colnames_selected]
X_test_selected = x_test[colnames_selected]

In [79]:
print(X_train_selected.shape)
print(X_train_selected.columns)

(39073, 20)
Index(['age', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_Self-emp-inc', 'education_Bachelors',
       'education_Masters', 'education_Prof-school',
       'marital-status_Married-civ-spouse', 'marital-status_Never-married',
       'occupation_Exec-managerial', 'occupation_Other-service',
       'occupation_Prof-specialty', 'relationship_Husband',
       'relationship_Not-in-family', 'relationship_Own-child',
       'relationship_Unmarried', 'gender_Female', 'gender_Male'],
      dtype='object')


For our previous model we selected about 68 models now let's see if by selecting just these 20 columns help improve the model score

In [81]:
l_regressor_2 = LogisticRegression()
l_regressor_2.fit(X_train_selected,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [84]:
print("Logistic Regression after feature selection",l_regressor_2.score(X_train_selected,y_train)*100)


Logistic Regression after feature selection 84.39331507690733


As we can see we got a significant increase in performance in our model after performing feature selection