In [47]:
import numpy as np
import pandas as pd
import io
import requests
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import os
from pandas.api.types import CategoricalDtype


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion


In [1]:
def load_dataset(path, urls):
    if not os.path.exists(path):
        os.mkdir(path)

    for url in urls:
        data = requests.get(url).content
        filename = os.path.join(path, os.path.basename(url))
        with open(filename, "wb") as file:
            file.write(data)

In [11]:
urls = ["http://archive.ics.uci.edu/ml/machine-learning-  databases/adult/adult.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"]

load_dataset('Data', urls)

In [17]:
columns = ["age", "workClass", "fnlwgt", "education", "education-num","marital-status", "occupation", "relationship",
          "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

train_data = pd.read_csv('Data/adult.data', names=columns, 
             sep=' *, *', na_values='?', engine='python')
test_data  = pd.read_csv('Data/adult.test', names=columns, 
             sep=' *, *', skiprows=1, na_values='?',engine='python')

In [21]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workClass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    31978 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [22]:
num_attributes = train_data.select_dtypes(include=['int'])
print(num_attributes.columns)

Index([u'age', u'fnlwgt', u'education-num', u'capital-gain', u'capital-loss',
       u'hours-per-week'],
      dtype='object')


In [26]:
num_attributes.hist(figsize=(10,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f0a37738810>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f0a376a7fd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f0a38a81050>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f0a376536d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f0a3753e210>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f0a3756cfd0>]],
      dtype=object)

In [27]:
train_data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [29]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
    def __init__(self, type):
        self.type = type
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        return X.select_dtypes(include=[self.type])

In [36]:
num_pipeline = Pipeline(steps=[
    ("num_attr_selector", ColumnsSelector(type='int')),
    ("scaler", StandardScaler())
])

In [39]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns = None, strategy='most_frequent'):
        self.columns = columns
        self.strategy = strategy
    def fit(self,X, y=None):
        if self.columns is None:
            self.columns = X.columns
        if self.strategy is 'most_frequent':
            self.fill = {column: X[column].value_counts().index[0] for column in self.columns}
        else:
            self.fill ={column: '0' for column in self.columns}
        return self
    def transform(self,X):
        X_copy = X.copy()
        for column in self.columns:
             X_copy[column] = X_copy[column].fillna(self.fill[column])
        return X_copy

In [43]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
      def __init__(self, dropFirst=True):
        self.categories=dict()
        self.dropFirst=dropFirst
    
      def fit(self, X, y=None):
        join_df = pd.concat([train_data, test_data])
        join_df = join_df.select_dtypes(include=['object'])
        for column in join_df.columns:
          self.categories[column] = join_df[column].value_counts().index.tolist()
        return self
    
      def transform(self, X):
        X_copy = X.copy()
        X_copy = X_copy.select_dtypes(include=['object'])
        for column in X_copy.columns:
          X_copy[column] = X_copy[column].astype({column:CategoricalDtype(self.categories[column])})
        return pd.get_dummies(X_copy, drop_first=self.dropFirst)

In [50]:
cat_pipeline = Pipeline(steps=[
    ("cat_attr_selector", ColumnsSelector(type='object')),
    ("cat_imputer", CategoricalImputer(columns=
          ['workClass','occupation', 'native-country'])),
    ("encoder", CategoricalEncoder(dropFirst=True))
])

In [51]:
full_pipeline = FeatureUnion([("num_pipe", num_pipeline), 
                ("cat_pipeline", cat_pipeline)])

In [55]:
train_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)
test_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)

ValueError: labels ['fnlwgt' 'education'] not contained in axis

In [58]:
train_copy = train_data.copy()
train_copy["income"] = train_copy["income"].apply(lambda x:0 if 
                        x=='<=50K' else 1)

X_train = train_copy.drop('income', axis =1)
Y_train = train_copy['income']
print X_train

       age         workClass  education-num         marital-status  \
0       39         State-gov             13          Never-married   
1       50  Self-emp-not-inc             13     Married-civ-spouse   
2       38           Private              9               Divorced   
3       53           Private              7     Married-civ-spouse   
4       28           Private             13     Married-civ-spouse   
5       37           Private             14     Married-civ-spouse   
6       49           Private              5  Married-spouse-absent   
7       52  Self-emp-not-inc              9     Married-civ-spouse   
8       31           Private             14          Never-married   
9       42           Private             13     Married-civ-spouse   
10      37           Private             10     Married-civ-spouse   
11      30         State-gov             13     Married-civ-spouse   
12      23           Private             13          Never-married   
13      32          