## Import data

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Column names applied to both CSV files, in file order.
columns = [
    'Age',
    'Workclass',
    'fnlgwt',
    'Education',
    'Education num',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital Gain',
    'Capital Loss',
    'Hours/Week',
    'Native country',
    'Income',
]

# NOTE(review): the first line of the test file is skipped (skiprows=1) --
# presumably a non-data line at the top of that file; confirm.
train = pd.read_csv('../input/adult-training.csv', names=columns)
test = pd.read_csv('../input/adult-test.csv', names=columns, skiprows=1)

%matplotlib inline

In [None]:
# Preview the first rows of the training set.
train.head()

More information about dataset (including what fnlgwt is): [archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names](http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)

In [None]:
# Column dtypes and non-null counts for the training set.
train.info()

In [None]:
# Column dtypes and non-null counts for the test set.
test.info()

## Cleaning data
Some cells contain ' ?' (a question mark with a leading space) instead of a value; we convert them to NaN.

In [None]:
# Turn the ' ?' placeholder (note the leading space) into NaN in both frames.
train = train.replace(' ?', np.nan)
test = test.replace(' ?', np.nan)

In [None]:
# Number of missing values per column in the training set.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test set.
test.isnull().sum()

In [None]:
# NOTE(review): this cell repeats the previous cell's expression verbatim
# (missing values per column in the test set) -- likely a leftover duplicate.
test.isnull().sum()

As we can see, only the Workclass, Occupation and Native country features have missing values.

# Feature engineering

## Income

Simply change Income into 0's and 1's

In [None]:
# Binary target: 1 for the ">50K" label, 0 for anything else.
# The label in the test frame is compared with a trailing dot (' >50K.').
train['Income'] = (train['Income'] == ' >50K').astype('int64')
test['Income'] = (test['Income'] == ' >50K.').astype('int64')

## Age

In [None]:
# Histogram of Age in the training set (trailing ';' hides the text repr).
plt.hist(train['Age']);

Age looks skewed, so it needs to be standardized. This will be done later with sklearn.preprocessing.StandardScaler().

## Workclass

There are many missing values. Let's replace them with a placeholder category, ' 0', and check what the plot looks like.

In [None]:
# Fill missing Workclass values with the placeholder category ' 0'.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.2 and leaves
# the DataFrame unchanged once Copy-on-Write is enabled.
train['Workclass'] = train['Workclass'].fillna(' 0')
test['Workclass'] = test['Workclass'].fillna(' 0')

In [None]:
# Mean Income per Workclass category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Workclass", y="Income", data=train, kind="bar", height=6,
            palette="muted")
plt.xticks(rotation=45);

In [None]:
# Number of training rows per Workclass category.
train['Workclass'].value_counts()

As Never-worked and Without-pay look very similar, we merge them.

In [None]:
# Merge ' Without-pay' into ' Never-worked'.
# Assign the result back instead of using replace(..., inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.2 and leaves
# the DataFrame unchanged once Copy-on-Write is enabled.
train['Workclass'] = train['Workclass'].replace(' Without-pay', ' Never-worked')
test['Workclass'] = test['Workclass'].replace(' Without-pay', ' Never-worked')

## fnlgwt

In [None]:
# Summary statistics of the fnlgwt column in the training set.
train['fnlgwt'].describe()

The fnlgwt feature has large values and a large standard deviation, so let's take its logarithm.

In [None]:
# log(1 + x) transform of fnlgwt, applied to the whole column at once.
# NOTE: this overwrites the column, so re-running the cell applies the
# transform a second time.
train['fnlgwt'] = np.log1p(train['fnlgwt'])
test['fnlgwt'] = np.log1p(test['fnlgwt'])

In [None]:
# Summary statistics of the fnlgwt column in the training set.
train['fnlgwt'].describe()

## Education

In [None]:
# Mean Income per Education category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Education", y="Income", data=train, kind="bar", height=7,
            palette="muted")
plt.xticks(rotation=60);

Primary education is divided into grades, and they all give almost the same result. We can merge them into one category - Primary.

In [None]:
def primary(x):
    """Collapse the individual school-grade labels into the single label ' Primary'.

    Labels are matched exactly, including their leading space. Any value that
    is not one of the listed grade labels is returned unchanged.
    """
    grade_labels = (' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', ' 12th')
    return ' Primary' if x in grade_labels else x

In [None]:
# Merge the grade-level Education labels element-wise via primary().
train['Education'] = train['Education'].map(primary)
test['Education'] = test['Education'].map(primary)

In [None]:
# Mean Income per Education category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Education", y="Income", data=train, kind="bar", height=6,
            palette="muted")
plt.xticks(rotation=60);

## Education num

In [None]:
# Mean Income per 'Education num' value as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Education num", y="Income", data=train, kind="bar", height=6,
            palette="muted")
plt.xticks(rotation=60);

## Marital Status

In [None]:
# Mean Income per Marital Status category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Marital Status", y="Income", data=train, kind="bar", height=5,
            palette="muted")
plt.xticks(rotation=60);

In [None]:
# Number of training rows per Marital Status category.
train['Marital Status'].value_counts()

There are very few Married-AF-spouse rows. They are similar to Married-civ-spouse, so we can merge the two categories.

In [None]:
# Merge ' Married-AF-spouse' into ' Married-civ-spouse'.
# Assign the result back instead of using replace(..., inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.2 and leaves
# the DataFrame unchanged once Copy-on-Write is enabled.
train['Marital Status'] = train['Marital Status'].replace(' Married-AF-spouse', ' Married-civ-spouse')
test['Marital Status'] = test['Marital Status'].replace(' Married-AF-spouse', ' Married-civ-spouse')

In [None]:
# Mean Income per Marital Status category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Marital Status", y="Income", data=train, kind="bar", height=6,
            palette="muted")
plt.xticks(rotation=60);

## Occupation

In [None]:
# Fill missing Occupation values with the placeholder category ' 0'.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.2 and leaves
# the DataFrame unchanged once Copy-on-Write is enabled.
train['Occupation'] = train['Occupation'].fillna(' 0')
test['Occupation'] = test['Occupation'].fillna(' 0')

In [None]:
# Mean Income per Occupation category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Occupation", y="Income", data=train, kind="bar", height=8,
            palette="muted")
plt.xticks(rotation=60);

In [None]:
# Number of training rows per Occupation category.
train['Occupation'].value_counts()

Everything looks good, except Armed-Forces. They are similar to 0 and that's what we replace them with.

In [None]:
# Fold ' Armed-Forces' into the ' 0' placeholder category.
# Assign the result back instead of using replace(..., inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.2 and leaves
# the DataFrame unchanged once Copy-on-Write is enabled.
train['Occupation'] = train['Occupation'].replace(' Armed-Forces', ' 0')
test['Occupation'] = test['Occupation'].replace(' Armed-Forces', ' 0')

In [None]:
# Mean Income per Occupation category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Occupation", y="Income", data=train, kind="bar", height=8,
            palette="muted")
plt.xticks(rotation=60);

## Relationship

In [None]:
# Mean Income per Relationship category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Relationship", y="Income", data=train, kind="bar", height=6,
            palette="muted")
plt.xticks(rotation=60);

In [None]:
# Number of training rows per Relationship category.
train['Relationship'].value_counts()

Looks good.

## Race

In [None]:
# Mean Income per Race category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Race", y="Income", data=train, kind="bar", height=6,
            palette="muted")
plt.xticks(rotation=45);

In [None]:
# Number of training rows per Race category.
train['Race'].value_counts()

Nothing to change.

## Sex

In [None]:
# Mean Income per Sex category as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Sex", y="Income", data=train, kind="bar", height=4,
            palette="muted");

Here neither.

## Capital Gain , Capital Loss,  Hours/Week

These features just need to be standardized.

## Native country 

In [None]:
# Fill missing 'Native country' values with the placeholder category ' 0'.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.2 and leaves
# the DataFrame unchanged once Copy-on-Write is enabled.
train['Native country'] = train['Native country'].fillna(' 0')
test['Native country'] = test['Native country'].fillna(' 0')

In [None]:
# Mean Income per 'Native country' value as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Native country", y="Income", data=train, kind="bar", height=10,
            palette="muted")
plt.xticks(rotation=80);

We need to segregate these countries into a few categories.

In [None]:
def native(country):
    """Map a raw 'Native country' value to one of a few coarse group labels.

    Values are matched exactly, including their leading space. The groups are
    checked in the order listed below; a value that belongs to none of them is
    returned unchanged.

    NOTE(review): the group names are labels only. Membership (for example
    ' Philippines' under 'Western') presumably follows the income plot rather
    than geography -- confirm before relying on the names.
    """
    groups = (
        ('US', (' United-States', ' Cuba', ' 0')),
        ('Western', (' England', ' Germany', ' Canada', ' Italy', ' France',
                     ' Greece', ' Philippines')),
        # no offence
        ('Poor', (' Mexico', ' Puerto-Rico', ' Honduras', ' Jamaica', ' Columbia',
                  ' Laos', ' Portugal', ' Haiti', ' Dominican-Republic',
                  ' El-Salvador', ' Guatemala', ' Peru', ' Trinadad&Tobago',
                  ' Outlying-US(Guam-USVI-etc)', ' Nicaragua', ' Vietnam',
                  ' Holand-Netherlands')),
        ('Eastern', (' India', ' Iran', ' Cambodia', ' Taiwan', ' Japan',
                     ' Yugoslavia', ' China', ' Hong')),
        ('Poland team', (' South', ' Poland', ' Ireland', ' Hungary', ' Scotland',
                         ' Thailand', ' Ecuador')),
    )
    for label, members in groups:
        if country in members:
            return label
    return country

In [None]:
# Replace each 'Native country' value with its group label via native().
train['Native country'] = train['Native country'].map(native)
test['Native country'] = test['Native country'].map(native)

In [None]:
# Number of training rows per 'Native country' value.
train['Native country'].value_counts()

In [None]:
# Mean Income per 'Native country' value as a bar plot.
# factorplot/size were renamed to catplot/height in seaborn 0.9; the old
# names are no longer available in current seaborn releases.
sns.catplot(x="Native country", y="Income", data=train, kind="bar", height=5,
            palette="muted")
plt.xticks(rotation=60);

# One-hot encoding

Now we need to encode categorical features, we are going to do it with pd.get_dummies(). As this method may cause some problems, we merge datasets. It ensures that dimensions for both datasets are equal and also that given feature corresponds to the same dimension in both train and test datasets.

First, we generate the output files: the cleaned test and training sets are written to CSV.

In [None]:
# Rebuild each frame column by column, in the order given by `columns`
# (the list passed to read_csv), and write it to CSV without the index.
test_data = pd.DataFrame({name: test[name] for name in columns})
test_data.to_csv('test.csv', index = False)


trainning_data = pd.DataFrame({name: train[name] for name in columns})
trainning_data.to_csv('trainning.csv', index = False)
