# Project 1 | **EDA**

In [95]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

## Import data

In [96]:
# Import adult data
data = pd.read_csv("raw_data/project_adult.csv", index_col=0)

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
5514,33,Local-gov,198183,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,>50K
19777,36,Private,86459,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,50,United-States,>50K
10781,58,Self-emp-not-inc,203039,9th,5,Separated,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
32240,21,Private,180190,Assoc-voc,11,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,46,United-States,<=50K
9876,27,Private,279872,Some-college,10,Divorced,Other-service,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [97]:
# Import validation data
validation_data = pd.read_csv("raw_data/project_validation_inputs.csv", index_col=0)

validation_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
14160,27,Private,160178,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,38,United-States
27048,45,State-gov,50567,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
28868,29,Private,185908,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,55,United-States
5667,30,Private,190040,Bachelors,13,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States
7827,29,Self-emp-not-inc,189346,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,2202,0,50,United-States


In [98]:
# Summarize numerical variables
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,26048.0,26048.0,26048.0,26048.0,26048.0,26048.0
mean,38.576513,189759.2,10.080544,1053.617437,87.988828,40.388552
std,13.642904,105232.9,2.575202,7230.800656,403.629787,12.331139
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117963.0,9.0,0.0,0.0,40.0
50%,37.0,178263.5,10.0,0.0,0.0,40.0
75%,48.0,237006.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


## Missing values

### Missing values in the adult data

In [99]:
# Count NULLs
print(data.isnull().sum(), '\n')

# Count NAs
print(data.isna().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64 

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


### Missing values in the validation data

In [100]:
# Count NULLs
print(validation_data.isnull().sum(), '\n')

# Count NAs
print(validation_data.isna().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64 

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64


## Data preprocessing

**Numerical variables**:

- `age`
- `fnlwgt`
- `education-num`
- `capital-gain`
- `capital-loss`
- `hours-per-week`

**Categorical variables**:

- `workclass`
- `marital-status`
- `education`
- `occupation`
- `relationship`
- `race`
- `sex`
- `native-country`

In [101]:
# Define a function that processes our data
def processing(data, 
               numerical_cols = numerical_cols, 
               categorical_cols = categorical_cols) :
    # Create a new dataframe for storing the processed data
    processed_data = data.copy()

    # Drop the education column, since it already has a numerical analog
    processed_data = processed_data.drop('education', axis=1)

    # Encode categorical values
    le = LabelEncoder()
    for i in categorical_cols :
        processed_data[i] = le.fit_transform(data[i])

    # Standardize numerical values
    scaler = StandardScaler()
    processed_data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    return processed_data

In [102]:
# Define categorical columns for training data
categorical_cols = ['workclass', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'native-country',
                    'income']

# Define numerical columns
numerical_cols = ['age', 'fnlwgt', 'education-num', 
                  'capital-gain', 'capital-loss', 'hours-per-week']

# Process the training data
processed_adult = processing(data = data)

processed_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
5514,-0.408756,2,0.080051,1.133702,4,10,1,4,0,-0.145715,-0.217998,0.77946,39,>50K
19777,-0.188857,4,-0.981653,0.357049,2,4,0,4,1,-0.145715,4.457168,0.77946,39,>50K
10781,1.423734,6,0.126197,-1.97291,5,3,1,4,1,-0.145715,-0.217998,-0.03151,39,<=50K
32240,-1.288351,4,-0.090935,0.357049,2,5,0,4,1,-0.145715,-0.217998,0.455072,39,<=50K
9876,-0.848554,4,0.856334,-0.031277,0,8,1,4,1,-0.145715,-0.217998,-0.03151,39,<=50K


In [103]:
# Redefine the categorical columns for the validation data
categorical_cols = ['workclass', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'native-country']

# Process the validation data
processed_validation = processing(data = validation_data)

processed_validation.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
14160,-0.851191,3,-0.277853,-0.031693,0,1,1,4,0,-0.147225,-0.211274,-0.212164,37
27048,0.469374,6,-1.30409,-0.421896,2,4,5,4,0,-0.147225,-0.211274,-0.051009,37
28868,-0.704461,3,-0.036955,1.138915,2,4,0,2,1,-0.147225,-0.211274,1.157652,37
5667,-0.631097,3,0.001731,1.138915,4,7,1,4,0,-0.147225,-0.211274,-0.051009,37
7827,-0.704461,5,-0.004766,-0.031693,0,3,1,4,1,0.128973,-0.211274,0.754765,37


In [105]:
# Output processed data
processed_adult.to_csv('processed_data/processed_project_adult.csv')
validation_data.to_csv('processed_data/processed_project_validation.csv')