# Census Income
---

#### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

ModuleNotFoundError: No module named 'sklearn'

#### Import the data file

In [None]:
# Load the census-income (Adult) CSV from a path relative to the notebook,
# then preview the first rows.
raw_df = pd.read_csv('files/datasets/adult.csv')
raw_df.head()

Because the CSV file from Kaggle uses `?` to mark missing values, we replace those question marks with `NaN`.

In [None]:
# Turn the '?' placeholders into real missing values. The result is rebound
# rather than using inplace=True, so the cell is a plain, chainable assignment.
raw_df = raw_df.replace(to_replace='?', value=np.nan)

In [None]:
# Encode the target as 0/1. The nested mapping restricts the replacement to
# the `income` column, so identical strings in any other column are left alone.
raw_df = raw_df.replace({'income': {'<=50K': 0, '>50K': 1}})

#### View information about the dataframe

In [None]:
# Print column dtypes, non-null counts and memory usage.
raw_df.info()

#### Split the data into target and features

In [None]:
def xy_split(dataframe, y_column):
    """Separate a dataframe into a feature frame and a target series.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both the features and the target.
    y_column : str
        Label of the target column.

    Returns
    -------
    tuple
        ``(features, target)``: every column except ``y_column`` as a
        DataFrame, and ``y_column`` itself as a Series.
    """
    is_feature = dataframe.columns != y_column
    return dataframe.loc[:, is_feature], dataframe[y_column]

# Features are every column except 'income'; 'income' is the target.
X, y = xy_split(raw_df, 'income')

##### Create training and testing sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### Create a working copy of the training features

In [None]:
# Work on a copy of the training features so changes made to `df` do not propagate to X_train.
df = X_train.copy()

---

# Data Preparation

#### Import necessary libraries

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames from transform / fit_transform.
set_config(transform_output="pandas")

#### Determine which columns have low cardinality

In [None]:
def ohe_eligibility(datafame, max_unique=10):
    """Print the columns whose number of distinct values is below a threshold.

    One line ``"<count> - <column>"`` is printed for every qualifying column,
    in column order. Missing values are not counted as a distinct value.

    Parameters
    ----------
    datafame : pandas.DataFrame
        Frame to inspect (the parameter name is kept as-is for compatibility).
    max_unique : int, default 10
        A column is reported when it has fewer than this many distinct values.

    Returns
    -------
    None
    """
    for column in datafame.columns:
        # nunique() ignores NaN, like value_counts(), and is computed only once.
        n_unique = datafame[column].nunique()
        if n_unique < max_unique:
            print(f'{n_unique} - {column}')
        
# List the low-cardinality columns of the working copy.
ohe_eligibility(df)

### Fixing/Encoding Marital Status column

In [None]:
# Category frequencies, with missing values counted as their own row.
df['marital.status'].value_counts(dropna=False)

In [None]:
# Bar chart of marital-status frequencies; tick labels are tilted so they stay readable.
sns.countplot(data=df, x='marital.status')
plt.xticks(rotation=45)
plt.show()

Reduce categories by merging different categories into one.

Single: **Never-married**

Break: **Divorced**, **Widowed**, **Separated**

Married: **Married-civ-spouse**, **Married-spouse-absent**, **Married-AF-spouse**

In [None]:
# Collapse the seven raw marital-status labels into three groups.
# Written as {group: [raw labels]} and then inverted into the {raw: group} lookup.
marital_status_groups = {
    "single": ["Never-married"],
    "break": ["Divorced", "Separated", "Widowed"],
    "married": ["Married-civ-spouse", "Married-spouse-absent", "Married-AF-spouse"],
}
replacements_dict = {
    raw_label: group
    for group, raw_labels in marital_status_groups.items()
    for raw_label in raw_labels
}

In [None]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# Define a custom function to recode the values of one dataframe column
def replace_values(dataframe, column, replacements_dict):
    """Return a copy of ``dataframe`` with the values of one column recoded.

    The input frame is left untouched, so the function is safe to wrap in a
    FunctionTransformer and to re-run; callers must use the returned frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Label of the column to recode.
    replacements_dict : dict
        Mapping ``{old_value: new_value}``; values that are not keys are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to the input except for the recoded column.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``dataframe``.
    """
    # Copy first: assigning into the caller's frame would silently change
    # every other variable that refers to the same object.
    recoded = dataframe.copy()
    recoded[column] = recoded[column].replace(replacements_dict)
    return recoded

# Wrap replace_values in a FunctionTransformer so the marital-status recoding
# can be used like any other scikit-learn transformer; kw_args are passed to
# replace_values on every call.
replace_transformer = FunctionTransformer(replace_values, kw_args={'column': 'marital.status', 'replacements_dict': replacements_dict})

# Apply the transformer to the working copy of the training features
df_new = replace_transformer.transform(df)

In [None]:
# Preview the recoded frame.
df_new.head()

In [None]:
# Category counts after recoding.
df_new['marital.status'].value_counts()

In [None]:
# One-hot encoder: dense output (needed for pandas output), and categories
# not seen during fit are ignored at transform time instead of raising.

ohe_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
# Column transformer that one-hot encodes only 'marital.status'.
ct = make_column_transformer(
    (ohe_enc, ['marital.status']))

In [None]:
# Fit the encoder on the recoded frame and encode it in one step.
ms_enc = ct.fit_transform(df_new)

In [None]:
# Preview the encoded columns.
ms_enc.head()

----

### Fixing Workclass column

In [None]:
# Category frequencies for workclass, including missing values.
df['workclass'].value_counts(dropna=False)

Let's merge the categorical values to reduce cardinality.

In [None]:
# Merge the raw workclass labels into four broader groups.
# Written as {group: [raw labels]} and then inverted into the {raw: group} lookup.
workclass_groups = {
    "P.Employed": ["Private", "Without-pay"],
    "S.Employed": ["Self-emp-not-inc", "Self-emp-inc"],
    "G.Employed": ["Local-gov", "State-gov", "Federal-gov"],
    "Unemployed": ["Never-worked"],
}
replacements_dict_wc = {
    raw_label: group
    for group, raw_labels in workclass_groups.items()
    for raw_label in raw_labels
}

Private          - P.Employed         <br>
Self-emp-not-inc - S.Employed      <br>
Local-gov        - G.Employed       <br>
NaN              - NaN                <br>
State-gov        - G.Employed       <br>
Self-emp-inc     - S.Employed      <br>
Federal-gov      - G.Employed       <br>
Without-pay      - P.Employed           <br>
Never-worked     - Unemployed         <br>

In [None]:
# Start from a copy of the full dataset (features and income together).
raw_df_copy = raw_df.copy()

# Recode workclass into the merged categories.
wc_df = replace_values(raw_df_copy, 'workclass', replacements_dict_wc)

In [None]:
# Preview the frame after the workclass recoding.
wc_df.head()

In [None]:
# Frequencies of the merged workclass categories, including missing values.
wc_df['workclass'].value_counts(dropna=False)

In [None]:
# Count of each income class within every workclass category
sns.countplot(data=wc_df, x='workclass', hue='income')

# Render the figure
plt.show()

In [None]:
# The same workclass recoding, packaged as a scikit-learn transformer.

replace_transformer_wc = FunctionTransformer(replace_values, kw_args={'column': 'workclass', 'replacements_dict': replacements_dict_wc})

### Fixing Relationship column (NA)

In [None]:
# Category frequencies for relationship (missing values are left out by default).
wc_df['relationship'].value_counts()

```
parent       - Husband        - husband in his own family    <br>
no_family    - Not-in-family  - no family    <br>
no_family    - Own-child      - child in their family    <br>
no_family    - Unmarried      - no family    <br>
parent       - Wife           - wife in her own family    <br>
no_family    - Other-relative - no family    <br>

```

In [None]:
# Reduce the six relationship labels to two groups.
# Written as {group: [raw labels]} and then inverted into the {raw: group} lookup.
relationship_groups = {
    "parent": ["Husband", "Wife"],
    "no_family": ["Not-in-family", "Own-child", "Unmarried", "Other-relative"],
}
replacements_dict_r = {
    raw_label: group
    for group, raw_labels in relationship_groups.items()
    for raw_label in raw_labels
}

In [None]:
# Recode relationship, starting from the workclass-recoded frame.
r_df = replace_values(wc_df, 'relationship', replacements_dict_r)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
r_df.head()

In [None]:
# The relationship recoding, packaged as a scikit-learn transformer.
replace_transformer_r = FunctionTransformer(replace_values, kw_args={'column': 'relationship', 'replacements_dict': replacements_dict_r})

---

### Apply function transformers as for loop

In [None]:
# (transformer, column, mapping) for every recoding step defined above.
transformers = [(replace_transformer_r, 'relationship', replacements_dict_r),
                (replace_transformer_wc, 'workclass', replacements_dict_wc),
                (replace_transformer, 'marital.status', replacements_dict)]

# Collect the mappings per column. A comprehension keeps its loop variables
# local, so the module-level `replacements_dict` is not rebound.
column_replacements = {column: mapping for _, column, mapping in transformers}


def replace_values_in_columns(dataframe, column_replacements):
    """Return a copy of ``dataframe`` with several columns recoded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to recode; it is not modified.
    column_replacements : dict
        ``{column: {old_value: new_value}}``. Columns that are absent from
        ``dataframe`` are skipped, so the same transformer can also be applied
        to a subset of the columns (e.g. inside a ColumnTransformer).

    Returns
    -------
    pandas.DataFrame
        A new frame with every listed column that is present recoded.
    """
    recoded = dataframe.copy()
    for column, mapping in column_replacements.items():
        if column in recoded.columns:
            recoded[column] = recoded[column].replace(mapping)
    return recoded


# A single transformer that performs all three recodings. A loop that rebuilds
# one FunctionTransformer per iteration keeps only the last one and applies none.
replace_transformer = FunctionTransformer(replace_values_in_columns, kw_args={'column_replacements': column_replacements})

In [None]:
# Data before replace_transformer is applied in the next cell

raw_df_copy.head()

In [None]:
# Run the recoding transformer over the copy of the full dataset.
df_transformed = replace_transformer.fit_transform(raw_df_copy)

In [None]:
# Data after the transformer has been applied

df_transformed.head()

### Create an initial pipeline

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Building blocks for the preprocessing step. ohe_enc and std_scl are used by
# the column transformer in the next cell; oe_enc is not referenced again in
# this part of the notebook.
ohe_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
std_scl = StandardScaler()
oe_enc = OrdinalEncoder(handle_unknown='error')

In [None]:
# Column groups, one per preprocessing treatment.
onehot_columns = ['marital.status', 'sex', 'relationship', 'race']
scaled_columns = ['age', 'fnlwgt']
untouched_columns = ['education.num', 'capital.gain', 'capital.loss', 'hours.per.week', 'workclass']

# One-hot encode the categoricals, standardise age/fnlwgt, and pass the rest
# (including the still-unencoded workclass) through unchanged.
count_process = ColumnTransformer([
    ('ohe', ohe_enc, onehot_columns),
    ('std-scale', std_scl, scaled_columns),
    ('passthrough', 'passthrough', untouched_columns),
])

In [None]:
# Fit the preprocessing on the recoded dataset and preview the encoded result.
dt_primer = count_process.fit_transform(df_transformed)
dt_primer.head()

In [None]:
# (rows, columns) of the encoded frame.
dt_primer.shape

In [None]:
# Missing-value counts for the last five columns of the encoded frame.
dt_primer.isnull().sum().tail()

#### Decision Tree as Imputer

In [None]:
def RandomForestImputer(dataframe, y_column):
    """Fill the missing values of one column with random-forest predictions.

    A RandomForestClassifier is fitted on the rows that have no missing value
    in any column, using every column except ``y_column`` as a feature, and is
    then used to predict ``y_column`` for the rows where it is missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose feature columns are already numeric/encoded.
    y_column : str
        Label of the column to impute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with the missing ``y_column`` values filled.
        The copy is returned unchanged when nothing is missing or when there
        is no complete row to learn from.

    Notes
    -----
    Rows that miss ``y_column`` are passed to ``predict`` as they are, so
    their feature columns are expected to be complete.
    """
    imputed = dataframe.copy()
    needs_value = imputed[y_column].isna()
    complete_rows = imputed.dropna()

    # Nothing to predict, or nothing to learn from: hand back the copy as is.
    if not needs_value.any() or complete_rows.empty:
        return imputed

    # Fixed seed so repeated runs fill in the same values.
    model = RandomForestClassifier(random_state=0)
    model.fit(complete_rows.drop(y_column, axis=1), complete_rows[y_column])

    features_to_fill = imputed.loc[needs_value].drop(y_column, axis=1)
    imputed.loc[needs_value, y_column] = model.predict(features_to_fill)
    return imputed

In [None]:
# Keep only the rows with no missing value in any column.
dt_train = dt_primer.dropna()

In [None]:
# Target: the passed-through workclass label. Features: every other encoded column.
y = dt_train['passthrough__workclass']
X = dt_train.drop(columns='passthrough__workclass')

In [None]:
# Split the workclass data into train and test sets (33% held out).
# NOTE: this rebinds X_train / X_test / y_train / y_test, which earlier held the income split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
# Train a RandomForestClassifier to predict workclass; the fixed seed makes
# the fitted model, and therefore the accuracy reported below, repeatable.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Predict workclass for the held-out test rows.
y_pred = clf.predict(X_test)

In [None]:
# Predict workclass for every row of the encoded frame, using all columns but the target.
impute_features = dt_primer.drop(columns='passthrough__workclass')
y_pred_impute = clf.predict(impute_features)

In [None]:
# Show the predictions as a one-column frame.
pd.DataFrame(y_pred_impute)

In [None]:
# Evaluate the model: mean accuracy of the workclass predictions on the held-out rows.
print("Accuracy:", clf.score(X_test, y_test))

---

A `DecisionTreeClassifier` was tried first as the imputer and its accuracy was good, but replacing it with a `RandomForestClassifier` performed much better.

In [None]:
# Marital-status frequencies in the copy of the full dataset.
raw_df_copy['marital.status'].value_counts()

In [None]:
# Determine how relationship and income are nested
sns.countplot(x = 'relationship', 
              hue = 'income', 
              data = wc_df)
 
# Show the plot
plt.show()

### Fixing Occupation column (NA)

Let's try to impute this column from the other columns: we will find out whether `occupation` can be imputed from sex, race, education and workclass.

In [None]:
# Occupation frequencies, including missing values.
wc_df['occupation'].value_counts(dropna=False)

In [None]:
# Select high-school graduates by their education label. Filtering on the
# label avoids depending on the numeric code: in the Adult data HS-grad is
# education.num 9, while 10 is Some-college.
hs_grads = wc_df.loc[wc_df['education'] == 'HS-grad']

In [None]:
# Rows of the subset whose occupation is missing.
hs_grads.loc[hs_grads['occupation'].isna()]

In [None]:
# Occupation frequencies within the subset (missing values excluded).
hs_grads['occupation'].value_counts()

### Fixing Race column (NA)

In [None]:
# Race frequencies, including missing values.
wc_df['race'].value_counts(dropna=False)

In [None]:
# Determine how race and income are nested
sns.countplot(x = 'race', 
              hue = 'income', 
              data = wc_df)
 
# Show the plot
plt.show()

In [None]:
# Apply KNNImputing or MICE

In [None]:
# Relationship frequencies in the working copy, including missing values.
df['relationship'].value_counts(dropna=False)

What does this have to do with anything?!

#### Explore KNNImputing

In [None]:
# Pipeline for marital status: recode with replace_transformer, then one-hot encode.
ohe_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ms_pipe = make_pipeline(replace_transformer, ohe_enc)

In [None]:
# One-hot encode relationship and sex directly; marital.status goes through
# the recode-then-encode pipeline.
count_process = ColumnTransformer([ 
    ('ohe', ohe_enc, ['relationship', 'sex']), 
    ('repl_tf', ms_pipe, ['marital.status'])
])

In [None]:
# Fit on the working copy and display the encoded result.
count_process.fit_transform(df)

### Next Task 

* How to apply FunctionTransformer inside a pipeline

* [ ] Age
* [ ] workclass
* [ ] fnlwgt
* [ ] education
* [ ] education.num
* [X] marital.status
* [ ] occupation
* [X] relationship
* [ ] race
* [X] sex
* [ ] occupation
* [ ] capital.gain
* [ ] capital.loss
* [ ] hours.per.week
* [ ] native.country
* [ ] income

---

In [None]:
# Categorical pipeline: fill missing values with the most frequent category, then one-hot encode.
freq_imp = SimpleImputer(strategy="most_frequent")
ohe_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipeline = make_pipeline(freq_imp, ohe_enc)

In [None]:
from category_encoders import CountEncoder

# Count encoder from category_encoders, created with default settings.
cnt_encoder = CountEncoder()

# Pipeline: impute the most frequent category, then count-encode.
# ohe_enc is re-created here with the same arguments as in the previous cell.
ohe_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cardinality_pipeline = make_pipeline(freq_imp, cnt_encoder)

In [None]:
# Column labels of the working copy.
df.columns

In [None]:
# Ordinal encoder with default settings; not referenced again in the cells shown here.
oe = OrdinalEncoder()

In [None]:
# Count-encode relationship and marital.status (after imputation); one-hot encode sex.
count_process = ColumnTransformer([
    ('count', cardinality_pipeline, ['relationship', 'marital.status']),
    ('ohe', ohe_enc, ['sex'])
])

In [None]:
# Number of missing workclass values in the working copy.
df['workclass'].isnull().sum()

In [None]:
# Fit on the working copy and display the encoded result.
count_process.fit_transform(df)

----

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

---

---