In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import RobustScaler

from helper import load_data

In [2]:
data_path = "./data/"
path_train = os.path.join(data_path, "train.csv")
path_test = os.path.join(data_path, "test.csv")

In [3]:
data_train, data_test = load_data(path_train, path_test)

In [36]:
pd.get_dummies(data_train[["native_country"]]).shape

(32561, 42)

In [35]:
pd.get_dummies(data_test[["native_country"]]).shape

(16281, 41)

In [37]:
class DataPreprocessor:
    def __init__(self):
        self.train_mean = None
        self.train_std = None
        self.num_cols = ["age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week"]
        self.cat_cols = ["workclass", "education_num", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
        self.kept_cat_cols = ["workclass", "relationship", "sex"]
        self.kept_order_cat_cols = "education_num"
        self.kept_num_cols = ["age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week"]
        self.all_native_countries = None
        #used for remove less frequent categorical columns
        self.keep_cols = None
        self.drop_category = ["workclass_Never-worked"]
        self.robust_scaler = None

    def _transform_label(self, data_train):
        label_dict = {'<=50K': 0, '>50K': 1}
        data_train_ = data_train.copy()
        data_train_["income"] = data_train_["income"].apply(lambda x: label_dict[x])
        return data_train_ 
        
    def _do_one_hot_encoding(self, data_cat: pd.DataFrame, isTraining = False):
        if isTraining:
            self.all_native_countries = data_cat["native_country"].value_counts().index.sort_values().to_list()
        else:
            #fix missing columns in testing dataset
            data_cat["native_country"] = data_cat["native_country"].astype(pd.CategoricalDtype(categories=self.all_native_countries))
        data_one_hot = pd.get_dummies(data_cat)
        # if not isTraining:
        #     data_one_hot = data_one_hot.drop(self.drop_category, axis = 1)
        return data_one_hot
        
    def _normalize_data(self, X_data: pd.DataFrame, isTraining = False):
        if isTraining:
            self.train_mean = X_data.mean(axis = 0)
            self.train_std = X_data.std(axis = 0)
        normalized_data = (X_data - self.train_mean) / self.train_std
        return normalized_data

    def _remove_less_frequent_cat_features(self, X_data_cat: pd.DataFrame, lower_bound_ratio = 0.5, isTraining = False):
        if isTraining:
            cols = X_data_cat.columns
            keep_cols_bool = (X_data_cat.sum(axis = 0) / X_data_cat.shape[0]) > lower_bound_ratio
            self.keep_cols = cols[keep_cols_bool]
        return X_data_cat[self.keep_cols]

    def select_special_column(self, data: pd.DataFrame, isTraining = False):
        """
        Special column
        - Education_num: 1
        - workclass: never-worked
        """
        special_index = []
        cond1 = (data["education_num"] == 1)
        cond2 = (data["workclass"] == "Never-worked")
        #data_kept = data[~cond1 & ~cond2]
        if not isTraining:
            #Keep these rows in testing dataset
            #Manually label them after
            data_kept = data
            special_index = data[cond1 | cond2].index.tolist()
        return data_kept, special_index

    def keep_num_columns(self, data_num: pd.DataFrame):
        return data_num[self.kept_num_cols]

    def keep_cat_columns(self, data_cat: pd.DataFrame):
        return data_cat[self.kept_cat_cols]

    def keep_order_cat_columns(self, data: pd.DataFrame):
        return data[self.kept_order_cat_cols]

    def robust_scaling(self, data_num: pd.DataFrame, isTraining = False):
        if isTraining:
            self.robust_scaler = RobustScaler()
            self.robust_scaler.fit(data_num)
        data_num_scaled = self.robust_scaler.transform(data_num)
        return data_num_scaled

    def preprocess_train_data(self, data_train: pd.DataFrame):
        #avoid changes in original dataset
        data_train_ = self._transform_label(data_train)
        #data_train_, _ = self.select_special_column(data_train_, isTraining=True)

        #preprocessing - numerical 
        #data_train_num = self._normalize_data(data_train_num, isTraining=True)
        #data_train_num = self.keep_num_columns(data_train_)
        data_train_num = data_train_[self.num_cols]
        #data_train_num = self.robust_scaling(data_train_num, isTraining=True) #return np.array

        #preprocessing - categorical (16 features) / order categorical (1 features)
        #data_train_order_cat = self.keep_order_cat_columns(data_train_)
        #data_train_cat = self.keep_cat_columns(data_train_)
        data_train_cat = data_train_[self.cat_cols]
        data_train_cat = self._do_one_hot_encoding(data_train_cat, isTraining=True)
        print(data_train_cat.shape)
        #data_train_cat = self._remove_less_frequent_cat_features(data_train_cat, isTraining=True)

        #combine
        # data_train_cat = np.array(data_train_cat)
        # data_train_order_cat = np.array(data_train_order_cat)
        # print(data_train_cat.shape)
        # print(data_train_order_cat.shape)
        #data_train_preprocessed = np.concatenate([data_train_num, data_train_order_cat.reshape(-1, 1), data_train_cat], axis = 1)
        data_train_preprocessed = pd.concat([data_train_num, data_train_cat], axis = 1)
        X_train = np.array(data_train_preprocessed)

        #process y
        y_train = np.array(data_train_["income"])

        return X_train, y_train

    def preprocess_test_data(self, data_test: pd.DataFrame):
        #avoid changes in original dataset
        data_test_ = data_test.copy()
        #data_test_, special_index = self.select_special_column(data_test, isTraining=False)

        #preprocessing - numerical
        data_test_num = data_test_[self.num_cols]
        #data_test_num = self._normalize_data(data_test_num, isTraining=False)
        #data_test_num = self.keep_num_columns(data_test_)
        #data_test_num = self.robust_scaling(data_test_num, isTraining=False) #return np.array

        #preprocessing - categorical
        data_test_cat = data_test_[self.cat_cols]
        # data_test_cat = self.keep_cat_columns(data_test_)
        data_test_cat = self._do_one_hot_encoding(data_test_cat, isTraining=False)
        print(data_test_cat.shape)
        # data_test_order_cat = self.keep_order_cat_columns(data_test_)
        #data_test_cat = self._remove_less_frequent_cat_features(data_test_cat, isTraining=False)

        #combine        
        # data_test_cat = np.array(data_test_cat)
        # data_test_order_cat = np.array(data_test_order_cat)
        # print(data_test_cat.shape)
        # print(data_test_order_cat.shape)
        #data_test_preprocessed = np.concatenate([data_test_num, data_test_order_cat.reshape(-1, 1), data_test_cat], axis = 1)
        data_test_preprocessed = pd.concat([data_test_num, data_test_cat], axis = 1)
        X_test = np.array(data_test_preprocessed)
        return X_test, special_index


In [38]:
DP = DataPreprocessor()
X_train, y_train = DP.preprocess_train_data(data_train)
X_test, special_index = DP.preprocess_test_data(data_test)

(32561, 103)
(16281, 103)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
data_test

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [10]:
print(X_train.shape)
print(X_test.shape)

(32503, 106)
(16281, 107)


In [30]:
data_test.shape

(16281, 14)

In [29]:
X_test.shape

(16246, 22)

In [32]:
cond1 = (data_test["education_num"] == 1)
cond2 = (data_test["workclass"] == "Never-worked")
data_test[cond1 | cond2].shape


(35, 14)