In [1]:
# import required modules
import os
import time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, balanced_accuracy_score # balanced_accuracy_score with adjusted=True is Informedness
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

df_train = pd.read_csv('Datasets for Assignment 3/census-income.csv')
df_test = pd.read_csv('Datasets for Assignment 3/census-income-test.csv')


When to One-Hot Encode vs Label Encode?

```To prevent biases from being introduced, One-Hot Encoding is preferable for nominal data (where there is no inherent order among categories). Label encoding, however, might be more appropriate for ordinal data (where categories naturally have an order)```

So we should one-hot encode columns like class of worker, state of residence, etc. After reviewing the column descriptions I decided to one-hot encode all the following columns:

'ACLSWKR', 'ADTIND', 'ADTOCC', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR'

**https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/**



In [8]:
def Preprocessing(df_train, df_test, columns_to_one_hot_encode, columns_to_label_encode, columns_to_scale):

    def OneHotEncode (df, columns_to_one_hot_encode):
        from sklearn.preprocessing import OneHotEncoder
        onehotencode = OneHotEncoder()      

        for item in columns_to_one_hot_encode:
            df[item] = df[item].astype('category') # Must convert the strings to category numbers for One Hot to work
            df[item + '_new'] = df[item].cat.codes # Rob: Need to research this more
            # print(f" Column: {item}")
            # print(df[item + '_new'])

        OneHot_df = pd.DataFrame(onehotencode.fit_transform(df[columns_to_one_hot_encode]).toarray())

        PostOneHot_df = df.join(OneHot_df) # Appends the OneHot_df to the original dataframe to create a new one
        PostOneHot_df[:-100] # Check results from the One Hot Encoding
        PostOneHot_df = PostOneHot_df.drop(columns=columns_to_one_hot_encode)
        df = PostOneHot_df
        return df

    def StripSpaces (df):
        # I noticed some of the columns get imported with leading spaces. I want to strip() these right away
        for column in df.select_dtypes(include=object): # Only review the columns with a str datatype
            df[column] = df[column].apply(lambda x: x.strip())
        return df
    
    def PreLabelEncode(df):
        #   Before label encoding we want to apply some value judgements to the data to give the resulting labels some ranking
        #   education
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Children", "0"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Less than 1st grade", "1"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("1st 2nd 3rd or 4th grade", "2"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("5th or 6th grade", "3"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("7th and 8th grade", "4"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("9th grade", "5"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("10th grade", "6"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("11th grade", "7"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("12th grade no diploma", "8"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("High school graduate", "9"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Some college but no degree", "10"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Associates degree-occup /vocational", "11"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Associates degree-academic program", "12"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Bachelors degree(BA AB BS)", "13"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Masters degree(MA MS MEng MEd MSW MBA)", "14"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Doctorate degree(PhD EdD)", "15")) # Sorry Dr. Feuz, but the professional doctorates have you beat in earning potential
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Prof school degree (MD DDS DVM LLB JD)", "16"))

        #   enrolled in edu inst last wk
        df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("Not in universe", "0"))
        df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("High school", "1"))
        df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("College or university", "2"))

        #   live in this house 1 year ago
        df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("Not in universe under 1 year old", "0"))
        df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("No", "1"))
        df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("Yes", "2"))

        #   migration prev res in sunbelt
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("?", "0"))
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("Not in universe", "1"))
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("No", "1"))
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("Yes", "2"))

        #   fill inc questionnaire for veteran's admin
        df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("Not in universe", "0"))
        df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("No", "1"))
        df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("Yes", "2"))

        return df

    def LabelEncode(df, columns_to_label_encode):
        from sklearn.preprocessing import LabelEncoder
        LabelEncode = LabelEncoder()

        for item in columns_to_label_encode:
            df[item]= LabelEncode.fit_transform(df[item])
            print(f"Post Label Encoding for {item}: {df[item].unique()}")

        return df
    
    def StandardScale(df, columns_to_scale):
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

        scaled_columns = scaler.fit_transform(df[columns_to_scale])
        df[columns_to_scale] = scaled_columns

        return df
    
    df_train = StripSpaces(df_train)
    df_train = OneHotEncode(df_train, columns_to_one_hot_encode)
    df_train = PreLabelEncode(df_train)
    df_train = LabelEncode(df_train, columns_to_label_encode)
    df_train = StandardScale(df_train, columns_to_scale)

    print(f"df_test before processing: {df_test.info()}")    
    df_test = StripSpaces(df_test)
    df_test = OneHotEncode(df_test, columns_to_one_hot_encode)
    df_test = PreLabelEncode(df_test)
    df_test = LabelEncode(df_test, columns_to_label_encode)
    df_test = StandardScale(df_test, columns_to_scale)    
    print(f"df_test after processing: {df_test.info()}")    
    
    return(df_train, df_test)


Now let's label encode some of the columns, but first let's update the columns so they have an inherent rank order

Reference: https://www.geeksforgeeks.org/ml-label-encoding-of-datasets-in-python


In [9]:
df_test.values

array([[ 0.15095836,  9.        , -0.20155813, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.41961503,  3.        , -0.20155813, ...,  1.        ,
         0.        ,  0.        ],
       [-1.46098168,  0.        , -0.20155813, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.47590721, 11.        , -0.20155813, ...,  1.        ,
         0.        ,  0.        ],
       [-0.20725054,  5.        , -0.20155813, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.44946561, 12.        , -0.20155813, ...,  1.        ,
         0.        ,  0.        ]])

In [10]:
# This cell helps view the values we want to label encode
df_test['VETQVA'].unique().tolist()

[0, 1, 2]

In [11]:
columns_to_one_hot_encode = ['ACLSWKR', 'ADTIND', 'ADTOCC', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR']
columns_to_label_encode = ['AHGA','AHSCOL','MIGSAME','MIGSUN','VETQVA',]
columns_to_scale = ['AAGE','AHRSPAY','CAPGAIN','CAPLOSS','DIVVAL','NOEMP','WKSWORK',]

df_train, df_test = Preprocessing(df_train, df_test, columns_to_one_hot_encode, columns_to_label_encode, columns_to_scale)

KeyError: 'ACLSWKR'

Here is a Kaggle page showing the best categorical classifiers for a given data set:
* https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn
* Comment about grid search: https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn/comments#135499



In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99762 entries, 0 to 99761
Columns: 506 entries, AAGE to 466
dtypes: float64(474), int64(6), int8(26)
memory usage: 367.8 MB


In [14]:
# df_train.info()

X_train = df_train.drop(columns='CLASS').values # Include ALL columns except CLASS
y_train = df_train['CLASS'].values # Only include Class

# Would normally run the following line, but CLASS isn't in the test data
# X_test = df_test.drop(columns='CLASS').values # Include ALL columns except CLASS
X_test = df_test.values # Include ALL columns except CLASS

In [16]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

In [None]:
df[:-100]

Would be good to write code that would loop through all the columns and print out the uniques to add decisions about one-hot vs label encoding vs scaling:

So that takes one column and generates 9 columns