In [1]:
# import required modules
import os
import time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, balanced_accuracy_score # balanced_accuracy_score with adjusted=True is Informedness
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

df = pd.read_csv('Datasets for Assignment 3/census-income.csv')


In [2]:
def preprocessing(df):
    # I noticed some of the columns get imported with leading spaces. We want to eliminate these right away
    for column in df.select_dtypes(include=object): # Only review the columns with a str datatype
        df[column] = df[column].apply(lambda x: x.strip())
    return df

df = preprocessing(df)

When to One-Hot Encode vs Label Encode?

```To prevent biases from being introduced, One-Hot Encoding is preferable for nominal data (where there is no inherent order among categories). Label encoding, however, might be more appropriate for ordinal data (where categories naturally have an order)```

So we should one-hot encode columns like class of worker, state of residence, etc. After reviewing the colums I decided to one-hot encode all the following columns:

'ACLSWKR', 'ADTIND', 'ADTOCC', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR'

When I tried to run this I got:
Expected 2D array, got 1D array instead:

**https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/**



In [None]:
# To Do: Move preprocessing code into methods so the same code can run on both the train and test sets

In [3]:
from sklearn.preprocessing import OneHotEncoder
OneHotEncode = OneHotEncoder()

columns_to_one_hot_encode = ['ACLSWKR', 'ADTIND', 'ADTOCC', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR']

for item in columns_to_one_hot_encode:
    df[item] = df[item].astype('category') # Must convert the strings to category numbers for One Hot to work
    df[item + '_new'] = df[item].cat.codes # Rob: Need to research this more
    # print(f" Column: {item}")
    # print(df[item + '_new'])

OneHot_df = pd.DataFrame(OneHotEncode.fit_transform(df[columns_to_one_hot_encode]).toarray())

PostOneHot_df = df.join(OneHot_df) # Appends the OneHot_df to the original dataframe to create a new one
PostOneHot_df[:-100] # Check results from the One Hot Encoding
PostOneHot_df = PostOneHot_df.drop(columns=columns_to_one_hot_encode)
df = PostOneHot_df
df


Unnamed: 0,AAGE,AHGA,AHRSPAY,AHSCOL,CAPGAIN,CAPLOSS,DIVVAL,MIGSAME,MIGSUN,NOEMP,...,458,459,460,461,462,463,464,465,466,467
0,73,High school graduate,0,Not in universe,0,0,0,Not in universe under 1 year old,?,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,58,Some college but no degree,0,Not in universe,0,0,0,No,Yes,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,18,10th grade,0,High school,0,0,0,Not in universe under 1 year old,?,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,9,Children,0,Not in universe,0,0,0,Yes,Not in universe,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,10,Children,0,Not in universe,0,0,0,Yes,Not in universe,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199518,87,7th and 8th grade,0,Not in universe,0,0,0,Not in universe under 1 year old,?,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
199519,65,11th grade,0,Not in universe,6418,0,9,Yes,Not in universe,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
199520,47,Some college but no degree,0,Not in universe,0,0,157,Not in universe under 1 year old,?,6,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
199521,16,10th grade,0,High school,0,0,0,Not in universe under 1 year old,?,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


Now let's label encode some of the columns, but first let's update the columns so they have an inherent rank order

Reference: https://www.geeksforgeeks.org/ml-label-encoding-of-datasets-in-python


In [4]:
# This cell helps view the values we want to label encode
df['VETQVA'].unique().tolist()

['Not in universe', 'No', 'Yes']

In [5]:
# This cell is for label encoding prep
columns_to_label_encode = ['AHGA','AHSCOL','MIGSAME','MIGSUN','VETQVA',]

# Before label encoding we want to apply some value judgements to the data to give the resulting labels some ranking
#   education
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Children", "0"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Less than 1st grade", "1"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("1st 2nd 3rd or 4th grade", "2"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("5th or 6th grade", "3"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("7th and 8th grade", "4"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("9th grade", "5"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("10th grade", "6"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("11th grade", "7"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("12th grade no diploma", "8"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("High school graduate", "9"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Some college but no degree", "10"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Associates degree-occup /vocational", "11"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Associates degree-academic program", "12"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Bachelors degree(BA AB BS)", "13"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Masters degree(MA MS MEng MEd MSW MBA)", "14"))
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Doctorate degree(PhD EdD)", "15")) # Sorry Dr. Feuz, but the professional doctorates have you beat in earning potential
df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Prof school degree (MD DDS DVM LLB JD)", "16"))

#   enrolled in edu inst last wk
df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("Not in universe", "0"))
df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("High school", "1"))
df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("College or university", "2"))

#   live in this house 1 year ago
df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("Not in universe under 1 year old", "0"))
df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("No", "1"))
df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("Yes", "2"))

#   migration prev res in sunbelt
df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("?", "0"))
df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("Not in universe", "1"))
df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("No", "1"))
df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("Yes", "2"))

#   fill inc questionnaire for veteran's admin
df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("Not in universe", "0"))
df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("No", "1"))
df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("Yes", "2"))

for item in columns_to_label_encode:
    print(df[item].value_counts().sort_index())
    print()



AHGA
0     47422
1       819
10    27820
11     5358
12     4363
13    19865
14     6541
15     1263
16     1793
2      1799
3      3277
4      8007
5      6230
6      7557
7      6876
8      2126
9     48407
Name: count, dtype: int64

AHSCOL
0    186943
1      6892
2      5688
Name: count, dtype: int64

MIGSAME
0    101212
1     15773
2     82538
Name: count, dtype: int64

MIGSUN
0    99696
1    94041
2     5786
Name: count, dtype: int64

VETQVA
0    197539
1      1593
2       391
Name: count, dtype: int64



In [6]:
from sklearn.preprocessing import LabelEncoder
LabelEncode = LabelEncoder()

for item in columns_to_label_encode:
    df[item]= LabelEncode.fit_transform(df[item])
    print(f"Post Label Encoding for {item}: {df[item].unique()}")

df = df.drop(columns=columns_to_label_encode)

Post Label Encoding for AHGA: [16  2 13  0  5  6  1  4 11 15  3  8 10 14  7 12  9]
Post Label Encoding for AHSCOL: [0 1 2]
Post Label Encoding for MIGSAME: [0 1 2]
Post Label Encoding for MIGSUN: [0 2 1]
Post Label Encoding for VETQVA: [0 1 2]


In [None]:
# May want to add some metrics of my own

In [None]:
# To Do: Scale the remaining numerical data

In [None]:
# To Do: Cross Fit Testing

Here is a Kaggle page showing the best categorical classifiers for a given data set:
* https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn
* Comment about grid search: https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn/comments#135499



In [None]:
# Models to try: CategoricalNB, RandomForest

In [7]:
df[:-100]

Unnamed: 0,AAGE,AHRSPAY,CAPGAIN,CAPLOSS,DIVVAL,NOEMP,VETYN,WKSWORK,CLASS,ACLSWKR_new,...,458,459,460,461,462,463,464,465,466,467
0,73,0,0,0,0,0,2,0,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,58,0,0,0,0,1,2,52,-1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,18,0,0,0,0,0,2,0,-1,3,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,9,0,0,0,0,0,0,0,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,10,0,0,0,0,0,0,0,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199418,9,0,0,0,0,0,0,0,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
199419,12,0,0,0,0,0,0,0,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
199420,7,0,0,0,0,0,0,0,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
199421,60,0,0,0,0,1,2,13,-1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


Would be good to write code that would loop through all the columns and print out the uniques to add decisions about one-hot vs label encoding vs scaling:

So that takes one column and generates 9 columns