In [1]:
# import required modules
import os
import time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, balanced_accuracy_score # balanced_accuracy_score with adjusted=True is Informedness
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

df_train = pd.read_csv('Datasets for Assignment 3/census-income.csv')
df_test = pd.read_csv('Datasets for Assignment 3/census-income-test.csv')
print(f"df_train.info(): {df_train.info()} vs df_test.info(): {df_test.info()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 40 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   AAGE      199523 non-null  int64 
 1   ACLSWKR   199523 non-null  object
 2   ADTIND    199523 non-null  int64 
 3   ADTOCC    199523 non-null  int64 
 4   AHGA      199523 non-null  object
 5   AHRSPAY   199523 non-null  int64 
 6   AHSCOL    199523 non-null  object
 7   AMARITL   199523 non-null  object
 8   AMJIND    199523 non-null  object
 9   AMJOCC    199523 non-null  object
 10  ARACE     199523 non-null  object
 11  AREORGN   199523 non-null  object
 12  ASEX      199523 non-null  object
 13  AUNMEM    199523 non-null  object
 14  AUNTYPE   199523 non-null  object
 15  AWKSTAT   199523 non-null  object
 16  CAPGAIN   199523 non-null  int64 
 17  CAPLOSS   199523 non-null  int64 
 18  DIVVAL    199523 non-null  int64 
 19  FILESTAT  199523 non-null  object
 20  GRINREG   199523 non-null 

In [2]:
# Some of the values in the train set are not in the test set
# Rather than figure out which value is missing from the one hot encoding
# I am going to add the additional CLASS column to the test file
# and a FILE column so I can track which rows belong to which file

df_train['FILE'] = 'Train'

df_test['CLASS'] = 100
df_test['FILE'] = 'Test'

df_all = pd.concat([df_train, df_test])
# df_all.iloc[0:10,:].to_csv("df_all_top.csv") # Export the last 10 rows
# df_all.iloc[-10:,:].to_csv("df_all_bottom.csv") # Export the last 10 rows
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299285 entries, 0 to 99761
Data columns (total 41 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   AAGE      299285 non-null  int64 
 1   ACLSWKR   299285 non-null  object
 2   ADTIND    299285 non-null  int64 
 3   ADTOCC    299285 non-null  int64 
 4   AHGA      299285 non-null  object
 5   AHRSPAY   299285 non-null  int64 
 6   AHSCOL    299285 non-null  object
 7   AMARITL   299285 non-null  object
 8   AMJIND    299285 non-null  object
 9   AMJOCC    299285 non-null  object
 10  ARACE     299285 non-null  object
 11  AREORGN   299285 non-null  object
 12  ASEX      299285 non-null  object
 13  AUNMEM    299285 non-null  object
 14  AUNTYPE   299285 non-null  object
 15  AWKSTAT   299285 non-null  object
 16  CAPGAIN   299285 non-null  int64 
 17  CAPLOSS   299285 non-null  int64 
 18  DIVVAL    299285 non-null  int64 
 19  FILESTAT  299285 non-null  object
 20  GRINREG   299285 non-null  objec

When to One-Hot Encode vs Label Encode?

```To prevent biases from being introduced, One-Hot Encoding is preferable for nominal data (where there is no inherent order among categories). Label encoding, however, might be more appropriate for ordinal data (where categories naturally have an order)```

So we should one-hot encode columns like class of worker, state of residence, etc. After reviewing the column descriptions I decided to one-hot encode all the following columns:

'ACLSWKR', 'ADTIND', 'ADTOCC', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR'

**https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/**



In [3]:
def Preprocessing(df, columns_to_one_hot_encode, columns_to_label_encode, columns_to_scale):

    def OneHotEncode (df, columns_to_one_hot_encode):
        from sklearn.preprocessing import OneHotEncoder
        onehotencode = OneHotEncoder()      

        for item in columns_to_one_hot_encode:
            df[item] = df[item].astype('category') # Must convert the strings to category numbers for One Hot to work
            df[item + '_new'] = df[item].cat.codes # Rob: Need to research this more
            # print(f" Column: {item}")
            # print(df[item + '_new'])

        OneHot_df = pd.DataFrame(onehotencode.fit_transform(df[columns_to_one_hot_encode]).toarray())

        PostOneHot_df = df.join(OneHot_df) # Appends the OneHot_df to the original dataframe to create a new one
        PostOneHot_df[:-100] # Check results from the One Hot Encoding
        PostOneHot_df = PostOneHot_df.drop(columns=columns_to_one_hot_encode)
        df = PostOneHot_df
        return df

    def StripSpaces (df):
        # I noticed some of the columns get imported with leading spaces. I want to strip() these right away
        for column in df.select_dtypes(include=object): # Only review the columns with a str datatype
            df[column] = df[column].apply(lambda x: x.strip())
        return df
    
    def PreLabelEncode(df):
        #   Before label encoding we want to apply some value judgements to the data to give the resulting labels some ranking
        #   education
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Children", "0"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Less than 1st grade", "1"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("1st 2nd 3rd or 4th grade", "2"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("5th or 6th grade", "3"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("7th and 8th grade", "4"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("9th grade", "5"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("10th grade", "6"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("11th grade", "7"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("12th grade no diploma", "8"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("High school graduate", "9"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Some college but no degree", "10"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Associates degree-occup /vocational", "11"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Associates degree-academic program", "12"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Bachelors degree(BA AB BS)", "13"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Masters degree(MA MS MEng MEd MSW MBA)", "14"))
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Doctorate degree(PhD EdD)", "15")) # Sorry Dr. Feuz, but the professional doctorates have you beat in earning potential
        df["AHGA"] = df["AHGA"].apply(lambda x: x.replace("Prof school degree (MD DDS DVM LLB JD)", "16"))

        #   enrolled in edu inst last wk
        df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("Not in universe", "0"))
        df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("High school", "1"))
        df["AHSCOL"] = df["AHSCOL"].apply(lambda x: x.replace("College or university", "2"))

        #   live in this house 1 year ago
        df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("Not in universe under 1 year old", "0"))
        df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("No", "1"))
        df["MIGSAME"] = df["MIGSAME"].apply(lambda x: x.replace("Yes", "2"))

        #   migration prev res in sunbelt
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("?", "0"))
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("Not in universe", "1"))
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("No", "1"))
        df["MIGSUN"] = df["MIGSUN"].apply(lambda x: x.replace("Yes", "2"))

        #   fill inc questionnaire for veteran's admin
        df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("Not in universe", "0"))
        df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("No", "1"))
        df["VETQVA"] = df["VETQVA"].apply(lambda x: x.replace("Yes", "2"))

        return df

    def LabelEncode(df, columns_to_label_encode):
        from sklearn.preprocessing import LabelEncoder
        LabelEncode = LabelEncoder()

        for item in columns_to_label_encode:
            df[item]= LabelEncode.fit_transform(df[item])
            # print(f"Post Label Encoding for {item}: {df[item].unique()}")

        return df
    
    def StandardScale(df, columns_to_scale):
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

        scaled_columns = scaler.fit_transform(df[columns_to_scale])
        df[columns_to_scale] = scaled_columns

        return df
    
    df = StripSpaces(df)
    df = OneHotEncode(df, columns_to_one_hot_encode)
    df = PreLabelEncode(df)
    df = LabelEncode(df, columns_to_label_encode)
    df = StandardScale(df, columns_to_scale)
    
    return(df)


Now let's label encode some of the columns, but first let's update the columns so they have an inherent rank order

Reference: https://www.geeksforgeeks.org/ml-label-encoding-of-datasets-in-python


In [4]:
# This cell helps view the values we want to label encode
# df_test['VETQVA'].unique().tolist()

In [5]:
# df_train = df_train.drop(columns='CLASS')

columns_to_one_hot_encode = ['ACLSWKR', 'ADTIND', 'ADTOCC', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN', 'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP', 'SEOTR']
columns_to_label_encode = ['AHGA','AHSCOL','MIGSAME','MIGSUN','VETQVA',]
columns_to_scale = ['AAGE','AHRSPAY','CAPGAIN','CAPLOSS','DIVVAL','NOEMP','WKSWORK',]

df_all = Preprocessing(df_all, columns_to_one_hot_encode, columns_to_label_encode, columns_to_scale)

# print(f"df_train.info(): {df_train.info()} vs df_test.info(): {df_test.info()}")

Here is a Kaggle page showing the best categorical classifiers for a given data set:
* https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn
* Comment about grid search: https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn/comments#135499



In [6]:
df_all.dtypes

AAGE       float64
AHGA         int64
AHRSPAY    float64
AHSCOL       int64
CAPGAIN    float64
            ...   
463        float64
464        float64
465        float64
466        float64
467        float64
Length: 509, dtype: object

In [7]:
# Separate the files once again
df_train = pd.DataFrame()

df_train = df_all.loc[df_all['FILE'] == 'Train']
df_train = df_train.drop(columns='FILE')

df_test = pd.DataFrame()
df_test = df_all.loc[df_all['FILE'] == 'Test']
df_test = df_test.drop(columns='FILE')

df_train.info: <bound method DataFrame.info of             AAGE  AHGA   AHRSPAY  AHSCOL   CAPGAIN   CAPLOSS    DIVVAL  \
0       1.723284    16 -0.201599       0 -0.092435 -0.136584 -0.101067   
1       1.051194     2 -0.201599       0 -0.092435 -0.136584 -0.101067   
2      -0.741047    13 -0.201599       1 -0.092435 -0.136584 -0.101067   
3      -1.144301     0 -0.201599       0 -0.092435 -0.136584 -0.101067   
4      -1.099495     0 -0.201599       0 -0.092435 -0.136584 -0.101067   
...          ...   ...       ...     ...       ...       ...       ...   
199518  2.350569    11 -0.201599       0 -0.092435 -0.136584 -0.101067   
199519  1.364836    14 -0.201599       0  1.281645 -0.136584 -0.096422   
199520  0.558328     2 -0.201599       0 -0.092435 -0.136584 -0.020049   
199521 -0.830659    13 -0.201599       1 -0.092435 -0.136584 -0.101067   
199522 -0.113762    16 -0.201599       0 -0.092435 -0.136584 -0.101067   

        MIGSAME  MIGSUN     NOEMP  ...  458  459  460  461  462 

In [8]:
print(f"df_train.info: {df_train.info} vs df_test.info: {df_test.info}")

df_train.info: <bound method DataFrame.info of             AAGE  AHGA   AHRSPAY  AHSCOL   CAPGAIN   CAPLOSS    DIVVAL  \
0       1.723284    16 -0.201599       0 -0.092435 -0.136584 -0.101067   
1       1.051194     2 -0.201599       0 -0.092435 -0.136584 -0.101067   
2      -0.741047    13 -0.201599       1 -0.092435 -0.136584 -0.101067   
3      -1.144301     0 -0.201599       0 -0.092435 -0.136584 -0.101067   
4      -1.099495     0 -0.201599       0 -0.092435 -0.136584 -0.101067   
...          ...   ...       ...     ...       ...       ...       ...   
199518  2.350569    11 -0.201599       0 -0.092435 -0.136584 -0.101067   
199519  1.364836    14 -0.201599       0  1.281645 -0.136584 -0.096422   
199520  0.558328     2 -0.201599       0 -0.092435 -0.136584 -0.020049   
199521 -0.830659    13 -0.201599       1 -0.092435 -0.136584 -0.101067   
199522 -0.113762    16 -0.201599       0 -0.092435 -0.136584 -0.101067   

        MIGSAME  MIGSUN     NOEMP  ...  458  459  460  461  462 

In [11]:
from sklearn.model_selection import train_test_split

X = df_train.drop(columns='CLASS').values # Include ALL columns except CLASS
X_test = df_test.drop(columns='CLASS').values # Include ALL columns except CLASS

y = df_train['CLASS'].values # Only include Class

# Initially I want a smaller training set so I can evaluate many models faster
X_train, X_test_discard, y_train, y_test_discard = train_test_split(X, y, test_size = 0.01) # Intentionally setting aside a "test" set that I will not use
                                                    
print(f"X_train.shape {X_train.shape} compared to X_test.shape {X_test.shape}")

# Would normally run the following line, but CLASS isn't in the test data
# X_test = df_test.drop(columns='CLASS').values # Include ALL columns except CLASS

# print(f"X_test.shape {X_test.shape} compared to df_test.shape {df_test.info}")

X_train.shape (197527, 507) compared to X_test.shape (99762, 507)


In [None]:
# from sklearn.metrics import accuracy_score, log_loss
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC, LinearSVC, NuSVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# classifiers = [
#     # SVC(kernel="rbf", C=0.025, probability=True),
#     # NuSVC(probability=True),
#     # DecisionTreeClassifier(),
#     RandomForestClassifier(),
#     AdaBoostClassifier(),
#     GradientBoostingClassifier(),
#     # GaussianNB(),
#     # LinearDiscriminantAnalysis(),
#     # QuadraticDiscriminantAnalysis()
#     ]

# # Logging for Visual Comparison
# log_cols=["Classifier", "Accuracy", "Log Loss"]
# log = pd.DataFrame(columns=log_cols)

# for clf in classifiers:
#     clf.fit(X_train, y_train)
#     name = clf.__class__.__name__
    
#     print("="*30)
#     print(name)
    
#     print('****Results****')
#     train_predictions = clf.predict(X_test)
#     acc = accuracy_score(y_test, train_predictions)
#     print("Accuracy: {:.4%}".format(acc))
    
#     train_predictions = clf.predict_proba(X_test)
#     ll = log_loss(y_test, train_predictions)
#     print("Log Loss: {}".format(ll))
    
#     log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
#     # log = log.append(log_entry)
    
# print("="*30)

Here are the high level results using only 5% of the data:

![Alt text](image-5.png)

I will narrow in on the 3 most promising models (RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier()))and rerun with 25% of the training data

![Alt text](image-6.png)

Having determined that GradientBoostingClassifier is the lowest overall model using log_loss (log_loss is a cost function where we want the lowest value unlike utility functions where we want the highest), we can now Cross Validate and GridSearch to find the best hyperparameters
Credit:  https://www.kaggle.com/code/hatone/gradientboostingclassifier-with-gridsearchcv/script

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

parameters = {
    "loss":["log_loss"],
    "learning_rate": [0.01, 0.075, 0.1, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,8],
    "max_features":["log2"],
    "criterion": ["friedman_mse"],
    "subsample":[0.5, 1.0],
    "n_estimators":[10]
    }

# Do cross fold validation using all processors. Default CV is 5 folds
clf = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs=-1, cv=5, scoring="neg_mean_squared_error") # GridSearchCV requires cost functions so you have turn some scoring metrics into negative numbers for it to work.

clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)
print(clf.cv_results_)



-0.24820910559062812
{'criterion': 'friedman_mse', 'learning_rate': 0.01, 'loss': 'log_loss', 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 0.5}


The GradientBoostingGrid Search ran for hours, but ultimately the Grid Search for the Gradient Boost returned: 

-0.24820910559062812
{'criterion': 'friedman_mse', 'learning_rate': 0.01, 'loss': 'log_loss', 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 0.5}


. Random Forest didn't perform significantly worse in the Classifier Showdown so trying it out to see if it is faster. Will let Gradient Boosting run overnight on the weekend.

https://stackoverflow.com/questions/50993867/increasing-n-jobs-has-no-effect-on-gridsearchcv


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

parameters = [{
    'n_estimators': [20, 30, 40, 50, 60], 
    'max_features': [100, 200, 300, 400, 500], 
    'criterion': ['log_loss'] # Need to consider with gini would be better
    }] 

# Do cross fold validation using all processors. Default CV is 5 folds
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, cv=5, scoring="neg_mean_squared_error") # GridSearchCV requires cost functions so you have turn some scoring metrics into negative numbers for it to work.

clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)
print(clf.cv_results_)



KeyboardInterrupt: 

Indeed, the RandomForest() did perform the GridSearch successfully. Here are the results:
-0.0024907987262500824
{'criterion': 'log_loss', 'max_features': 200, 'n_estimators': 60}

In [None]:
# # Reset X_test to the values from df_test rather than the results of the split
# X_test = df_test.values # Include ALL columns except CLASS

Would be good to write code that would loop through all the columns and print out the uniques to add decisions about one-hot vs label encoding vs scaling:

So that takes one column and generates 9 columns

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier('friedman_mse', learning_rate = 0.01, loss= 'log_loss', max_depth= 3, max_features= 'log2', min_samples_leaf= 0.1, min_samples_split= 0.1, n_estimators= 10, subsample= 0.5)
clf.fit(X_train, y_train)

# print(f"X_train.shape: {X_train.shape} vs X_test.shape: {X_test.shape}")
result = clf.predict(X_test)
print(f"result size: {result.shape}")


TypeError: GradientBoostingClassifier.__init__() takes 1 positional argument but 2 positional arguments (and 8 keyword-only arguments) were given

In [None]:
from sklearn.ensemble import RandomForestClassifier

# clf = RandomForestClassifier(criterion = 'log_loss', max_features = 200, n_estimators = 60)
# clf.fit(X_train, y_train)

# print(f"X_train.shape: {X_train.shape} vs X_test.shape: {X_test.shape}")

result = clf.predict(X_test)
print(f"result size: {result.shape}")


result size: (99762,)


In [15]:
X_test[1:507]

array([[ 0.42390982,  3.        , -0.20159864, ...,  1.        ,
         0.        ,  0.        ],
       [-1.4579429 ,  0.        , -0.20159864, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.02065567, 16.        , -0.20159864, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.92027069,  0.        , -0.20159864, ...,  1.        ,
         0.        ,  0.        ],
       [-0.20337442,  5.        , -0.20159864, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.55832787,  5.        , -0.20159864, ...,  1.        ,
         0.        ,  0.        ]])

In [16]:

# Program to save a NumPy array to a text file
  
# Displaying the array
print('Array:\n', result)

result.dtype

np.savetxt("Christiansen_Rob.txt", result, newline="\n", fmt = '%i')
# result.T.tofile('Christiansen_Rob.txt', sep = ',')

Array:
 [-1 -1 -1 ... -1 -1 -1]


Resouces Consulted for major questions:

* https://www.kaggle.com/code/jeffd23/10-classifier-showdown-in-scikit-learn
* https://stackoverflow.com/questions/38151615/specific-cross-validation-with-random-forest
