In [32]:
## importing packages
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [33]:
#importing csv file
file1_path = '/Users/stellasi/Desktop/midterm-sbel2/train.csv'
file2_path = '/Users/stellasi/Desktop/midterm-sbel2/test.csv'

train = pd.read_csv(file1_path)
test = pd.read_csv(file2_path)

## Preprocessing the datasets

Before running the models, we need to first encode the categorical features and scale the continous features. To ensure uniformity of categorical encoding, I concacted the train and test datasets together. I will split the dataset back to its original datasets after encoding for normalizing to avoid leaking the test data to the training data.

In [34]:
data = pd.concat([train, test], ignore_index=True)

### Encoding

I encoded the categorical features in two ways go use for later analysis depending on what the model requires:

    (1) One-hot encoding.

    (2) Numerical encoding to make 1 to n categories within one column.

To find the categorical features, I first found the columns that has "object" type features which are definetly categorical. Then, for numerical encoding, I included the "year" column to recode them into simpler categories. And for one-hot encoding, I added the previously numerically encoded columns as well to convert into one-hot encoding.

Note: For numerical encoding, I made sure to specify the categorical features to be categorical for future analyses

In [35]:
# selecting categorical features for numerical encoding
categorical_features = list (data.dtypes[data.dtypes == "object"].index)
categorical_features.extend(["year"])

#numerical encoding
data_cn = pd.DataFrame(data) ## numerical encoding
le = LabelEncoder()
data_cat = data_cn[categorical_features]
data_cat = data_cat.apply(le.fit_transform)
data_cn[categorical_features] = data_cat

#specifying the categorical features as categorical
for col in categorical_features:
    data_cn[col] = data_cn[col].astype('category')

data_cn.head()

Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran's_admin,veterans_benefits,weeks_worked_in_year,year,label
0,56,3,0,0,9,0,2,0,14,6,...,0,40,40,4,0,1,2,1,1,1.0
1,53,1,43,2,14,0,2,0,5,2,...,40,40,40,4,0,1,2,52,1,1.0
2,47,4,35,33,7,0,2,2,7,8,...,40,40,40,4,0,1,2,52,1,0.0
3,71,3,0,0,6,0,2,2,14,6,...,40,40,40,4,0,1,2,0,1,0.0
4,2,3,0,0,10,0,2,4,14,6,...,40,40,40,4,0,1,0,0,1,0.0


In [36]:

# adding additional categorical features for one hot encoding
categorical_features.extend(["detailed_industry_recode", "detailed_occupation_recode", "veterans_benefits", "own_business_or_self_employed"])
#one-hot encoding
data_cato = data[categorical_features]
data_cato = pd.get_dummies(data_cato)

In [37]:
data_cato = data_cato.astype(int)
data_co = pd.concat([data.drop(categorical_features, axis=1), data_cato], axis=1)
data_co.head()

Unnamed: 0,age,wage_per_hour,capital_gains,capital_losses,dividends_from_stocks,num_persons_worked_for_employer,weeks_worked_in_year,label,year,detailed_industry_recode,...,country_of_birth_self_ Vietnam,country_of_birth_self_ Yugoslavia,citizenship_ Foreign born- Not a citizen of U S,citizenship_ Foreign born- U S citizen by naturalization,citizenship_ Native- Born abroad of American Parent(s),citizenship_ Native- Born in Puerto Rico or U S Outlying,citizenship_ Native- Born in the United States,fill_inc_questionnaire_for_veteran's_admin_ No,fill_inc_questionnaire_for_veteran's_admin_ Not in universe,fill_inc_questionnaire_for_veteran's_admin_ Yes
0,56,0,0,0,2353,6,1,1.0,95,0,...,0,0,0,0,0,0,1,0,1,0
1,53,0,0,1564,0,6,52,1.0,95,43,...,0,0,0,0,0,0,1,0,1,0
2,47,0,0,0,0,6,52,0.0,95,35,...,0,0,0,0,0,0,1,0,1,0
3,71,0,0,0,0,0,0,0.0,95,0,...,0,0,0,0,0,0,1,0,1,0
4,2,0,0,0,0,0,0,0.0,95,0,...,0,0,0,0,0,0,1,0,1,0


### Normalization

For the best quality features, I first log transformed the continous features to neutralize any exceptionally large values. I then scaled the data with a scaler tool, trying three different methods: minmaxscaler, standardscaler, and robustscaler.

To find the continous features, we can first located the numerical columns from the original dataframe, but we can see that some categorical features are numerically encoded. We will have to exclude them for scaling. We will also exclude "year" as we already encoded it to be categorical.

I will conduct this process to both the numerically encoded dataframe and the one-hot encoded dataframe. But before, I will split the data back to its original form before noramlizing to prevent any bleeding of data.

In [38]:
train_cn = data_cn[data_cn['label'].notna()]
test_cn = data_cn[data_cn['label'].isna()]
train_co = data_co[data_co['label'].notna()]
test_co = data_co[data_co['label'].isna()]

In [39]:
#Droping the label column out of the test data set since there are no labels for the test set yet
test_cn = test_cn.drop(columns="label")
test_co = test_co.drop(columns="label")

In [40]:
## selecting the continous features
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
columns_to_exclude = ["detailed_industry_recode", "detailed_occupation_recode", "veterans_benefits", "own_business_or_self_employed", "label", "year"] 

numeric_columns_filtered = [col for col in numeric_columns if col not in columns_to_exclude]

print("These will be the columns that we will transform and scale: ")
print(numeric_columns_filtered)

These will be the columns that we will transform and scale: 
['age', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'num_persons_worked_for_employer', 'weeks_worked_in_year']


#### log transformation

In [41]:
## adding a small number to avoid zero values for log transformation
for column in numeric_columns_filtered:
    train_cn.loc[:, column] = np.log(train_cn[column] + 1e-10)
    train_co.loc[:, column] = np.log(train_co[column] + 1e-10)
    test_cn.loc[:, column] = np.log(test_cn[column] + 1e-10)
    test_co.loc[:, column] = np.log(test_co[column] + 1e-10)

In [42]:
#quickly examining one of the dataframes
print("Log-transformed numerically encoded train dataset:")
train_cn.head()

Log-transformed numerically encoded train dataset:


Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran's_admin,veterans_benefits,weeks_worked_in_year,year,label
0,4.025352,3,0,0,9,-23.025851,2,0,14,6,...,0,40,40,4,0,1,2,1e-10,1,1.0
1,3.970292,1,43,2,14,-23.025851,2,0,5,2,...,40,40,40,4,0,1,2,3.951244,1,1.0
2,3.850148,4,35,33,7,-23.025851,2,2,7,8,...,40,40,40,4,0,1,2,3.951244,1,0.0
3,4.26268,3,0,0,6,-23.025851,2,2,14,6,...,40,40,40,4,0,1,2,-23.02585,1,0.0
4,0.693147,3,0,0,10,-23.025851,2,4,14,6,...,40,40,40,4,0,1,0,-23.02585,1,0.0


#### feature scaling
I am including code for three scaling methods so I have options to train my models.

In [43]:
# #Minmaxscaler
# min_max_scaler = preprocessing.MinMaxScaler()
# train_cn.loc[:, numeric_columns_filtered] = min_max_scaler.fit_transform(train_cn[numeric_columns_filtered])
# test_cn.loc[:, numeric_columns_filtered] = min_max_scaler.fit_transform(test_cn[numeric_columns_filtered])
# train_co.loc[:, numeric_columns_filtered] = min_max_scaler.fit_transform(train_co[numeric_columns_filtered])
# test_co.loc[:, numeric_columns_filtered] = min_max_scaler.fit_transform(test_co[numeric_columns_filtered])

#Standard Scaler
sscaler = StandardScaler()
train_cn.loc[:, numeric_columns_filtered] = sscaler.fit_transform(train_cn[numeric_columns_filtered])
test_cn.loc[:, numeric_columns_filtered] = sscaler.fit_transform(test_cn[numeric_columns_filtered])
train_co.loc[:, numeric_columns_filtered] = sscaler.fit_transform(train_co[numeric_columns_filtered])
test_co.loc[:, numeric_columns_filtered] = sscaler.fit_transform(test_co[numeric_columns_filtered])
train_cn.head()

#Robust Scaler
# rscaler = RobustScaler()
# train_cn.loc[:, numeric_columns_filtered] = rscaler.fit_transform(train_cn[numeric_columns_filtered])
# test_cn.loc[:, numeric_columns_filtered] = rscaler.fit_transform(test_cn[numeric_columns_filtered])
# train_co.loc[:, numeric_columns_filtered] = rscaler.fit_transform(train_co[numeric_columns_filtered])
# test_co.loc[:, numeric_columns_filtered] = rscaler.fit_transform(test_co[numeric_columns_filtered])


Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran's_admin,veterans_benefits,weeks_worked_in_year,year,label
0,0.352953,3,0,0,9,-0.243543,2,0,14,6,...,0,40,40,4,0,1,2,0.644346,1,1.0
1,0.335313,1,43,2,14,-0.243543,2,0,5,2,...,40,40,40,4,0,1,2,0.940709,1,1.0
2,0.296821,4,35,33,7,-0.243543,2,2,7,8,...,40,40,40,4,0,1,2,0.940709,1,0.0
3,0.428988,3,0,0,6,-0.243543,2,2,14,6,...,40,40,40,4,0,1,2,-1.082709,1,0.0
4,-0.714617,3,0,0,10,-0.243543,2,4,14,6,...,40,40,40,4,0,1,0,-1.082709,1,0.0


In [44]:
print("Log-transformed and scaled numerically encoded train dataset:")
train_cn.head()

Log-transformed and scaled numerically encoded train dataset:


Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran's_admin,veterans_benefits,weeks_worked_in_year,year,label
0,0.352953,3,0,0,9,-0.243543,2,0,14,6,...,0,40,40,4,0,1,2,0.644346,1,1.0
1,0.335313,1,43,2,14,-0.243543,2,0,5,2,...,40,40,40,4,0,1,2,0.940709,1,1.0
2,0.296821,4,35,33,7,-0.243543,2,2,7,8,...,40,40,40,4,0,1,2,0.940709,1,0.0
3,0.428988,3,0,0,6,-0.243543,2,2,14,6,...,40,40,40,4,0,1,2,-1.082709,1,0.0
4,-0.714617,3,0,0,10,-0.243543,2,4,14,6,...,40,40,40,4,0,1,0,-1.082709,1,0.0


In [45]:
print("Log-transformed and scaled numerically encoded test dataset:")
test_cn.head()

Log-transformed and scaled numerically encoded test dataset:


Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,family_members_under_18,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran's_admin,veterans_benefits,weeks_worked_in_year,year
101143,0.084506,1,43,10,9,-0.239314,2,4,5,10,...,4,40,40,40,4,0,1,2,0.784188,1
101144,0.109785,6,37,8,12,-0.239314,2,4,2,10,...,4,40,40,40,4,0,1,2,0.946129,0
101145,0.413422,3,0,0,5,-0.239314,2,4,14,6,...,4,40,0,40,4,0,1,2,-1.076905,1
101146,0.010725,4,33,24,12,-0.239314,2,4,19,0,...,4,26,26,40,4,2,1,2,0.946129,1
101147,0.057026,4,4,40,1,-0.239314,2,2,4,4,...,4,26,26,26,0,0,1,2,0.888147,1


## Making the training and testing set for modeling

In [46]:
#Assessing our classes
count_label_1 = (train_co['label'] == 1).sum()
count_label_0 = (train_co['label'] == 0).sum()
print(count_label_1)
print(count_label_0)
if count_label_1 != count_label_0:
    print("The labels are unbalanced where there are unequal numbers of labels 0 and 1")

11144
89999
The labels are unbalanced where there are unequal numbers of labels 0 and 1


Here, I take note that the label classes are heavily imbalanced with a 1 to 8 ratio, which will impact my models, so I will try to mediate the difference in my coding. 

First, I will split the training dataset into training and testing sets to enable internal evaluation of my models. I will have the numerically encoded version, and the one-hot encoded version.

#### data splitting
Since numerical encoding will be used less frequently in my models, I will give it a more complicated name to denote the difference.

In [47]:
# numerical encoding train and test set
X_n = train_cn.drop('label', axis=1)  # Features
y_n = train_cn['label']  # Target

X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_n, y_n, test_size=0.2, random_state=42)

# one-hot encoding train and test set
X = train_co.drop('label', axis=1)  # Features
y = train_co['label']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
X_train.head()

Unnamed: 0,age,wage_per_hour,capital_gains,capital_losses,dividends_from_stocks,num_persons_worked_for_employer,weeks_worked_in_year,year,detailed_industry_recode,detailed_occupation_recode,...,country_of_birth_self_ Vietnam,country_of_birth_self_ Yugoslavia,citizenship_ Foreign born- Not a citizen of U S,citizenship_ Foreign born- U S citizen by naturalization,citizenship_ Native- Born abroad of American Parent(s),citizenship_ Native- Born in Puerto Rico or U S Outlying,citizenship_ Native- Born in the United States,fill_inc_questionnaire_for_veteran's_admin_ No,fill_inc_questionnaire_for_veteran's_admin_ Not in universe,fill_inc_questionnaire_for_veteran's_admin_ Yes
98869,0.067861,-0.243543,-0.216631,-0.155029,-0.371524,0.94259,0.911015,94,41,13,...,0,0,0,0,0,0,1,0,1,0
27671,0.038716,-0.243543,-0.216631,-0.155029,-0.371524,-1.08231,-1.082709,94,33,41,...,0,0,0,0,0,0,1,0,1,0
21260,0.260786,-0.243543,-0.216631,-0.155029,-0.371524,0.976222,0.940709,95,32,44,...,0,0,0,0,0,0,1,0,1,0
2556,-0.16845,-0.243543,-0.216631,-0.155029,-0.371524,-1.08231,-1.082709,95,0,0,...,0,0,0,0,0,0,1,0,1,0
6429,0.202374,-0.243543,-0.216631,-0.155029,-0.371524,0.94259,0.940709,94,18,3,...,0,0,0,0,0,0,1,0,1,0


In [49]:
X_test.head()

Unnamed: 0,age,wage_per_hour,capital_gains,capital_losses,dividends_from_stocks,num_persons_worked_for_employer,weeks_worked_in_year,year,detailed_industry_recode,detailed_occupation_recode,...,country_of_birth_self_ Vietnam,country_of_birth_self_ Yugoslavia,citizenship_ Foreign born- Not a citizen of U S,citizenship_ Foreign born- U S citizen by naturalization,citizenship_ Native- Born abroad of American Parent(s),citizenship_ Native- Born in Puerto Rico or U S Outlying,citizenship_ Native- Born in the United States,fill_inc_questionnaire_for_veteran's_admin_ No,fill_inc_questionnaire_for_veteran's_admin_ Not in universe,fill_inc_questionnaire_for_veteran's_admin_ Yes
8266,-0.16845,-0.243543,-0.216631,-0.155029,-0.371524,-1.08231,-1.082709,94,0,0,...,0,0,0,0,0,0,1,0,1,0
71640,0.316645,-0.243543,-0.216631,-0.155029,-0.371524,0.94259,0.940709,94,12,35,...,0,0,1,0,0,0,0,0,1,0
53556,0.245154,4.170027,-0.216631,-0.155029,-0.371524,0.976222,0.940709,95,41,13,...,0,0,0,0,0,0,1,0,1,0
1938,-0.421056,-0.243543,-0.216631,-0.155029,-0.371524,-1.08231,-1.082709,95,0,0,...,0,0,0,0,0,0,1,0,1,0
95855,-0.584714,-0.243543,-0.216631,-0.155029,-0.371524,-1.08231,-1.082709,95,0,0,...,0,0,0,0,0,0,1,0,1,0


## The Decision Tree Model

I am trying the decision tree model first because it is a simple and straightforward classifier, and I will have the oppotunity to explore the hyperparameters and its effects.

In [50]:
# Create a Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42, max_depth = 15)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Print the MSE
print(f"Mean Squared Error: {mse:.2f}")


# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print(f"Mean Squared Error: {mse:.2f}")
print('Classification Report:')
print(report)

f_score = f1_score(y_test, y_pred)
print(f"F Score: {f_score}")


Mean Squared Error: 0.08
Accuracy: 0.92
Mean Squared Error: 0.08
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96     18055
         1.0       0.68      0.55      0.61      2174

    accuracy                           0.92     20229
   macro avg       0.81      0.76      0.78     20229
weighted avg       0.92      0.92      0.92     20229

F Score: 0.608739837398374


Aftern running the model several times, my best F score is 0.608739837398374. It is decent, but not great.

### GridsearchCV
To see if I can improve the model in anyway, I will use GridSearchCV to optimize the hyperparameters and compare F-scores to see if the resulting model does any better. Since the current traininh and testing set is huge, I am taking a subset of the dataset to complete this task.

In [51]:
#creating subset
sub_X_train = X_train [1:10000]
sub_X_test = X_test [1:10000]
sub_y_train = y_train [1:10000]
sub_y_test = y_test [1:10000]

In [52]:
#finding parameter search
params_grid = {
    "max_depth": [5, 10, 15, 20, 30, None],
    "max_features": ["sqrt", "log2", None],
    "class_weight": ["balanced", None]
}

In [53]:
#getting the optimal paramenter
gs = GridSearchCV(estimator=DecisionTreeClassifier(), 
                  param_grid=params_grid,
                  n_jobs=-1,
                  refit=True,
                  cv=10,
                  verbose=1)

In [54]:
#fitting the model from the Gridsearch, and predicting y with the test set
gs.fit(sub_X_train, sub_y_train)
y_test_pred = gs.predict(sub_X_test)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


In [55]:
#Evaluating the model
accuracy = accuracy_score(sub_y_test, y_test_pred)
report = classification_report(sub_y_test, y_test_pred)
mse = mean_squared_error(sub_y_test, y_test_pred)

# Print the MSE
print(f"Mean Squared Error: {mse:.2f}")


# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print(f"Mean Squared Error: {mse:.2f}")
print('Classification Report:')
print(report)

f_score = f1_score(y_test, y_pred)
print(f"F Score: {f_score}")

Mean Squared Error: 0.08
Accuracy: 0.92
Mean Squared Error: 0.08
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.97      0.96      8905
         1.0       0.66      0.53      0.59      1094

    accuracy                           0.92      9999
   macro avg       0.80      0.75      0.77      9999
weighted avg       0.91      0.92      0.91      9999

F Score: 0.608739837398374


The F score is about the same as the original model.

In [56]:
print("The optimal hyperparameter according to GridsearchCV")
gs.best_params_

The optimal hyperparameter according to GridsearchCV


{'class_weight': None, 'max_depth': 10, 'max_features': None}

#### classifying test.csv
Now, taking my best model, which is the Gridsearch model, I will generate my predictions for the test data.

In [57]:
pred_k_dt = gs.predict(test_co)
print(pred_k_dt)

[0. 0. 0. ... 0. 0. 0.]


In [58]:
# creating submission file for Kaggle
# df_submission = pd.DataFrame(pred_k_dt, columns=["Label"])
# df_submission.to_csv("my_predictions.csv", index=True, index_label="ID")

## The Random Forest Model
Now, I will try the Random Forest model, which is know to be versatile and can reduce overfitting data. I played around with the n estimators and max depth, which controls the complexity of the tree model.

In [59]:
rf_model = RandomForestClassifier(n_estimators=45, max_depth = 50, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
rf_y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, rf_y_pred)
report = classification_report(y_test, rf_y_pred)

# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print(f"Mean Squared Error: {mse:.2f}")
print('Classification Report:')
print(report)

mse = mean_squared_error(y_test, rf_y_pred)
print(f"Mean Squared Error: {mse}")

f_score = f1_score(y_test, rf_y_pred)
print(f"F Score: {f_score}")

Accuracy: 0.93
Mean Squared Error: 0.08
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96     18055
         1.0       0.75      0.54      0.63      2174

    accuracy                           0.93     20229
   macro avg       0.85      0.76      0.79     20229
weighted avg       0.93      0.93      0.93     20229

Mean Squared Error: 0.0689604033812843
F Score: 0.6263059201714438


From the random forest tree model, the best F-score that I got is 0.6452123450276972, which is higher than my decision tree model.

### GridsearchCV
Now, I will try GridsearchCV to optimize the hyperparamenter.

In [60]:
classifier = RandomForestClassifier()

# Define the hyperparameter grid
params_grid = {
    "max_depth": [50, 55],
    "n_estimators": [40, 45]
}

In [61]:
# Create a GridSearchCV object
gs = GridSearchCV(classifier, params_grid, cv=10)

In [62]:
# Fitting the grid search to training data
gs.fit(sub_X_train, sub_y_train)

In [63]:
# Get the best hyperparameters
best_params = gs.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
gs_model = gs.best_estimator_

Best Hyperparameters: {'max_depth': 55, 'n_estimators': 45}


In [67]:
# Fitting the GridsearcCV model with the training set
gs_model.fit(X_train, y_train)
y_test_pred = gs_model.predict(X_test)

In [68]:
#Evaluate the model
accuracy = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)

# Print the MSE
print(f"Mean Squared Error: {mse:.2f}")


# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print(f"Mean Squared Error: {mse:.2f}")
print('Classification Report:')
print(report)

f_score = f1_score(y_test, y_pred)
print(f"F Score: {f_score}")

Mean Squared Error: 0.07
Accuracy: 0.93
Mean Squared Error: 0.07
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96     18055
         1.0       0.75      0.55      0.63      2174

    accuracy                           0.93     20229
   macro avg       0.85      0.76      0.80     20229
weighted avg       0.93      0.93      0.93     20229

F Score: 0.608739837398374


The F-score of the GridsearchCV model is actually lower than my previous model, so we shall not use this model.

### Feature Engineering
In addition to GridsearchCV, I will also try selecting the features of top importance from the previous model to run a new model and see if I can better generalize future predictions.

In [64]:
#getting the feature importance data from the RF model
feature_importances = rf_model.feature_importances_
# print("Feature Importances:")
# for i, importance in enumerate(feature_importances):
#     print(f"Feature {i}: {importance}")
sorted_idx = feature_importances.argsort()[::-1]

In [70]:
N = 20
X_train_fe = X_train.iloc[:, sorted_idx[:N]]
X_test_fe = X_test.iloc[:, sorted_idx[:N]]
f_select = X_test_fe.columns
X_train_fe.head()

Index(['age', 'detailed_occupation_recode', 'dividends_from_stocks',
       'capital_gains', 'detailed_industry_recode',
       'num_persons_worked_for_employer', 'weeks_worked_in_year',
       'capital_losses', 'sex_ Male', 'sex_ Female',
       'major_occupation_code_ Executive admin and managerial',
       'education_ Masters degree(MA MS MEng MEd MSW MBA)',
       'education_ Bachelors degree(BA AB BS)',
       'education_ High school graduate',
       'major_occupation_code_ Professional specialty',
       'own_business_or_self_employed',
       'detailed_household_summary_in_household_ Householder',
       'education_ Prof school degree (MD DDS DVM LLB JD)',
       'detailed_household_and_family_stat_ Householder',
       'education_ Some college but no degree'],
      dtype='object')

In [66]:
rff_model = RandomForestClassifier(n_estimators=45, max_depth = 50, random_state=42)
rff_model.fit(X_train_fe, y_train)

# Make predictions on the test data
y_pred = rff_model.predict(X_test_fe)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

f_score = f1_score(y_test, y_pred)
print(f"F Score: {f_score}")

Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96     18055
         1.0       0.71      0.56      0.63      2174

    accuracy                           0.93     20229
   macro avg       0.83      0.77      0.79     20229
weighted avg       0.92      0.93      0.92     20229

Mean Squared Error: 0.0713332344653715
F Score: 0.6291441788743254


Selecting the top 20 features of the previous model and using only those top 20 features to generate future predictions seems to have a higher F-score. There is an advantage in this model in that it reduces overfitting, and can potentially have better generalization effects. We will use this model for future predictions as a result.

#### predicting test.csv
To get the best prediction, we will rerun the feature engineered model with the whole training data set to get the best results.

In [72]:
X_cofe = train_co[f_select]
rff_model.fit(X_cofe, y)

In [73]:
test_cofe = test_co[f_select]
test_cofe.head()

Unnamed: 0,age,detailed_occupation_recode,dividends_from_stocks,capital_gains,detailed_industry_recode,num_persons_worked_for_employer,weeks_worked_in_year,capital_losses,sex_ Male,sex_ Female,major_occupation_code_ Executive admin and managerial,education_ Masters degree(MA MS MEng MEd MSW MBA),education_ Bachelors degree(BA AB BS),education_ High school graduate,major_occupation_code_ Professional specialty,own_business_or_self_employed,detailed_household_summary_in_household_ Householder,education_ Prof school degree (MD DDS DVM LLB JD),detailed_household_and_family_stat_ Householder,education_ Some college but no degree
101143,0.084506,10,-0.368335,-0.219896,43,0.923884,0.784188,-0.151129,0,1,0,0,1,0,1,0,0,0,0,0
101144,0.109785,8,-0.368335,-0.219896,37,0.832787,0.946129,-0.151129,0,1,0,0,0,1,1,0,0,0,0,0
101145,0.413422,0,-0.368335,-0.219896,0,-1.076517,-1.076905,-0.151129,1,0,0,0,0,0,0,0,1,0,0,0
101146,0.010725,24,-0.368335,-0.219896,33,0.98136,0.946129,-0.151129,0,1,0,0,0,1,0,2,0,0,0,0
101147,0.057026,40,-0.368335,-0.219896,4,0.832787,0.888147,-0.151129,1,0,0,0,0,0,0,0,0,0,0,0


In [74]:
pred_k_rf_gs = rff_model.predict(test_cofe)
print(pred_k_rf_gs)

[0. 0. 0. ... 0. 0. 0.]


In [351]:
# Creating the submission file
# df_submission = pd.DataFrame(pred_k_rf_gs, columns=["Label"])
# df_submission.to_csv("my_predictions.csv", index=True, index_label="ID")

### The Gradient Boosting (xgboost) model
Here, I tried the Gradient Boosting model mainly because of its low cost computation and its powerful ability to handle both continous and categorical features. I played around with the objective function and tested for best n_estimators, max depth, and learning rate with grid sesarch.

In [83]:
gb_model = xgb.XGBClassifier(objective="binary:logistic", enable_categorical = True, random_state=42)

# Fit the model to the training data
gb_model.fit(X_train_n, y_train_n)

# Make predictions on the test data
gb_y_pred = gb_model.predict(X_test_n)

# Evaluate the model
accuracy = accuracy_score(y_test_n, gb_y_pred)
report = classification_report(y_test_n, gb_y_pred)

# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

mse = mean_squared_error(y_test_n, gb_y_pred)
print(f"Mean Squared Error: {mse}")

f_score = f1_score(y_test_n, gb_y_pred)
print(f"F Score: {f_score}")


  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):
  if is_sparse(data):


Accuracy: 0.94
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96     18055
         1.0       0.75      0.61      0.67      2174

    accuracy                           0.94     20229
   macro avg       0.85      0.79      0.82     20229
weighted avg       0.93      0.94      0.93     20229

Mean Squared Error: 0.06406643927035444
F Score: 0.6720647773279352


  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):


From the random forest model, I have an F score of 0.6720647773279352, which is a great deal higher than previous models.

### GridSearchCV
Optimizing the hyperparameter. Because of how long the grid search takes, I will take an even smaller subset of the numerically encoded training and testing set.

In [91]:
classifier = xgb.XGBClassifier(enable_categorical = True)

gbhs_Xtr = X_train_n[1:2500]
gbhs_ytr = y_train_n[1:2500]
gbhs_Xts = X_test_n[1:2500]
gbhs_yts = y_test_n[1:2500]
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [250,275],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}


In [92]:
# Create a GridSearchCV object
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to your data
grid_search.fit(gbhs_Xtr, gbhs_ytr)

  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):
  if is_sparse(data):
  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):
  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):
  if is_sparse(data):
  or is_sparse(dt

In [93]:
# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
best_model = grid_search.best_estimator_

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}


In [95]:
# Train your best model on the full dataset (if needed)
best_model.fit(X_train_n, y_train_n)

# Make predictions on the test data
y_pred = best_model.predict(X_test_n)

# Evaluate the model
accuracy = accuracy_score(y_test_n, y_pred)
report = classification_report(y_test_n, y_pred)

# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

mse = mean_squared_error(y_test_n, y_pred)
print(f"Mean Squared Error: {mse}")

f_score = f1_score(y_test_n, y_pred)
print(f"F Score: {f_score}")

  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):
  if is_sparse(data):


Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96     18055
         1.0       0.81      0.40      0.54      2174

    accuracy                           0.93     20229
   macro avg       0.87      0.69      0.75     20229
weighted avg       0.92      0.93      0.91     20229

Mean Squared Error: 0.07449700924415444
F Score: 0.5364503229775454


  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):


From the F score, we can tell this model is not great, most likely due to overfitting as n_estimators is set to 250.

### Feature Engineering
I will attempt to make the model better by selecting the most predictive features to improve generalizability.

In [96]:
feature_importances = gb_model.feature_importances_
# print("Feature Importances:")
# for i, importance in enumerate(feature_importances):
#     print(f"Feature {i}: {importance}")

sorted_idx = feature_importances.argsort()[::-1]

array([12,  9, 38, 16,  4, 18, 19, 17,  0, 23,  3, 22,  8, 29,  7,  1,  5,
       21, 32, 31, 27, 14, 33, 11, 35, 25, 26, 15, 24, 10, 13, 36, 34, 28,
       20,  2,  6, 30, 37, 39])

I will select the top 20 features ranked out of feature importance of the previous model to predict future labeling.

In [99]:
N = 20
X_train_fe = X_train.iloc[:, sorted_idx[:N]]
X_test_fe = X_test.iloc[:, sorted_idx[:N]]
f_select_gb = X_train_fe.columns

In [100]:
gb_model.fit(X_train_fe, y_train_n)

# Make predictions on the test data
y_pred = gb_model.predict(X_test_fe)

# Evaluate the model
accuracy = accuracy_score(y_test_n, y_pred)
report = classification_report(y_test_n, y_pred)

# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

f_score = f1_score(y_test, y_pred)
print(f"F Score: {f_score}")

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96     18055
         1.0       0.72      0.49      0.58      2174

    accuracy                           0.92     20229
   macro avg       0.83      0.73      0.77     20229
weighted avg       0.92      0.92      0.92     20229

Mean Squared Error: 0.0754856888625241
F Score: 0.5833560709413369


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


The model after selecting 20 top features does much worse than the full model when comparing F-scores. Thus, we will be using the full gradient boosting model for future predictions.

#### predicting test.csv
For the best and most accurate prediction, I will train the model again with the entire training dataset.

In [101]:
gb_model.fit(X_n,y_n)

  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):
  if is_sparse(data):


In [102]:
pred_k_gb = gb_model.predict(test_cn)
print(pred_k_gb)

[0 0 0 ... 0 0 0]


  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):


In [267]:
#Creating submission file
# df_submission = pd.DataFrame(pred_k_gb, columns=["Label"])
# df_submission.to_csv("my_predictions.csv", index=True, index_label="ID")

### Ensemble modeling of Gradient Boosting and Random Forest

While my gradient boosting model does the best, I wanted to find a way to improve my predictions. My appraoch was getting the majority vote of both the random forest model and the gradient boosting model. This would be an additional method to scaling the data and feature engineering.

In [104]:
# creating an ensemble model with a majority vote approach
ensemble_predictions = (rf_y_pred + gb_y_pred) >= 1

# Calculate the accuracy of the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)

# Print the accuracy
print(f"Ensemble Accuracy: {ensemble_accuracy:.2f}")

f_score = f1_score(y_test, ensemble_predictions)
print(f"F Score: {f_score}")

Ensemble Accuracy: 0.93
F Score: 0.6829385020339794


The resulting F-Score is an improvement from the previous f-scores. We will use this model for all future predictions.

#### predicting test.csv
We will now predict the test dataset, and submit the results to Kaggle. Since I already trained both models with the full training dataset, I will not need to do any additional work here but get the majority vote.

In [105]:
pred_k_gb = gb_model.predict(test_cn)
pred_k_rf = rf_model.predict(test_co)
ensemble_predictions = (pred_k_gb + pred_k_rf) >= 1

df_submission = pd.DataFrame(ensemble_predictions, columns=["Label"])
df_submission.to_csv("my_predictions.csv", index=True, index_label="ID")

  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(ser.dtype):


## SVM
Despite the success of my previous model, I wanted to give SVM a try, as it is known to be a powerful tool for binary classification. Here I played around with regulariztion of punishing overcomplex models, and also the C parameter.

In [106]:
svm_l1 = LinearSVC(penalty='l1', dual=False, C=2.0, random_state=42)
svm_l1.fit(sub_X_train, sub_y_train)



In [108]:
# Make predictions on the test data
y_test_pred = svm_l1.predict(sub_X_test)

# Evaluate the model
accuracy = accuracy_score(sub_y_test, y_test_pred)
report = classification_report(sub_y_test, y_test_pred)
mse = mean_squared_error(sub_y_test, y_test_pred)
print(f"Mean Squared Error: {mse}")

f_score = f1_score(sub_y_test, y_test_pred)
print(f"F Score: {f_score}")

# Print the accuracy and classification report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Mean Squared Error: 0.0748074807480748
F Score: 0.5965480043149946
Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96      8905
         1.0       0.73      0.51      0.60      1094

    accuracy                           0.93      9999
   macro avg       0.83      0.74      0.78      9999
weighted avg       0.92      0.93      0.92      9999



As we can see, the F-score is incredibily low, so we will stick to the model that we have from before.

#### predicting test.csv

In [111]:
pred_k_svm = svm_l1.predict(test_co)
print(pred_k_svm)

[0. 0. 0. ... 0. 0. 0.]


In [271]:
# submission file preparation
# df_submission = pd.DataFrame(pred_k_svm, columns=["Label"])
# df_submission.to_csv("my_predictions.csv", index=True, index_label="ID")

## Conclusion
In conclusion, after trying a series of methods out, I found that the random forest and gradience boosting models did the best for this problem. There are a lot of intricacies when it comes to scaling and encoding the data, which can impact the F-score. One of the biggest challenge with this dataset is the imbalance of the classes. 
Out of everything I tried including grid search and feature scaling, I found that log transforming the data and ensemble modeling of taking the majority vote of two models significantly boosted the effectiveness of the models. I will be implementing these methods in future projects.

A question that still lingers includes the usage of one-hot encoding versus numerical encoding. When submitting results to Kaggle, I found that numerical encoding did significantly better even for random forest, but theorically speaking random forest can only handle one-hot encodings for categorical features. I hope to find out more about it in the future.