## **Wage Dataset**

In [2]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("DT-Wage.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())

      year  age            maritl      race        education  \
0     2006   18  1. Never Married  1. White     1. < HS Grad   
1     2004   24  1. Never Married  1. White  4. College Grad   
2     2003   45        2. Married  1. White  3. Some College   
3     2003   43        2. Married  3. Asian  4. College Grad   
4     2005   50       4. Divorced  1. White       2. HS Grad   
...    ...  ...               ...       ...              ...   
2995  2008   44        2. Married  1. White  3. Some College   
2996  2007   30        2. Married  1. White       2. HS Grad   
2997  2005   27        2. Married  2. Black     1. < HS Grad   
2998  2005   27  1. Never Married  1. White  3. Some College   
2999  2009   55      5. Separated  1. White       2. HS Grad   

                  region        jobclass          health health_ins   logwage  \
0     2. Middle Atlantic   1. Industrial       1. <=Good      2. No  4.318063   
1     2. Middle Atlantic  2. Information  2. >=Very Good      2. No  

In [3]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

print(data)

Categorical Columns: ['maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins']


      year  age   logwage        wage  maritl_2. Married  maritl_3. Widowed  \
0     2006   18  4.318063   75.043154              False              False   
1     2004   24  4.255273   70.476020              False              False   
2     2003   45  4.875061  130.982177               True              False   
3     2003   43  5.041393  154.685293               True              False   
4     2005   50  4.318063   75.043154              False              False   
...    ...  ...       ...         ...                ...                ...   
2995  2008   44  5.041393  154.685293               True              False   
2996  2007   30  4.602060   99.689464               True              False   
2997  2005   27  4.193125   66.229408               True              False   
2998  2005   27  4.477121   87.981033              False              False   
2999  2009   55  4.505150   90

In [4]:
#selecting target variable and features
X = data.drop(['wage'], axis = 1)
y = data['wage']

#splitting data for training, validating and testing
X_train = X.iloc[:2100]
X_validation = X.iloc[2100:2550]
X_test = X.iloc[2550:]
print(X_validation)
y_train = y.iloc[:2100]
y_validation = y.iloc[2100:2550]
y_test = y.iloc[2550:]

      year  age   logwage  maritl_2. Married  maritl_3. Widowed  \
2100  2008   43  4.477121              False              False   
2101  2008   42  5.176091               True              False   
2102  2005   51  5.243038               True              False   
2103  2007   49  4.380211              False              False   
2104  2006   48  4.711200               True              False   
...    ...  ...       ...                ...                ...   
2545  2008   23  4.591087              False              False   
2546  2006   46  4.973128              False              False   
2547  2005   61  4.913814               True              False   
2548  2007   70  4.612784               True              False   
2549  2009   59  4.342423               True              False   

      maritl_4. Divorced  maritl_5. Separated  race_2. Black  race_3. Asian  \
2100               False                False          False          False   
2101               False             

### **Evaluation Metrics**

In [5]:
def mean_sq_error(y_true, y_pred):
  if len(y_true) != len(y_pred):
    raise ValueError("Lengths of y_true and y_pred must be the same.")

  # Calculate the squared differences between true and predicted values
  squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]

  # Calculate the mean of squared errors
  mse = sum(squared_errors) / len(y_true)

  return mse

### **Decision Tree**

In [6]:
#Training the dataset
from sklearn.tree import DecisionTreeRegressor

#Define the Decision Tree Regressor
dt_regressor = DecisionTreeRegressor()

dt = dt_regressor.fit(X_train, y_train)

In [7]:
#Validating the model using validation set
y_pred = dt.predict(X_validation)

mse = mean_sq_error(y_validation, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  0.5606809610924576


In [8]:
#Validating
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

# Define the hyperparameter grid for grid search
param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Initialize GridSearchCV with cross-validation (e.g., 5-fold)
grid_search = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, scoring=scoring, cv=5)

# Fit GridSearchCV on the combined training and validation sets
X_train_new = X.iloc[:2550]
y_train_new = y.iloc[:2550]
grid_search.fit(X_train_new, y_train_new)

# Get the best model on the test set
best_dt_model = grid_search.best_estimator_

print(best_dt_model)

DecisionTreeRegressor()


In [9]:
#Testing best model using test set
y_pred = best_dt_model.predict(X_test)

mse = mean_sq_error(y_test, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  0.7067425794707285


## **XGB Regressor**

In [10]:
#Training the dataset
import xgboost as xgb

#Define the XGBRegressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

xgbr = xgb_regressor.fit(X_train, y_train)

In [11]:
#Validating the model using validation set
y_pred = xgbr.predict(X_validation)

mse = mean_sq_error(y_validation, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  5.076167179387068


In [12]:
#Validating
#Define the hyperparameter grid for grid searching
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=3,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train_new, y_train_new)

# Get the best XGBClassifier model from GridSearchCV
best_xgb_model = grid_search.best_estimator_

print(best_xgb_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)


In [13]:
#Testing best model using test set
y_pred = best_xgb_model.predict(X_test)

mse = mean_sq_error(y_test, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  0.9171398063784151
