In [None]:
# Swapnil Saha Shawon (2022533042)
# Tamanna Rahman (2021450642)
# Syeda Mashiat Tabassum (2031356642)

## **Wage Dataset**

In [None]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("DT-Wage.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())

      year  age            maritl      race        education  \
0     2006   18  1. Never Married  1. White     1. < HS Grad   
1     2004   24  1. Never Married  1. White  4. College Grad   
2     2003   45        2. Married  1. White  3. Some College   
3     2003   43        2. Married  3. Asian  4. College Grad   
4     2005   50       4. Divorced  1. White       2. HS Grad   
...    ...  ...               ...       ...              ...   
2995  2008   44        2. Married  1. White  3. Some College   
2996  2007   30        2. Married  1. White       2. HS Grad   
2997  2005   27        2. Married  2. Black     1. < HS Grad   
2998  2005   27  1. Never Married  1. White  3. Some College   
2999  2009   55      5. Separated  1. White       2. HS Grad   

                  region        jobclass          health health_ins   logwage  \
0     2. Middle Atlantic   1. Industrial       1. <=Good      2. No  4.318063   
1     2. Middle Atlantic  2. Information  2. >=Very Good      2. No  

In [None]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

print(data)

Categorical Columns: ['maritl', 'race', 'education', 'region', 'jobclass', 'health', 'health_ins']


      year  age   logwage        wage  maritl_2. Married  maritl_3. Widowed  \
0     2006   18  4.318063   75.043154              False              False   
1     2004   24  4.255273   70.476020              False              False   
2     2003   45  4.875061  130.982177               True              False   
3     2003   43  5.041393  154.685293               True              False   
4     2005   50  4.318063   75.043154              False              False   
...    ...  ...       ...         ...                ...                ...   
2995  2008   44  5.041393  154.685293               True              False   
2996  2007   30  4.602060   99.689464               True              False   
2997  2005   27  4.193125   66.229408               True              False   
2998  2005   27  4.477121   87.981033              False              False   
2999  2009   55  4.505150   90

In [None]:
#selecting target variable and features
X = data.drop(['wage'], axis = 1)
y = data['wage']

#splitting data for training, validating and testing
X_train = X.iloc[:2100]
X_validation = X.iloc[2100:2550]
X_test = X.iloc[2550:]
print(X_validation)
y_train = y.iloc[:2100]
y_validation = y.iloc[2100:2550]
y_test = y.iloc[2550:]

      year  age   logwage  maritl_2. Married  maritl_3. Widowed  \
2100  2008   43  4.477121              False              False   
2101  2008   42  5.176091               True              False   
2102  2005   51  5.243038               True              False   
2103  2007   49  4.380211              False              False   
2104  2006   48  4.711200               True              False   
...    ...  ...       ...                ...                ...   
2545  2008   23  4.591087              False              False   
2546  2006   46  4.973128              False              False   
2547  2005   61  4.913814               True              False   
2548  2007   70  4.612784               True              False   
2549  2009   59  4.342423               True              False   

      maritl_4. Divorced  maritl_5. Separated  race_2. Black  race_3. Asian  \
2100               False                False          False          False   
2101               False             

### **Evaluation Metrics**

In [None]:
def mean_sq_error(y_true, y_pred):
  if len(y_true) != len(y_pred):
    raise ValueError("Lengths of y_true and y_pred must be the same.")

  # Calculate the squared differences between true and predicted values
  squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]

  # Calculate the mean of squared errors
  mse = sum(squared_errors) / len(y_true)

  return mse

### **Support vector regression (SVR)**

In [None]:
#Training the dataset
from sklearn.svm import SVR

#Define the Support Vector Regressors
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', C=100, epsilon=.1)
svr_poly = SVR(kernel='poly', C=100, degree=3, epsilon=.1, coef0=1)

svr_rbf_trained = svr_rbf.fit(X_train, y_train)
svr_lin_trained = svr_lin.fit(X_train, y_train)
svr_poly_trained = svr_poly.fit(X_train, y_train)



In [None]:
#Validating the model using training set
y_rbf_pred = svr_rbf_trained.predict(X_train)
y_lin_pred = svr_lin_trained.predict(X_train)
y_poly_pred = svr_poly_trained.predict(X_train)

mse = mean_sq_error(y_train, y_rbf_pred)
print('Rbf SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_train, y_lin_pred)
print('Lin SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_train, y_poly_pred)
print('Poly SVR Mean Squared Error: ', mse)

Rbf SVR Mean Squared Error:  72.97953401654938
Lin SVR Mean Squared Error:  1076243.1517674015
Poly SVR Mean Squared Error:  1658.44398310714


In [None]:
#Validating the model using validation set
y_rbf_pred = svr_rbf_trained.predict(X_validation)
y_lin_pred = svr_lin_trained.predict(X_validation)
y_poly_pred = svr_poly_trained.predict(X_validation)

mse = mean_sq_error(y_validation, y_rbf_pred)
print('Rbf SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_validation, y_lin_pred)
print('Lin SVR Mean Squared Error: ', mse)
mse = mean_sq_error(y_validation, y_poly_pred)
print('Poly SVR Mean Squared Error: ', mse)

Rbf SVR Mean Squared Error:  186.77483558358375
Lin SVR Mean Squared Error:  1020920.0379212433
Poly SVR Mean Squared Error:  1955.73528808665


In [None]:
#Using the best model to evaluate the test set
y_rbf_pred = svr_rbf_trained.predict(X_test)

mse = mean_sq_error(y_test, y_rbf_pred)
print('SVR Mean Squared Error: ', mse)

SVR Mean Squared Error:  174.27242421912928
