### dependencies

In [1]:
import pandas as pd
import numpy as np
import os

### locating the working directory

In [2]:
# Record the directory the kernel is running in; later cells derive the
# project paths from this value.
cwd = os.getcwd()
print('the current directory is: ' + cwd)

the current directory is: /Users/santiagocardenas/Documents/MDSI/202502/advanced_machine_learning/labs/20250731_amlaa_lab1/notebooks


### loading the dataset

In [9]:
# Project root = parent of the current working directory (the captured output
# above shows the kernel running from the notebooks/ folder).
# os.path.dirname handles the platform's path separator and never collapses a
# top-level directory to an empty string, unlike splitting/joining on '/'.
root = os.path.dirname(cwd)

# Raw input lives under <root>/data/raw/.
file_path = os.path.join(root, 'data', 'raw', 'wfh.csv')
df = pd.read_csv(file_path)

# Quick structural overview: row count, dtypes and null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          50000 non-null  int64  
 1   distance_from_office        50000 non-null  float64
 2   salary_range                50000 non-null  object 
 3   gas_price_per_litre         50000 non-null  float64
 4   public_transportation_cost  50000 non-null  float64
 5   wfh_prev_workday            50000 non-null  bool   
 6   workday                     50000 non-null  object 
 7   tenure                      50000 non-null  float64
 8   work_home_actual            50000 non-null  int64  
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 3.1+ MB


In [10]:
# Peek at the first five rows to eyeball the raw values.
df.head(n=5)

Unnamed: 0,id,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,work_home_actual
0,0,5.962247,40K - 60K,2.119485,8.568058,False,Friday,0.212653,1
1,1,0.535872,40K - 60K,2.357199,5.425382,True,Tuesday,4.927549,0
2,2,1.969519,40K - 60K,2.366849,8.247158,False,Monday,0.520817,1
3,3,2.53041,20K - 40K,2.318722,7.944251,False,Tuesday,0.453649,1
4,4,2.253635,60K+,2.221265,8.884478,True,Thursday,5.695263,1


In [12]:
# Split the label from the features WITHOUT mutating the raw frame.
# The previous `df.pop(...)` removed the column from `df` in place, so running
# this cell a second time raised KeyError and `df` no longer matched the
# info()/head() output shown above. Selecting + dropping is idempotent.
# Note: `df` therefore keeps the 'work_home_actual' column; only `df_clean`
# (the feature frame used downstream) has it removed.
target = df['work_home_actual'].copy()
df_clean = df.drop(columns='work_home_actual')

### inspecting categorical features

In [14]:
# Columns whose dtype is not numeric -- the candidates for encoding.
non_numeric_cols = df_clean.select_dtypes(exclude=['number']).columns
non_numeric_cols

Index(['salary_range', 'wfh_prev_workday', 'workday'], dtype='object')

In [15]:
# How many rows fall into each salary band (most frequent first).
salary_counts = df_clean.loc[:, 'salary_range'].value_counts()
salary_counts

0 - 20K      19918
20K - 40K    15047
40K - 60K     9977
60K+          5058
Name: salary_range, dtype: int64

In [16]:
# How many rows fall on each workday (most frequent first).
# Bracket indexing is used, as in the other cells, so the lookup can never be
# shadowed by a DataFrame attribute of the same name.
df_clean['workday'].value_counts()

Wednesday    10095
Monday       10025
Friday       10020
Tuesday       9946
Thursday      9914
Name: workday, dtype: int64

### encoding categorical features

In [17]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit level order so the integer codes follow the natural ordering
# (lowest salary band -> 0.0, Monday -> 0.0) instead of alphabetical order.
salary_levels = ['0 - 20K', '20K - 40K', '40K - 60K', '60K+']
workday_levels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
ordinal_cols = ['salary_range', 'workday']

ord_encoder = OrdinalEncoder(categories=[salary_levels, workday_levels])

# Encode from the raw frame `df` rather than from `df_clean` itself.
# The earlier version overwrote its own input, so a second run of this cell
# tried to encode 0.0/1.0/... and failed with "unknown categories".
# Reading from `df` (whose string columns are never modified) makes the cell
# safe to re-run; the first-run result is unchanged.
# Assumes df_clean still has the same rows, in the same order, as df
# (it is created as a copy of df earlier in the notebook).
df_clean[ordinal_cols] = ord_encoder.fit_transform(df[ordinal_cols])
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          50000 non-null  int64  
 1   distance_from_office        50000 non-null  float64
 2   salary_range                50000 non-null  float64
 3   gas_price_per_litre         50000 non-null  float64
 4   public_transportation_cost  50000 non-null  float64
 5   wfh_prev_workday            50000 non-null  bool   
 6   workday                     50000 non-null  float64
 7   tenure                      50000 non-null  float64
dtypes: bool(1), float64(6), int64(1)
memory usage: 2.7 MB


In [18]:
# Sanity check after encoding: the per-code counts should mirror the
# per-label counts seen before encoding.
encoded_salary_counts = df_clean.loc[:, 'salary_range'].value_counts()
encoded_salary_counts

0.0    19918
1.0    15047
2.0     9977
3.0     5058
Name: salary_range, dtype: int64

In [19]:
# Same sanity check for the workday codes (0.0 = Monday ... 4.0 = Friday,
# per the category order given to the encoder).
encoded_workday_counts = df_clean.loc[:, 'workday'].value_counts()
encoded_workday_counts

2.0    10095
0.0    10025
4.0    10020
1.0     9946
3.0     9914
Name: workday, dtype: int64

### dropping id column

In [20]:
# Remove the `id` column so it is not used as a feature (in the preview above
# it simply mirrors the row index -- presumably a plain row identifier).
# - `columns=` alone is sufficient; the extra `axis=1` was redundant.
# - Reassigning instead of `inplace=True`, together with errors='ignore',
#   lets this cell be re-run without raising KeyError once `id` is gone.
df_clean = df_clean.drop(columns='id', errors='ignore')
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   distance_from_office        50000 non-null  float64
 1   salary_range                50000 non-null  float64
 2   gas_price_per_litre         50000 non-null  float64
 3   public_transportation_cost  50000 non-null  float64
 4   wfh_prev_workday            50000 non-null  bool   
 5   workday                     50000 non-null  float64
 6   tenure                      50000 non-null  float64
dtypes: bool(1), float64(6)
memory usage: 2.3 MB


### split the dataset

In [22]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split with a fixed seed:
#   1) hold out 20% of all rows as the test set,
#   2) hold out 20% of the remainder as the validation set.
# Stratifying on the label keeps the class balance the same in every split.
split_options = dict(test_size=0.2, random_state=8)

x_full, x_test, y_full, y_test = train_test_split(
    df_clean, target, stratify=target, **split_options
)
x_train, x_val, y_train, y_val = train_test_split(
    x_full, y_full, stratify=y_full, **split_options
)

print(f"training features shape: {x_train.shape} and target shape is: {y_train.shape}")
print(f"validation features shape is: {x_val.shape} and target shap is: {y_val.shape}")
print(f"test features shape is: {x_test.shape} and target is: {y_test.shape}")

training features shape: (32000, 7) and target shape is: (32000,)
validation features shape is: (8000, 7) and target shap is: (8000,)
test features shape is: (10000, 7) and target is: (10000,)


### saving the splits into the processed folder

In [30]:
processed_folder_path = os.path.join(root, 'data', 'processed')

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists (no-op when it is already there).
os.makedirs(processed_folder_path, exist_ok=True)

# One CSV per split, named after the variable it came from; the index is
# omitted so the files contain only feature/target columns.
splits_to_save = {
    'x_train': x_train,
    'x_val': x_val,
    'x_test': x_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(os.path.join(processed_folder_path, f"{split_name}.csv"), index=False)

### generate a baseline model

In [31]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features and always predicts the most frequent
# class seen in the training labels; any real model should beat it.
base_clf = DummyClassifier(strategy='most_frequent')
base_clf.fit(X=x_train, y=y_train)

In [35]:
from sklearn.metrics import roc_auc_score

# Per-class probability estimates of the baseline on the training rows
# (one column per class); show the first five rows only.
baseline_prob = base_clf.predict_proba(x_train)
baseline_prob[:5, :]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [36]:
# AUROC of the baseline, scored on the second probability column
# (the same column used for the random-forest scores further down).
baseline_auc = roc_auc_score(y_train, baseline_prob[:, 1])
print(f"the auroc score on training is: {baseline_auc}")

the auroc score on training is: 0.5


### random forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters; only the seed is fixed
# so the fit is reproducible.
rf = RandomForestClassifier(random_state=8)
rf.fit(X=x_train, y=y_train)

In [39]:
# Class probabilities from the default forest on the training and validation rows.
train_probs_rf = rf.predict_proba(x_train)
val_probs_rf = rf.predict_proba(x_val)

# AUROC on the second probability column; comparing training vs validation
# shows how much the model overfits.
train_auc_rf = roc_auc_score(y_train, train_probs_rf[:, 1])
val_auc_rf = roc_auc_score(y_val, val_probs_rf[:, 1])
print(f"the aucroc score on training is: {train_auc_rf}")
print(f"the aucroc score on validation is: {val_auc_rf}")

the aucroc score on training is: 1.0
the aucroc score on validation is: 0.9728576722324407


### tuned rf

In [41]:
# Regularised forest: shallower trees (max_depth) and larger leaves
# (min_samples_leaf) than the defaults; same seed as the first forest.
rf2_params = {'random_state': 8, 'max_depth': 6, 'min_samples_leaf': 50}
rf2 = RandomForestClassifier(**rf2_params)
rf2.fit(X=x_train, y=y_train)

In [42]:
# Class probabilities from the regularised forest on the training and validation rows.
train_probs_rf2 = rf2.predict_proba(x_train)
val_probs_rf2 = rf2.predict_proba(x_val)

# AUROC on the second probability column for both splits.
train_auc_rf2 = roc_auc_score(y_train, train_probs_rf2[:, 1])
val_auc_rf2 = roc_auc_score(y_val, val_probs_rf2[:, 1])
print(f"the aucroc score on training is: {train_auc_rf2}")
print(f"the aucroc score on validation is: {val_auc_rf2}")

the aucroc score on training is: 0.9639261149590567
the aucroc score on validation is: 0.9639032609455842
