In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/nsw74psid1.csv")
df.sample(3)

Unnamed: 0,trt,age,educ,black,hisp,marr,nodeg,re74,re75,re78
1723,0,37,17,0,0,1,0,82289.206,85040.323,110829.49
222,0,20,13,0,1,1,0,6073.7271,10025.806,27337.942
2099,0,21,10,0,0,1,1,13714.868,14080.887,7249.7266


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("re78", axis=1), df["re78"], test_size=0.2, random_state=42
)

In [5]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
models = {
    "Decision Tree": DecisionTreeRegressor(max_depth=7, random_state=42),
    "Linear Regression": LinearRegression(),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('svr', SVR(kernel='rbf', C=1.0, epsilon=0.1))
    ])
}

In [7]:
from tqdm import tqdm

In [8]:
%%time

results = {}

for name, model in tqdm(models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results[name] = {
        "MSE": mean_squared_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred)
    }

100%|█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 13.50it/s]

CPU times: user 318 ms, sys: 8.06 ms, total: 326 ms
Wall time: 225 ms





In [9]:
results_df = pd.DataFrame(results).T
results

{'Decision Tree': {'MSE': 135685193.10603836, 'R2': 0.4480341482267267},
 'Linear Regression': {'MSE': 96715423.73420379, 'R2': 0.6065627352621785},
 'SVM': {'MSE': 246520033.0594677, 'R2': -0.0028407441665720956}}