In [4]:
# Phase 5: Advanced ML Models (Decision Tree + Random Forest)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ------------------------
# 1. Dataset
# ------------------------
experience = np.array([1,2,3,4,5,6,7,8]).reshape(-1, 1)
salary = np.array([10000,20000,30000,40000,50000,60000,70000,80000])

df = pd.DataFrame({"Experience": experience.flatten(), "Salary": salary})
print(df)

# ------------------------
# 2. Features & Target
# ------------------------
X = df[["Experience"]]
y = df["Salary"]

# ------------------------
# 3. Train/Test Split
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print("Training size:", len(X_train))
print("Testing size:", len(X_test))

# ------------------------
# 4. Linear Regression Baseline
# ------------------------
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print("Linear Regression MAE:", mae_lr)

# ------------------------
# 5. Decision Tree Regressor
# ------------------------
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
print("Decision Tree MAE:", mae_dt)

# ------------------------
# 6. Random Forest Regressor
# ------------------------
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rmse_rf = mean_squared_error(y_test, rf_pred, squared=False)
print("Random Forest RMSE:", rmse_rf)

# ------------------------
# 7. Cross-Validation
# ------------------------
scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')
print("Random Forest CV MAE:", -scores.mean())

# ------------------------
# 8. Hyperparameter Tuning with GridSearchCV
# ------------------------
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)
print("Best Hyperparameters:", grid.best_params_)

# ------------------------
# 9. Model Comparison
# ------------------------
models = {
    "Linear Regression": lr_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model
}

for name, model in models.items():
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    print(f"{name} MAE: {mae}")

# ------------------------
# 10. Visualizing Predictions
# ------------------------
plt.figure(figsize=(8,5))
plt.scatter(X_test, y_test, color='black', label='Actual')
plt.plot(X_test, y_pred_lr, color='blue', label='Linear Regression')
plt.scatter(X_test, y_pred_dt, color='red', label='Decision Tree')
plt.scatter(X_test, rf_pred, color='green', label='Random Forest')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.title('Model Predictions vs Actual')
plt.legend()
plt.show()

   Experience  Salary
0           1   10000
1           2   20000
2           3   30000
3           4   40000
4           5   50000
5           6   60000
6           7   70000
7           8   80000
Training size: 6
Testing size: 2
Linear Regression MAE: 9.094947017729282e-12
Decision Tree MAE: 10000.0


TypeError: got an unexpected keyword argument 'squared'