In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# === Load Data ===
# Single place to repoint the notebook when the dataset is mounted elsewhere.
DATA_DIR = '/kaggle/input/home-data-for-ml-course'
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

# Features are every column except the target; the target is SalePrice.
X = train_data.drop("SalePrice", axis=1)
y = train_data["SalePrice"]

# === Feature Separation ===
# Text columns with fewer than 10 distinct values are kept as categoricals;
# text columns with 10 or more distinct values are left out of the feature set.
# The dtype helpers accept both the classic ``object`` dtype and pandas' dedicated
# string dtypes, so the list does not come back empty when strings are not
# stored as ``object``.
categorical_cols = [
    c for c in X.columns
    if (pd.api.types.is_object_dtype(X[c]) or pd.api.types.is_string_dtype(X[c]))
    and X[c].nunique() < 10
]
# Every integer/float width counts as numeric, not only int64/float64.
# NOTE(review): this keeps *all* numeric columns of X as features; if the CSV
# carries a row-identifier column it is fed to the models too - confirm intended.
numerical_cols = X.select_dtypes(include="number").columns.tolist()

# === Preprocessor ===
# Numeric columns: fill missing values with the column mean.
# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' is set on the encoder).
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), numerical_cols),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols)
])

# === Define Models ===
# All three regressors use random_state=42 so repeated runs are comparable.
models = {
    "DecisionTree": DecisionTreeRegressor(random_state=42, max_depth=12, min_samples_leaf=3),
    "RandomForest": RandomForestRegressor(random_state=42, n_estimators=200, max_depth=12, min_samples_leaf=3, n_jobs=-1),
    "XGBoost": XGBRegressor(random_state=42, n_estimators=300, learning_rate=0.05, max_depth=6, n_jobs=-1)
}

# === Helper Function ===
def precision_average(y_true, y_pred, tolerance=0.1):
    """Percentage of predictions whose relative error is within ``tolerance``.

    A prediction counts as a hit when ``|y_true - y_pred| <= tolerance * |y_true|``
    (with the default ``tolerance=0.1`` that is within ±10% of the true value).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.
    tolerance : float, default 0.1
        Maximum allowed relative error, expressed as a fraction.

    Returns
    -------
    float
        Share of hits in the range 0-100.
    """
    # Work on plain arrays so two pandas objects are compared by position
    # instead of being aligned on (possibly different) index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Division-free form of |(y_true - y_pred) / y_true| <= tolerance: identical
    # for non-zero targets, and a zero target is a hit only when predicted exactly
    # (the division would give NaN/inf there and always count as a miss).
    within = np.abs(y_true - y_pred) <= tolerance * np.abs(y_true)
    return np.mean(within) * 100

# === Perform K-Fold from 1 to 5 ===
def _iter_splits(X, y, k):
    """Yield ``(X_train, X_val, y_train, y_val)`` for every split belonging to ``k``.

    ``k == 1`` is treated as a single 80/20 hold-out split; ``k >= 2`` uses a
    shuffled ``KFold`` with ``k`` folds. Both use ``random_state=42``.
    """
    if k == 1:
        # train_test_split returns [X_train, X_val, y_train, y_val] in this order.
        yield tuple(train_test_split(X, y, test_size=0.2, random_state=42))
        return
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    for train_idx, val_idx in kf.split(X):
        yield X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]


def _score_split(model, X_train, X_val, y_train, y_val):
    """Fit preprocessing + ``model`` on the training part and score the validation part.

    Returns a dict with the four per-split metrics (MAE, MSE, R2, Precision).
    """
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    return {
        "MAE": mean_absolute_error(y_val, preds),
        "MSE": mean_squared_error(y_val, preds),
        "R2": r2_score(y_val, preds),
        "Precision": precision_average(y_val, preds),
    }


# Maps "K=<k>" to a DataFrame with one row of averaged metrics per model.
final_results = {}

for k in range(1, 6):
    print(f"\n================= K = {k} =================")
    results = []

    for model_name, model in models.items():
        # One metrics dict per split (a single one when k == 1).
        split_scores = [_score_split(model, *split) for split in _iter_splits(X, y, k)]

        results.append({
            "Model": model_name,
            "MAE": np.mean([s["MAE"] for s in split_scores]),
            "MSE": np.mean([s["MSE"] for s in split_scores]),
            # NOTE: this column is the mean R^2 score scaled to percent, not a
            # classification accuracy; the key is kept because later cells read it.
            "Accuracy (%)": np.mean([s["R2"] for s in split_scores]) * 100,
            "Precision Average (%)": np.mean([s["Precision"] for s in split_scores])
        })

    # Store and display results
    results_df = pd.DataFrame(results)
    results_df.index = range(1, len(results_df) + 1)
    final_results[f"K={k}"] = results_df
    display(results_df)


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/home-data-for-ml-course/train.csv'

In [None]:
# Show the dict of per-K result tables built by the evaluation loop.
final_results

{'K=1':           Model           MAE           MSE  Accuracy (%)  \
 1  DecisionTree  27699.118143  1.707908e+09     77.733571   
 2  RandomForest  17973.072307  8.943397e+08     88.340265   
 3       XGBoost  16231.979318  6.621587e+08     91.367269   
 
    Precision Average (%)  
 1              48.630137  
 2              66.438356  
 3              68.835616  ,
 'K=2':           Model           MAE           MSE  Accuracy (%)  \
 1  DecisionTree  25931.517842  1.662225e+09     73.264098   
 2  RandomForest  18177.613085  9.612297e+08     84.706035   
 3       XGBoost  18337.565550  1.022489e+09     83.908632   
 
    Precision Average (%)  
 1              49.794521  
 2              64.315068  
 3              64.589041  ,
 'K=3':           Model           MAE           MSE  Accuracy (%)  \
 1  DecisionTree  27250.235385  1.961498e+09     69.363857   
 2  RandomForest  18432.602072  1.046659e+09     83.512623   
 3       XGBoost  17632.317309  1.063124e+09     83.288718   
 
   

In [None]:
import pandas as pd
import numpy as np

# Stack the per-K tables into one tidy frame with an explicit K column.
# K values and model rows are taken from `final_results` itself (keys look like
# "K=3"), so this cell follows whatever the evaluation cell produced instead of
# repeating a hard-coded K range and model list.
final_result = pd.concat(
    [
        table.assign(K=int(label.split("=", 1)[1]))
        for label, table in final_results.items()
    ],
    ignore_index=True,
)[['K', 'Model', 'MAE', 'MSE', 'Accuracy (%)', 'Precision Average (%)']]

# Display in your desired format
# sort=False keeps the K groups in the order they were inserted above.
for k, k_data in final_result.groupby('K', sort=False):
    print(f"k={k}")

    # Print header
    print("model\tmae\tmse\tacc\tPrecision average")

    # Print each model's results
    for row in k_data.to_dict('records'):
        print(
            f"{row['Model']}\t{row['MAE']:.2f}\t{row['MSE']:.2e}"
            f"\t{row['Accuracy (%)']:.2f}\t{row['Precision Average (%)']:.2f}"
        )

    print("\n" + "="*50 + "\n")

k=1
model	mae	mse	acc	Precision average
DecisionTree	27699.12	1.71e+09	77.73	48.63
RandomForest	17973.07	8.94e+08	88.34	66.44
XGBoost	16231.98	6.62e+08	91.37	68.84


k=2
model	mae	mse	acc	Precision average
DecisionTree	25931.52	1.66e+09	73.26	49.79
RandomForest	18177.61	9.61e+08	84.71	64.32
XGBoost	18337.57	1.02e+09	83.91	64.59


k=3
model	mae	mse	acc	Precision average
DecisionTree	27250.24	1.96e+09	69.36	48.15
RandomForest	18432.60	1.05e+09	83.51	64.32
XGBoost	17632.32	1.06e+09	83.29	66.71


k=4
model	mae	mse	acc	Precision average
DecisionTree	25581.53	1.60e+09	74.92	50.62
RandomForest	17836.99	9.01e+08	85.72	66.51
XGBoost	16321.86	7.08e+08	88.79	69.04


k=5
model	mae	mse	acc	Precision average
DecisionTree	26311.91	1.75e+09	71.94	49.66
RandomForest	18032.90	9.78e+08	83.98	66.23
XGBoost	16953.84	9.76e+08	83.87	68.56




In [None]:
# Write one section per K to the output file: a "k=<k>" line, a header line,
# one comma-separated line per model, then a separator line made of commas.
with open("/kaggle/working/observation.csv", "w") as out_file:
    for k in range(1, 6):
        out_file.write(f"k={k}\n")
        out_file.write("model,mae,mse,acc,Precision average\n")

        rows_for_k = final_result[final_result['K'] == k]
        for model in ['DecisionTree', 'RandomForest', 'XGBoost']:
            match = rows_for_k[rows_for_k['Model'] == model]
            if len(match) == 0:
                # No row for this model at this K: nothing to write.
                continue

            mae = match['MAE'].values[0]
            mse = match['MSE'].values[0]
            acc = match['Accuracy (%)'].values[0]
            precision = match['Precision Average (%)'].values[0]

            line = f"{model},{mae:.2f},{mse:.2e},{acc:.2f},{precision:.2f}\n"
            out_file.write(line)

        out_file.write("\n" + ","*5 + "\n\n")

print("File saved as /kaggle/working/observation.csv")

File saved as /kaggle/working/observation.csv
