# House Prices - Advanced Regression Techniques

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import os
from sklearn.ensemble import GradientBoostingRegressor
import joblib

In [17]:
# Ensure the output folders for fitted artifacts and generated files exist
for output_dir in ('models', 'data/processed'):
    os.makedirs(output_dir, exist_ok=True)


--

In [18]:
# 1. Data Loading and Initial Splitting
def load_and_split_data(
    train_path="../data/train.csv",
    test_path="../data/test.csv",
    test_size=0.2,
    random_state=42,
):
    """Load the train/test CSV files and hold out part of the labelled rows.

    Args:
        train_path: Path of the labelled CSV. It must contain the feature
            columns listed below and the ``SalePrice`` column.
        test_path: Path of the unlabelled CSV; it is returned unmodified.
        test_size: Fraction (or row count) of the labelled data held out
            for validation, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test, test_df,
        continuous_features, categorical_features, target)``. Note that
        ``X_test``/``y_test`` are the hold-out split of ``train_path``,
        whereas ``test_df`` is the full content of ``test_path``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Columns used by the model
    continuous_features = ["LotArea", "GrLivArea"]
    categorical_features = ["MSZoning", "HouseStyle"]
    target = "SalePrice"

    # Separate the model inputs from the value being predicted
    X = train_df[continuous_features + categorical_features]
    y = train_df[target]

    # Hold out a validation split from the labelled data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, test_df, continuous_features, categorical_features, target


--

In [19]:
# 2. Preprocessing Functions
def fit_preprocessors(X_train, continuous_features, categorical_features):
    """Fit the scaler and the one-hot encoder on the training data only.

    Args:
        X_train: Training DataFrame containing every listed feature column.
        continuous_features: Names of the numeric columns given to the
            ``StandardScaler``.
        categorical_features: Names of the columns given to the
            ``OneHotEncoder``.

    Returns:
        The tuple ``(scaler, encoder)`` of fitted preprocessors.
    """
    # Fit scaler for continuous features
    scaler = StandardScaler()
    scaler.fit(X_train[continuous_features])

    # Fit encoder for categorical features.
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword, so try the new spelling first and fall back for older
    # installations (an unknown keyword raises TypeError at construction).
    encoder_options = {"drop": "first", "handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **encoder_options)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **encoder_options)
    encoder.fit(X_train[categorical_features])

    return scaler, encoder


--

In [20]:
def transform_data(X, scaler, encoder, continuous_features, categorical_features):
    """Apply already-fitted preprocessors and return a labelled DataFrame.

    Args:
        X: DataFrame containing at least the listed feature columns.
        scaler: Fitted transformer exposing ``transform`` for the
            continuous columns.
        encoder: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out`` for the categorical columns; its
            ``transform`` must return a dense 2-D array.
        continuous_features: Names of the continuous columns.
        categorical_features: Names of the categorical columns.

    Returns:
        DataFrame whose first columns are ``<name>_scaled`` for each
        continuous feature, followed by the encoder's output columns. The
        row index of ``X`` is preserved so the result stays aligned with
        the corresponding target values and source rows.
    """
    # Transform continuous features
    X_cont = scaler.transform(X[continuous_features])

    # Transform categorical features
    X_cat = encoder.transform(X[categorical_features])

    # Combine features side by side: continuous first, then encoded
    X_processed = np.hstack((X_cont, X_cat))

    # Create column names in the same order as the stacked array
    continuous_cols = [f"{feat}_scaled" for feat in continuous_features]
    categorical_cols = encoder.get_feature_names_out(categorical_features)
    all_columns = continuous_cols + list(categorical_cols)

    # Keep the caller's index: after a shuffled split the rows carry their
    # original labels, and a fresh 0..n-1 index would misalign the frame
    # with ``y`` in any index-based operation (concat, join, arithmetic).
    X_processed_df = pd.DataFrame(X_processed, columns=all_columns, index=X.index)

    return X_processed_df

--

In [21]:
# 3. Model Building Section
def train_model(X_train_processed, y_train, *, n_estimators=1, random_state=None):
    """Fit a gradient boosting regressor on the processed training data.

    Args:
        X_train_processed: Processed feature matrix.
        y_train: Target values matching the rows of ``X_train_processed``.
        n_estimators: Number of boosting stages. The default of 1 keeps the
            notebook's original behaviour.
            NOTE(review): a single stage is a very weak model; pass a
            larger value (the library default is 100) for real use.
        random_state: Optional seed forwarded to the estimator for
            reproducible fits; ``None`` keeps the original unseeded
            behaviour.

    Returns:
        The fitted ``GradientBoostingRegressor``.
    """
    # warm_start=True is kept from the original; presumably so that more
    # stages can be added by a later fit() call -- confirm with the author.
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        warm_start=True,
        random_state=random_state,
    )
    model.fit(X_train_processed, y_train)
    return model

def evaluate_model(model, X_test_processed, y_test):
    """Return the root mean squared logarithmic error of ``model``.

    Predictions below zero are raised to zero before scoring, because the
    logarithmic error is only defined for non-negative values.
    """
    raw_predictions = model.predict(X_test_processed)
    non_negative_predictions = np.maximum(raw_predictions, 0)
    msle = mean_squared_log_error(y_test, non_negative_predictions)
    return np.sqrt(msle)

In [22]:
# 4. Model Inference Section
def predict_new_data(model, X_new_processed):
    """Return the model's predictions for already-processed features."""
    return model.predict(X_new_processed)


In [23]:
# Main execution
if __name__ == "__main__":
    # Step 1: read the CSVs and carve out the hold-out split
    (
        X_train,
        X_test,
        y_train,
        y_test,
        test_df,
        continuous_features,
        categorical_features,
        target,
    ) = load_and_split_data()

    # Step 2: fit the preprocessors on the training split only, then
    # persist them so inference can reuse exactly the same transforms
    scaler, encoder = fit_preprocessors(
        X_train, continuous_features, categorical_features
    )
    joblib.dump(scaler, 'models/scaler.joblib')
    joblib.dump(encoder, 'models/encoder.joblib')

    # Step 3: preprocess the training split
    X_train_processed = transform_data(
        X_train, scaler, encoder, continuous_features, categorical_features
    )

    # Step 4: fit the regressor and persist it
    model = train_model(X_train_processed, y_train)
    joblib.dump(model, 'models/model.joblib')

    # Step 5: preprocess the hold-out split with the fitted preprocessors
    X_test_processed = transform_data(
        X_test, scaler, encoder, continuous_features, categorical_features
    )

    # Step 6: score the model on the hold-out split
    rmsle = evaluate_model(model, X_test_processed, y_test)
    print(f"Model RMSLE on test set: {rmsle:.4f}")

    # Step 7: inference on the unlabelled rows loaded from test.csv
    X_new_processed = transform_data(
        test_df, scaler, encoder, continuous_features, categorical_features
    )
    predictions = predict_new_data(model, X_new_processed)

    # Pair each Id with its predicted price and write the submission file
    submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
    submission.to_csv('data/processed/submission.csv', index=False)

    print("Submission file has been created at 'data/processed/submission.csv'")

Model RMSLE on test set: 0.4233
Submission file has been created at 'data/processed/submission.csv'




--

In [24]:
# Ensure the output folders exist before running the steps one by one
for output_dir in ('models', 'data/processed'):
    os.makedirs(output_dir, exist_ok=True)

In [25]:
# Load the CSVs and split the labelled rows into train / hold-out parts
(
    X_train,
    X_test,
    y_train,
    y_test,
    test_df,
    continuous_features,
    categorical_features,
    target,
) = load_and_split_data()

In [26]:
# Fit the scaler and encoder on the training split only
scaler, encoder = fit_preprocessors(
    X_train,
    continuous_features,
    categorical_features,
)

In [27]:
# Preprocess the training split with the fitted scaler and encoder
X_train_processed = transform_data(
    X_train,
    scaler,
    encoder,
    continuous_features,
    categorical_features,
)

In [28]:
# Fit the regressor on the processed training split
model = train_model(
    X_train_processed,
    y_train,
)

In [29]:
# Preprocess the hold-out split, then report the model's RMSLE on it
X_test_processed = transform_data(
    X_test,
    scaler,
    encoder,
    continuous_features,
    categorical_features,
)
rmsle = evaluate_model(model, X_test_processed, y_test)
print(f"Model RMSLE on test set: {rmsle:.4f}")

Model RMSLE on test set: 0.4233


In [30]:
# Preprocess the unlabelled rows from test.csv and predict their prices
X_new_processed = transform_data(
    test_df,
    scaler,
    encoder,
    continuous_features,
    categorical_features,
)
predictions = predict_new_data(model, X_new_processed)



In [31]:
# Pair each test Id with its predicted price and write the submission CSV
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
submission.to_csv('data/processed/submission.csv', index=False)