# House Prices - Advanced Regression Techniques

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'seaborn'

In [4]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    squared_log_error = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(squared_log_error), precision)

In [5]:
# Read the two CSV files that sit next to the notebook's parent directory.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

# Columns used as model inputs, grouped by how they are preprocessed later,
# plus the column the model predicts.
numerical_features = ['GrLivArea', 'TotalBsmtSF']
categorical_features = ['Neighborhood', 'ExterQual']
target = 'SalePrice'

# Hold out 20% of the labelled rows before any transformer is fitted, so the
# held-out rows cannot leak into the fitted statistics.
X = train_data[[*numerical_features, *categorical_features]]
y = train_data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed keeps the split reproducible
)

# Report how many rows landed in each split.
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

NameError: name 'train_test_split' is not defined

In [17]:
# Transformers reused by later cells. Every `fit` below sees the training
# split only; the held-out split is only ever passed to `transform`.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Work on copies of the column subsets so the split frames stay untouched.
X_train_num = X_train[numerical_features].copy()
X_test_num = X_test[numerical_features].copy()
X_train_cat = X_train[categorical_features].copy()
X_test_cat = X_test[categorical_features].copy()

# Numerical branch, training split: impute then scale (this fits both steps),
# and put the column labels and row index back on the resulting array.
X_train_num_imputed = num_imputer.fit_transform(X_train_num)
X_train_num_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_num_imputed),
    columns=numerical_features,
    index=X_train.index,
)

# Numerical branch, held-out split: same steps using the fitted transformers.
X_test_num_imputed = num_imputer.transform(X_test_num)
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test_num_imputed),
    columns=numerical_features,
    index=X_test.index,
)

# Categorical branch: fill gaps with the constant 'missing', then one-hot encode.
X_train_cat_imputed = cat_imputer.fit_transform(X_train_cat)
X_test_cat_imputed = cat_imputer.transform(X_test_cat)

X_train_cat_encoded = encoder.fit_transform(X_train_cat_imputed)
# The output column names are only available once the encoder has been fitted.
feature_names = encoder.get_feature_names_out(categorical_features)
X_train_cat_encoded = pd.DataFrame(
    X_train_cat_encoded,
    columns=feature_names,
    index=X_train.index,
)
X_test_cat_encoded = pd.DataFrame(
    encoder.transform(X_test_cat_imputed),
    columns=feature_names,
    index=X_test.index,
)

# Final design matrices: scaled numerical columns followed by the one-hot columns.
X_train_processed = pd.concat(
    [X_train_num_scaled, X_train_cat_encoded],
    axis="columns",
)
X_test_processed = pd.concat(
    [X_test_num_scaled, X_test_cat_encoded],
    axis="columns",
)


In [18]:
# Fit a linear regression on the preprocessed training split.
print("Training model...")
model = LinearRegression().fit(X_train_processed, y_train)

# Model Evaluation
def evaluate_model(X, y_true, dataset_name="", estimator=None):
    """Score a fitted regressor on one dataset and print the metrics.

    Args:
        X: Feature matrix passed to the estimator's ``predict``.
        y_true: Ground-truth targets aligned with the rows of ``X``.
        dataset_name: Label used in the printed header (e.g. "Training").
        estimator: Fitted object exposing ``predict``. When omitted, the
            notebook-level ``model`` is used, so existing three-argument
            calls behave exactly as before.

    Returns:
        Tuple ``(rmsle, r2)``: the square root of the mean squared log
        error and the R2 score of the predictions against ``y_true``.
    """
    # Resolve the estimator explicitly so the function does not silently
    # depend on whichever object happens to be bound to `model`.
    fitted = model if estimator is None else estimator
    y_pred = fitted.predict(X)

    # NOTE(review): a linear model can emit negative predictions, which the
    # log-based metric is expected to reject — confirm against the sklearn docs.
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n{dataset_name} Set Metrics:")
    print(f"RMSLE: {rmsle:.4f}")
    print(f"R2 Score: {r2:.4f}")
    return rmsle, r2

# Score the fitted model on the training split and on the held-out split.
train_rmsle, train_r2 = evaluate_model(
    X=X_train_processed,
    y_true=y_train,
    dataset_name="Training",
)
test_rmsle, test_r2 = evaluate_model(
    X=X_test_processed,
    y_true=y_test,
    dataset_name="Testing",
)

Training model...

Training Set Metrics:
RMSLE: 0.1788
R2 Score: 0.7779

Testing Set Metrics:
RMSLE: 0.1834
R2 Score: 0.8199


In [6]:
# Inference on the separate test file loaded as `test_data`. Only `transform`
# and `predict` are called here, so nothing is re-fitted in this cell.
print("\nProcessing test data...")
X_test_final = test_data[[*numerical_features, *categorical_features]]

# Numerical branch: impute, scale, then restore column labels and row index.
X_test_final_num = X_test_final[numerical_features].copy()
X_test_final_num_imputed = num_imputer.transform(X_test_final_num)
X_test_final_num_scaled = pd.DataFrame(
    scaler.transform(X_test_final_num_imputed),
    columns=numerical_features,
    index=X_test_final.index,
)

# Categorical branch: impute, one-hot encode, then label with the encoder's column names.
X_test_final_cat = X_test_final[categorical_features].copy()
X_test_final_cat_imputed = cat_imputer.transform(X_test_final_cat)
X_test_final_cat_encoded = pd.DataFrame(
    encoder.transform(X_test_final_cat_imputed),
    columns=feature_names,
    index=X_test_final.index,
)

# Same column layout as the training matrix: numerical first, one-hot second.
X_test_final_processed = pd.concat(
    [X_test_final_num_scaled, X_test_final_cat_encoded],
    axis="columns",
)

# Predict with the fitted model.
predictions = model.predict(X_test_final_processed)

# Assemble the submission table (Id + predicted SalePrice).
# Nothing is written to disk in this cell.
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': predictions})



Processing test data...


NameError: name 'num_imputer' is not defined

In [1]:
# Write `processed_df` to a Parquet file, passing index=False.
# NOTE(review): `processed_df` is not assigned in any visible cell, so this
# line raises NameError on a fresh kernel — confirm which frame was meant.
# NOTE(review): the absolute path looks like a placeholder ('filapth' is
# presumably a typo for 'filepath') — confirm the intended output location.
processed_df.to_parquet('/my/filapth/processed_df.parquet', index=False)

NameError: name 'processed_df' is not defined