In [118]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [119]:
# Upload kaggle.json
from google.colab import files
files.upload()
!unzip house-prices-advanced-regression-techniques.zip

Saving house-prices-advanced-regression-techniques.zip to house-prices-advanced-regression-techniques (4).zip
Archive:  house-prices-advanced-regression-techniques.zip
replace data_description.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: data_description.txt    
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: sample_submission.csv   
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: test.csv                
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: train.csv               


In [120]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [121]:
# test['SalePrice'] = sample_submission['SalePrice']
y = train['SalePrice']
train = train.drop('SalePrice', axis=1)

In [122]:
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)

In [123]:
## Temporal Variables (Date Time Variables)

for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:

    all_data[feature]=all_data['YrSold']-all_data[feature]

In [124]:
# Drop Id column if present
id_column = test['Id']  # Save for submission
all_data = all_data.drop('Id', axis=1)

In [125]:
# Identify numerical and categorical features
num_feats = all_data.select_dtypes(include=[np.number]).columns.tolist()
cat_feats = all_data.select_dtypes(include=['object']).columns.tolist()

In [126]:
# Step 1: Find and remove outliers in numerical features from training data (using IQR method)
train_idx = len(train)
train_num = all_data.iloc[:train_idx][num_feats]
outlier_mask = np.zeros(len(train_num), dtype=bool)

In [127]:
for feat in num_feats:
    Q1 = train_num[feat].quantile(0.25)
    Q3 = train_num[feat].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    feat_outliers = (train_num[feat] < lower) | (train_num[feat] > upper)
    outlier_mask = outlier_mask | feat_outliers

In [128]:
# Remove outliers from train
train_clean = all_data.iloc[:train_idx][~outlier_mask]
y_clean = y[~outlier_mask]
print(f"Removed {outlier_mask.sum()} outliers. New train shape: {train_clean.shape}")

Removed 886 outliers. New train shape: (574, 79)


In [129]:
# Update all_data to reflect removed rows in train part
all_data = pd.concat([train_clean, all_data.iloc[train_idx:]], axis=0).reset_index(drop=True)
train_idx = len(train_clean)  # Update train index

In [130]:
# Step 2: Handle missing data
# Numerical: fill with median
for col in num_feats:
    all_data[col] = all_data[col].fillna(all_data[col].median())

# Categorical: fill with 'None'
for col in cat_feats:
    all_data[col] = all_data[col].fillna('None')

In [131]:
# Step 3: Feature transformation for skewed features
# Identify skewed numerical features in train
train_num = all_data.iloc[:train_idx][num_feats]
skewness = train_num.apply(lambda x: stats.skew(x))
skewed_feats = skewness[abs(skewness) > 0.75].index

  skewness = train_num.apply(lambda x: stats.skew(x))


In [132]:
# Apply Yeo-Johnson transformation
if len(skewed_feats) > 0:
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    all_data[skewed_feats] = pt.fit_transform(all_data[skewed_feats])

# Transform target to log (common for house prices)
y_log = np.log(y_clean)

In [133]:
# Step 3.2: Feature standardization (only numerical)
scaler = StandardScaler()
all_data[num_feats] = scaler.fit_transform(all_data[num_feats])

# One-hot encode categorical features
all_data = pd.get_dummies(all_data)

In [151]:
joblib.dump(scaler, 'scaler.pkl')  # After fitting scaler

['scaler.pkl']

In [134]:
# Split back to train and test
X_train_full = all_data.iloc[:train_idx]
X_test = all_data.iloc[train_idx:]

In [135]:
# Step 4: Find correlations (using train data with target)
train_full = pd.concat([X_train_full, pd.Series(y_log, name='SalePrice')], axis=1)
corr_matrix = train_full.corr()
corr_sale = abs(corr_matrix['SalePrice']).sort_values(ascending=False).drop('SalePrice')

In [136]:
# Step 5: Remove irrelevant features (highly correlated features > 0.8 threshold for multicollinearity)
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(abs(upper_tri[column]) > 0.85
                                                         )]
all_data = all_data.drop(to_drop, axis=1)

In [137]:
# Update train and test
X_train_full = all_data.iloc[:train_idx]
X_test = all_data.iloc[train_idx:]

In [138]:
# Recompute correlations after drop
train_full = pd.concat([X_train_full, pd.Series(y_log, name='SalePrice')], axis=1)
corr_matrix = train_full.corr()
corr_sale = abs(corr_matrix['SalePrice']).sort_values(ascending=False).drop('SalePrice')

In [139]:
# Step 6: Use only 50-60 most relevant features (based on correlation with target)
num_features = min(60, len(corr_sale))
top_features = corr_sale.head(num_features).index.tolist()
joblib.dump(top_features, 'top_features.pkl')
X_train_full = X_train_full[top_features]
X_test = X_test[top_features]

In [140]:
# Split train into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_log, test_size=0.2, random_state=42)

In [141]:
# Step 7: Try different regression models and print evaluation metrics
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(alpha=0.01),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'SVR': SVR()
}

In [142]:
# Polynomial Regression separately
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)
pred_poly = model_poly.predict(X_val_poly)
rmse_poly = np.sqrt(mean_squared_error(y_val, pred_poly))
r2_poly = r2_score(y_val, pred_poly)
print(f"Polynomial Regression: RMSE = {rmse_poly:.4f}, R2 = {r2_poly:.4f}")

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    r2 = r2_score(y_val, pred)
    print(f"{name}: RMSE = {rmse:.4f}, R2 = {r2:.4f}")

Polynomial Regression: RMSE = 34.7261, R2 = -10586.1541
Linear Regression: RMSE = 0.1414, R2 = 0.8244
Lasso Regression: RMSE = 0.1558, R2 = 0.7868
Decision Tree: RMSE = 0.1930, R2 = 0.6730
Random Forest: RMSE = 0.1339, R2 = 0.8425
SVR: RMSE = 0.1228, R2 = 0.8677


In [143]:
# Step 8: Select best model (assuming Random Forest based on common performance; adjust based on prints)
# For demonstration, let's select Random Forest
best_model_base = SVR()

In [147]:
from sklearn.pipeline import Pipeline

In [148]:
# Create pipeline: scaling + SVR
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR(kernel='rbf'))  # rbf is usually best for non-linear regression
])
# Parameter grid for hyperparameter tuning
# Start with a reasonable range; you can expand based on initial results
param_grid = {
    'svr__C': [1, 10, 100, 1000],            # Regularization parameter
    'svr__epsilon': [0.01, 0.1, 0.2],        # Epsilon in the epsilon-SVR model
    'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient for 'rbf'
    # You can add 'svr__kernel': ['rbf', 'linear', 'poly'] if you want to test others
}

# GridSearchCV with 5-fold cross-validation
# Uses neg_mean_squared_error (higher is better)
grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,          # Use all cores
    verbose=2           # To see progress
)

# Fit on the full training data
grid_search.fit(X_train_full, y_log)

# Results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV neg MSE: {grid_search.best_score_:.6f}")
print(f"Best CV RMSE: {np.sqrt(-grid_search.best_score_):.6f}")

# Best tuned model
best_svr_model = grid_search.best_estimator_

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters: {'svr__C': 1, 'svr__epsilon': 0.01, 'svr__gamma': 0.01}
Best CV neg MSE: -0.023823
Best CV RMSE: 0.154348


In [149]:
# Step 10: Model packing - save to pkl file
joblib.dump(best_model, 'best_house_price_model.pkl')
print("Model saved as 'best_house_price_model.pkl'")

Model saved as 'best_house_price_model.pkl'


In [150]:
# Optional: Generate predictions for test and save submission
pred_test_log = best_model.predict(X_test)
pred_test = np.exp(pred_test_log)
submission = pd.DataFrame({'Id': id_column, 'SalePrice': pred_test})
submission.to_csv('submission.csv', index=False)
print("Submission file created as 'submission.csv'")

Submission file created as 'submission.csv'
