In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the competition's training and test tables from the Kaggle input
# directory; `train` carries the SalePrice target, `test` does not.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    ),
)

# Last expression of the cell: rendered as the cell's rich output.
train.head()

In [None]:
# Report the (rows, columns) shape of each frame.
for _label, _frame in (("Train set size:", train), ("Test set size:", test)):
    print(_label, _frame.shape)

In [None]:
# Print pandas' concise summary of the training frame (DataFrame.info).
train.info()

In [None]:
# Display descriptive statistics of the training frame (DataFrame.describe).
train.describe()

In [None]:
# Per-column NaN counts in the training frame, restricted to the columns
# that actually contain at least one missing value.
missing_values = train.isnull().sum().loc[lambda counts: counts > 0]
print(missing_values)

In [None]:
# Share of missing cells per training column, in percent, keeping only
# columns with missing data and listing the worst offenders first.
missing_percent = (
    train.isnull()
    .sum()
    .div(len(train))
    .mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)
print(missing_percent)

# SalePrice Breakdown

In [None]:
# Histogram (30 bins) of the target with a KDE curve overlaid.
plt.figure(figsize=(8, 6))
sns.histplot(data=train, x='SalePrice', bins=30, kde=True).set(
    title='SalePrice Breakdown',
    xlabel='SalePrice',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Pairwise correlations can only be computed on numeric columns, so the
# object-typed columns are excluded first.
numeric_train = train.select_dtypes(include=np.number)
corr_matrix = numeric_train.corr()

# Heatmap of the full correlation matrix; cell annotations are switched off.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm').set_title(
    'Korelasyon Matrisi (Sadece Sayısal Sütunlar)'
)
plt.show()

# Features with the Highest Correlation with SalePrice

In [None]:
# Rank every numeric column by its correlation with SalePrice, highest first
# (SalePrice itself is part of the ranking), and show the first ten entries.
top_corr = corr_matrix.loc[:, 'SalePrice'].sort_values(ascending=False)
print("\nSalePrice ile en yüksek korelasyona sahip özellikler:")
print(top_corr.iloc[:10])

# Categorical Variables and SalePrice Relationships

In [None]:
# Names of all object-typed columns in the training frame.
categorical_features = list(train.select_dtypes(include=['object']).columns)
print(categorical_features)

In [None]:
# SalePrice distribution per neighborhood; tick labels are rotated because
# one tick is drawn per neighborhood on the x axis.
plt.figure(figsize=(12, 6))
sns.boxplot(data=train, x='Neighborhood', y='SalePrice').set_title(
    'Relationship between Neighborhood and SalePrice'
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# SalePrice distribution per building type.
plt.figure(figsize=(8, 6))
sns.boxplot(data=train, x='BldgType', y='SalePrice').set_title(
    'Relationship between BldgType and SalePrice'
)
plt.show()

# Fill Missing Data

In [None]:
import pandas as pd
import numpy as np


# Impute missing values in both the training and the test frame.
#
# Every statistic used for imputation (medians, modes) is computed on the
# TRAINING frame only and then applied to both frames. This keeps the test
# set from influencing its own preprocessing and matches the later cell that
# fills the remaining test-only gaps from training statistics.

# Categorical columns whose NaN is replaced by the literal category "None".
# NOTE(review): presumably NaN in these columns means "feature not present"
# — confirm against the data description.
garage_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
bsmt_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for col in (
    ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'MasVnrType']
    + garage_cols
    + bsmt_cols
):
    train[col] = train[col].fillna("None")
    test[col] = test[col].fillna("None")

# GarageYrBlt is numeric, so "None" is not an option; the row's own
# YearBuilt is used as the stand-in value.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['YearBuilt'])
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test['YearBuilt'])

# LotFrontage: median of the same neighborhood, learned from train.
# The medians are computed BEFORE any filling so they reflect observed data.
# The overall training median is the fallback for a neighborhood that has no
# observed LotFrontage at all (or that never occurs in train); without it
# such rows would silently stay NaN.
lot_frontage_by_neighborhood = train.groupby('Neighborhood')['LotFrontage'].median()
lot_frontage_overall = train['LotFrontage'].median()
for frame in (train, test):
    frame['LotFrontage'] = (
        frame['LotFrontage']
        .fillna(frame['Neighborhood'].map(lot_frontage_by_neighborhood))
        .fillna(lot_frontage_overall)
    )

# MasVnrArea: a missing area is filled with 0.
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)

# Electrical: most frequent training category, applied to both frames.
electrical_mode = train['Electrical'].mode()[0]
train['Electrical'] = train['Electrical'].fillna(electrical_mode)
test['Electrical'] = test['Electrical'].fillna(electrical_mode)

# Sanity check: total number of NaN cells left in each frame.
print("Number of missing values remaining in the training set:", train.isnull().sum().sum())
print("Number of missing values remaining in the test set:", test.isnull().sum().sum())

# Fill in the Missing Values in the Test Set

In [None]:
# Per-column NaN counts of the test frame; only non-zero entries are printed.
missing_test = test.isnull().sum()
print(missing_test.loc[missing_test.gt(0)])

In [None]:
# Fill the gaps that only occur in the test frame. Every fill value is
# taken from the training frame (or is a constant), never from the test data.

# Categorical columns: most frequent training category.
for col in ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
            'KitchenQual', 'Functional', 'SaleType']:
    test[col] = test[col].fillna(train[col].mode()[0])

# Numeric basement / garage measurements: training median.
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'GarageCars', 'GarageArea']:
    test[col] = test[col].fillna(train[col].median())

# Basement bathroom counts: a missing value is treated as "no basement",
# hence zero bathrooms.
for col in ['BsmtFullBath', 'BsmtHalfBath']:
    test[col] = test[col].fillna(0)

In [None]:
# Total number of NaN cells left in the test frame (summed over all columns).
print("Number of missing values remaining in the test set:", test.isnull().sum().sum())

In [None]:
# Recompute the list of object-typed columns on the imputed training frame.
categorical_features = list(train.select_dtypes(include=['object']).columns)
print("Categorical Variables:", categorical_features)

# One-hot encode each frame separately, then force the test frame onto the
# training frame's column layout: join='left' keeps exactly the training
# columns, and columns absent from test are created and filled with 0.
train_encoded, test_encoded = pd.get_dummies(
    train, columns=categorical_features
).align(
    pd.get_dummies(test, columns=categorical_features),
    join='left',
    axis=1,
    fill_value=0,
)

print("Training set shape (after encoding):", train_encoded.shape)
print("Test set shape (post encoding):", test_encoded.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the encoded feature matrix.
y = train_encoded['SalePrice']
X = train_encoded.drop(columns='SalePrice')

# Hold out 20% of the rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Verification set shape:", X_val.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline: ordinary least squares on the raw SalePrice target.
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions on the rows the model was fit on and on the held-out rows.
y_train_pred, y_val_pred = lr_model.predict(X_train), lr_model.predict(X_val)

# RMSE = square root of the mean squared error, in SalePrice units.
rmse_train, rmse_val = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

print("Training set RMSE:", rmse_train)
print("Validation set RMSE:", rmse_val)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Same linear model, but fit on log1p(SalePrice) instead of the raw target.
y_log = np.log1p(y)
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

lr_model_log = LinearRegression().fit(X_train, y_train_log)

# Predictions come out on the log scale ...
y_train_pred_log, y_val_pred_log = (
    lr_model_log.predict(features) for features in (X_train, X_val)
)

# ... and expm1 (the inverse of log1p) maps them back to SalePrice units.
# Note: this overwrites the y_train_pred / y_val_pred of the previous cell.
y_train_pred = np.expm1(y_train_pred_log)
y_val_pred = np.expm1(y_val_pred_log)

# RMSE is evaluated on the original scale against the untransformed targets.
rmse_train_log, rmse_val_log = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

print("Log Transformed Linear Regression Training RMSE:", rmse_train_log)
print("Log Transformed Linear Regression Verification RMSE:", rmse_val_log)

In [None]:
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# --- Base learners: three gradient-boosting libraries, all seeded with 42 ---
xgb_model = XGBRegressor(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=3,
    gamma=0.3,
    subsample=0.7,
    colsample_bytree=1,
    random_state=42,
)

cat_model = CatBoostRegressor(
    iterations=800,
    learning_rate=0.1,
    depth=5,
    random_seed=42,
    verbose=0,
)

lgb_model = LGBMRegressor(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=1,
    random_state=42,
)

# --- Meta learner: Ridge regression combines the base learners' outputs ---
meta_model = Ridge(alpha=1.0, random_state=42)

# --- Stacking ensemble, fit on the raw (untransformed) SalePrice target ---
stacking_model = StackingRegressor(
    estimators=[('xgb', xgb_model), ('cat', cat_model), ('lgb', lgb_model)],
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the ensemble on the held-out validation rows (RMSE, SalePrice units).
y_pred_stack = stacking_model.predict(X_val)
rmse_stack = np.sqrt(mean_squared_error(y_val, y_pred_stack))
print("Stacking Model RMSE:", rmse_stack)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Both inputs are passed through ``np.log1p`` (log of value + 1), the
    element-wise differences are squared and averaged, and the square root
    of that mean is returned.

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    """
    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(log_error ** 2))

# Score the stacking model's validation predictions (y_pred_stack) against
# the actual validation targets (y_val) using the rmsle helper defined above.
rmsle_val = rmsle(y_val, y_pred_stack)
print("Calculated RMSLE:", rmsle_val)

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Re-definition of the ``rmsle`` helper from the previous cell with the
    same behavior; being defined later, this is the version in effect for
    any cell that runs after it.
    """
    squared_log_error = (np.log1p(y_pred) - np.log1p(y_true)) ** 2
    return np.sqrt(np.mean(squared_log_error))

In [None]:
# Side-by-side eyeball check: first ten validation targets vs. the first ten
# stacking-model predictions.
print("Actual values (top 10):", y_val.iloc[:10].to_numpy())
print("Predictions (top 10):", y_pred_stack[0:10])

In [None]:
# Aligning the test frame to the training columns created a placeholder
# 'SalePrice' column filled with 0; it is not a feature, so remove it before
# predicting. The guard keeps the cell safe to re-run.
if 'SalePrice' in test_encoded.columns:
    test_encoded = test_encoded.drop('SalePrice', axis=1)

# stacking_model was fit on the RAW SalePrice target (y_train), not on
# log1p(SalePrice), so its predictions are already in SalePrice units.
# Applying np.expm1 to them (as if they were log-scale) would exponentiate
# values of order 1e5 and overflow to inf, so no inverse transform is applied.
final_pred = stacking_model.predict(test_encoded)

# Kept so the name stays defined for any later cell; it now really holds the
# predictions on the log1p scale.
final_pred_log = np.log1p(final_pred)

In [None]:
# The submission is keyed by the test frame's 'Id' column; fail loudly if it
# is not there.
if 'Id' not in test.columns:
    raise ValueError("The test set is missing the 'Id' column")
test_ids = test['Id']

# Two-column submission table: one predicted SalePrice per test Id.
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_pred})

print(submission.head())
print(submission.shape)

In [None]:
# Write the submission table to 'submission.csv' (relative path), omitting
# the DataFrame index so only the frame's own columns are saved.
submission.to_csv('submission.csv', index=False)