In [None]:
!pip install pandas
!pip install numpy

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing Dataset and Overview

We must apply the same changes to the test set as to the training set, so we import both and transform them side by side.

In [None]:
# Load the competition's train and test splits from the same input folder.
df_train, df_test = (
    pd.read_csv(f"/kaggle/input/house-prices-advanced-regression-techniques/{split}.csv")
    for split in ("train", "test")
)

## 1.1 Overview of the data


In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# (rows, columns) of the training data.
print(df_train.shape)


Structure of the data. Columns and their data types

In [None]:
# Column names, non-null counts and dtypes of the training data.
df_train.info()

In [None]:
# Column names, non-null counts and dtypes of the test data.
df_test.info()

### Understanding the response data 

Summary statistics of response data

In [None]:
# Summary statistics of the response variable.
df_train["SalePrice"].describe()

Visualizing the response data

In [None]:
import matplotlib.pyplot as plt

# Histogram of the raw response, drawn through the pandas `.plot.hist` accessor.
df_train["SalePrice"].plot.hist(
    bins=20,
    xlabel="Sale Price",
    color="red",
    edgecolor="black",
)

Applying a log transformation to the response data so that its skewness doesn't distort our results.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Measure how skewed the response is before deciding whether to transform it.
saleprice_skewness = df_train['SalePrice'].skew()
print(f"Skewness of SalePrice: {saleprice_skewness:.2f}")

# Two side-by-side panels: left = before, right = after the (optional) transform.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df_train['SalePrice'], bins=30, kde=True, ax=ax_before)
ax_before.set_title("Original SalePrice Distribution")

# Only transform when the skewness is high (> 1).
# NOTE: this overwrites SalePrice in place with log1p(x) = log(x + 1); the matching
# inverse for predictions is expm1. Re-running this cell recomputes the skewness on
# the already-transformed column.
if saleprice_skewness > 1:
    df_train['SalePrice'] = np.log1p(df_train['SalePrice'])
    print("Log transformation applied to SalePrice.")
else:
    print("No transformation applied. SalePrice is not highly skewed.")

sns.histplot(df_train['SalePrice'], bins=30, kde=True, color='red', ax=ax_after)
ax_after.set_title("Transformed SalePrice (if applied)")

plt.show()

## 1.2 Missing Values

Finding columns with missing values and their data type

In [None]:
# Percentage of missing values per training column, largest first.
missing_percentage = df_train.isna().mean().mul(100).sort_values(ascending=False)

# Column dtypes, re-ordered to follow the sorted percentages.
column_data_types = df_train.dtypes[missing_percentage.index]

# Side-by-side table of missing share and dtype.
missing_percentage_with_types = pd.DataFrame({
    'Missing Percentage': missing_percentage,
    'Data Type': column_data_types,
})

# Show only the columns that actually have missing data.
print(missing_percentage_with_types.loc[missing_percentage > 0])

In [None]:
# Percentage of missing values per test column, largest first.
missing_percentage_test = df_test.isna().mean().mul(100).sort_values(ascending=False)

# Column dtypes, re-ordered to follow the sorted percentages.
column_data_types_test = df_test.dtypes[missing_percentage_test.index]

# Side-by-side table of missing share and dtype.
missing_percentage_with_types_test = pd.DataFrame({
    'Missing Percentage': missing_percentage_test,
    'Data Type': column_data_types_test,
})

# Show only the test columns that actually have missing data.
print(missing_percentage_with_types_test.loc[missing_percentage_test > 0])


# 2. Data Pre-processing

## 2.1 Removing columns with high percentage of missing values.( >50% missing)

These columns have too much missing data. They will no longer be useful in our model hence we drop them

In [None]:
# Columns where more than half of the training rows are missing.
columns_to_drop = missing_percentage.loc[missing_percentage > 50].index

# Report which columns are being removed.
print(f"Columns dropped: {', '.join(columns_to_drop)}")

# Remove them from the training set.
df_train = df_train.drop(columns_to_drop, axis=1)


In [None]:
# Mirror the training-set decision: remove the same sparse columns from the test set.
df_test = df_test.drop(columns_to_drop, axis="columns")


Columns dropped: PoolQC, MiscFeature, Alley, Fence, MasVnrType

## 2.2 Find categorical columns and change their *Dtype* from `object` to `Categorical`

This allows for efficient storage and computation

In [None]:
# Names of the training columns currently stored as generic Python objects (strings).
categorical_columns = df_train.select_dtypes(include=['object']).columns

# Convert all of them to the pandas `category` dtype in a single pass.
df_train = df_train.astype({name: 'category' for name in categorical_columns})

In [None]:
# Names of the test columns currently stored as generic Python objects (strings).
categorical_columns_test = df_test.select_dtypes(include=['object']).columns

# Convert them to `category`, reusing the *training* categories where available.
# Inferring categories independently from the test data can assign different integer
# codes to the same label (e.g. when a level is absent from test), which matters for
# models that consume the codes directly (XGBoost with enable_categorical=True).
# Test values unseen in training become NaN here and are handled by the imputation
# steps that follow.
for col in categorical_columns_test:
    train_dtype = df_train[col].dtype if col in df_train.columns else None
    if isinstance(train_dtype, pd.CategoricalDtype):
        df_test[col] = df_test[col].astype(train_dtype)
    else:
        # No categorical counterpart in train: fall back to inferring from test.
        df_test[col] = df_test[col].astype('category')


## 2.3 Data imputation for columns with missing values

Let's check which columns with missing values we are dealing with in the train set

In [None]:
# Recompute the missing-value share per training column (after the sparse columns were dropped).
missing_percentage = (
    df_train.isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

# Column dtypes in the same (sorted) order.
column_data_types = df_train.dtypes[missing_percentage.index]

# Table of missing share and dtype; later cells use it to pick an imputation strategy.
missing_percentage_with_types = pd.DataFrame({
    'Missing Percentage': missing_percentage,
    'Data Type': column_data_types,
})

# Show only the columns that still have missing data.
print(missing_percentage_with_types.loc[missing_percentage > 0])


Let's check which columns with missing values we are dealing with in the test set

In [None]:
# Recompute the missing-value share per test column (after the sparse columns were dropped).
missing_percentage_test = (
    df_test.isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

# Column dtypes in the same (sorted) order.
column_data_types_test = df_test.dtypes[missing_percentage_test.index]

# Table of missing share and dtype; later cells use it to pick an imputation strategy.
missing_percentage_with_types_test = pd.DataFrame({
    'Missing Percentage': missing_percentage_test,
    'Data Type': column_data_types_test,
})

# Show only the test columns that still have missing data.
print(missing_percentage_with_types_test.loc[missing_percentage_test > 0])


For `FireplaceQu`, the fact that 47.26% of the values are missing (in the train set) could itself be informative (many houses may not have a fireplace).
Hence we will impute the missing rows in this column with a new category "Missing".

In [None]:
# Register 'Missing' as a valid level of FireplaceQu. The guard keeps the cell
# re-runnable: add_categories raises ValueError if the level already exists.
if 'Missing' not in df_train['FireplaceQu'].cat.categories:
    df_train['FireplaceQu'] = df_train['FireplaceQu'].cat.add_categories('Missing')

# Replace missing FireplaceQu values with the explicit 'Missing' level.
df_train['FireplaceQu'] = df_train['FireplaceQu'].fillna('Missing')

Performing similar operation on test set

In [None]:
# Register 'Missing' as a valid level of FireplaceQu in the test set. The guard keeps
# the cell re-runnable: add_categories raises ValueError if the level already exists.
if 'Missing' not in df_test['FireplaceQu'].cat.categories:
    df_test['FireplaceQu'] = df_test['FireplaceQu'].cat.add_categories('Missing')

# Replace missing FireplaceQu values with the explicit 'Missing' level.
df_test['FireplaceQu'] = df_test['FireplaceQu'].fillna('Missing')

Checking column `FireplaceQu` in train and test set 

In [None]:
# Level counts of FireplaceQu in the training set after imputation.
df_train['FireplaceQu'].value_counts()

In [None]:
# Level counts of FireplaceQu in the test set after imputation.
df_test["FireplaceQu"].value_counts()

For the rest of the categorical columns with missing data, we will fill the missing values with the mode of that column

In [None]:
# Categorical training columns that were reported as having missing values.
# The summary table was built before FireplaceQu was filled, so that column may
# still be listed; filling it again is a no-op.
missing_categories = missing_percentage_with_types.loc[
    (missing_percentage_with_types['Data Type'] == "category")
    & (missing_percentage_with_types['Missing Percentage'] > 0)
].index.tolist()

# Fill each column's gaps with its own most frequent level.
for name in missing_categories:
    df_train[name] = df_train[name].fillna(df_train[name].mode()[0])

# Report which columns were treated.
print("Categorical columns filled with mode:", missing_categories)



Performing similar operation on test set

In [None]:
# Categorical test columns that were reported as having missing values.
missing_categories_test = missing_percentage_with_types_test.loc[
    (missing_percentage_with_types_test['Data Type'] == "category")
    & (missing_percentage_with_types_test['Missing Percentage'] > 0)
].index.tolist()

# Fill each column's gaps with that column's most frequent level in the test set.
for name in missing_categories_test:
    df_test[name] = df_test[name].fillna(df_test[name].mode()[0])

# Report which test columns were treated.
print("Categorical columns filled with mode in test dataset:", missing_categories_test)


For the rest of the numerical columns with missing data, we will fill the missing values with the median of that column

In [None]:
# float64 training columns that were reported as having missing values.
missing_numericals = missing_percentage_with_types.loc[
    (missing_percentage_with_types['Data Type'] == "float64")
    & (missing_percentage_with_types['Missing Percentage'] > 0)
].index.tolist()

# Fill every gap with the median of its own column (fillna aligns the medians by column name).
df_train[missing_numericals] = df_train[missing_numericals].fillna(
    df_train[missing_numericals].median()
)

print("Numerical columns filled with medianin train dataset :", missing_numericals)


Performing similar operation on test set

In [None]:
# float64 test columns that were reported as having missing values.
missing_numericals_test = missing_percentage_with_types_test.loc[
    (missing_percentage_with_types_test['Data Type'] == "float64")
    & (missing_percentage_with_types_test['Missing Percentage'] > 0)
].index.tolist()

# Fill every gap with the median of its own test column (fillna aligns the medians by column name).
df_test[missing_numericals_test] = df_test[missing_numericals_test].fillna(
    df_test[missing_numericals_test].median()
)

# Report which test columns were treated.
print("Numerical columns filled with median in test dataset:", missing_numericals_test)


KIV (keep in view): target encoding for categorical features.
Finally, we accounted for data skewness by applying log transformations to both the target variable (SalePrice) and key numeric features. 

## 2.4 Analysing correlation of numerical columns and `SalePrice`

### Visualising correlation and only keeping the columns that have higher than 0.3 correlation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Numeric training columns and the sub-frame restricted to them.
num_columns = df_train.select_dtypes(include=['number']).columns.tolist()
gf = df_train[num_columns]

# Pairwise correlation matrix of the numeric columns.
corr = gf.corr()

# Correlation of every numeric column with SalePrice, strongest positive first.
sorted_corr = corr['SalePrice'].sort_values(ascending=False)

# One-column heatmap of those correlations.
plt.figure(figsize=(10, 8))
sns.heatmap(sorted_corr.to_frame(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation with SalePrice')
plt.show()

# Keep the columns whose correlation exceeds 0.3, excluding the target itself.
# Only positive correlations pass this filter; the order of sorted_corr is preserved.
hg_corr = [
    name
    for name, value in sorted_corr.items()
    if value > 0.3 and name != 'SalePrice'
]

print("Columns with high correlation to SalePrice (sorted):", hg_corr)


Keeping numerical columns with corr > 0.3 

In [None]:
# All non-numeric (object or category) columns of the training set.
cat_columns = list(df_train.select_dtypes(include=['object', 'category']).columns)

# Final column layout: identifier, high-correlation numerics, categoricals, then the target.
columns_to_keep = ['Id', *hg_corr, *cat_columns, 'SalePrice']

Now ensuring the train and test sets have the same columns (the model needs identical feature columns at prediction time)

In [None]:
# Restrict the training frame to the selected columns (kept under a new name).
df_train_filtered = df_train.loc[:, columns_to_keep]

# The test frame gets the same columns minus the target, which it does not have.
df_test = df_test.loc[:, ['Id', *hg_corr, *cat_columns]]

Checking the info of the train and test sets

In [None]:
# Show the structure of the filtered frames to verify the column selection.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
df_train_filtered.info()
df_test.info()

# 3. Splitting the dataset

We will split the dataset.

*Note: We don't need to split the train set into the traditional 80/20 split because we will be using cross-validation with 5 folds*

In [None]:
# Define features (X) and target variable (y).
# `Id` is only a row identifier, so it is excluded from the features together with
# the target; df_train_filtered / df_test keep it for building the submission.
X_train = df_train_filtered.drop(columns=['SalePrice', 'Id'])
y_train = df_train_filtered['SalePrice']  # Target variable

X_test = df_test.drop(columns=['Id'])

In [None]:
import pandas as pd

def target_encode(train, test, y_train, categorical_cols):
    """
    Replace categorical columns with the mean target value of each category.

    The category means are computed from the training data only and then
    applied to both frames. Values with no learned mean (categories unseen in
    training, or missing values) are replaced by the overall training mean.

    NOTE: the means are computed on the full training set, so scores obtained
    by cross-validating on the encoded training frame are optimistic.

    Parameters:
    train (DataFrame): Training feature set (does not need to contain the target).
    test (DataFrame): Test set (only transformed, not fitted).
    y_train (Series): Target variable for training, index-aligned with `train`.
    categorical_cols (list): List of categorical column names to encode.

    Returns:
    DataFrame, DataFrame: Encoded copies of train and test; the inputs are not modified.
    """
    train_encoded = train.copy()
    test_encoded = test.copy()

    # Fallback value for anything that cannot be mapped to a category mean.
    overall_mean = y_train.mean()

    for col in categorical_cols:
        # Work on plain object values: grouping/mapping a `category` dtype column
        # would drag unobserved levels along and can yield a categorical result
        # that refuses a float fill value.
        train_keys = train[col].astype(object)
        test_keys = test[col].astype(object)

        # Mean target per category. Grouping the target Series by the key column
        # means `train` does not have to contain the target column itself.
        target_means = y_train.groupby(train_keys).mean()

        # Map the means onto both frames; assign the result instead of filling
        # in place so the update does not depend on chained-assignment behaviour.
        train_encoded[col] = train_keys.map(target_means).astype(float).fillna(overall_mean)
        test_encoded[col] = test_keys.map(target_means).astype(float).fillna(overall_mean)

    return train_encoded, test_encoded

# Define features (X) and target variable (y).
# `Id` is only a row identifier, so it is excluded from the features together with
# the target; df_train_filtered / df_test keep it for building the submission.
X_train = df_train_filtered.drop(columns=['SalePrice', 'Id'])
y_train = df_train_filtered['SalePrice']  # Target variable
X_test = df_test.drop(columns=['Id'])  # Kaggle test set

# Columns to target-encode: those still stored as plain `object`.
# NOTE(review): section 2.2 converts the string columns to `category`, so this
# selection is expected to be empty and the encoding below a no-op; the `category`
# columns are then passed to XGBoost natively (enable_categorical=True). To really
# target-encode them, include 'category' here and compute the means out-of-fold.
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Apply target encoding (returns copies; X_train / X_test are left untouched)
X_train_encoded, X_test_encoded = target_encode(X_train, X_test, y_train, categorical_cols)

# Report what was actually encoded rather than claiming success unconditionally.
print(f"Target encoding applied to {len(categorical_cols)} column(s): {categorical_cols}")
print(X_train_encoded.head())  # Show sample encoded data


# 4. Model training and optimization

## 4.1 XGBoost and Optuna for hyperparameter tuning

In [None]:
!pip install optuna
!pip install xgboost

In [None]:
import optuna

import xgboost as xgb

# Patch the issue
# NOTE(review): presumably a workaround for an incompatibility between the installed
# scikit-learn and xgboost versions around `__sklearn_tags__` — confirm it is still
# needed. The override returns a plain dict; verify that the installed scikit-learn
# accepts that rather than expecting its own tags object.
def _sklearn_tags(self):
    # Minimal tag set: only declares the estimator to be a regressor.
    return {"estimator_type": "regressor"}

# Monkeypatch applied at class level, so it affects every XGBRegressor created afterwards.
xgb.XGBRegressor.__sklearn_tags__ = _sklearn_tags

from sklearn.model_selection import cross_val_score

# Objective function evaluated once per Optuna trial
def objective(trial):
    """Sample one XGBoost configuration and return its mean 5-fold CV score.

    No `scoring` argument is passed to cross_val_score, so the estimator's own
    `score` method is used (R^2 for scikit-learn-style regressors). The returned
    value is therefore neither an accuracy nor an RMSE; higher is better, which
    matches a study created with direction="maximize".
    """
    # Hyperparameter search space (the sampling order is kept fixed).
    search_space = {
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
    }

    # Candidate model; enable_categorical lets XGBoost consume `category` columns directly.
    regressor = xgb.XGBRegressor(enable_categorical=True, random_state=42, **search_space)

    # One score per fold, computed on the encoded training features.
    fold_scores = cross_val_score(regressor, X_train_encoded, y_train, cv=5, n_jobs=-1)

    return fold_scores.mean()

# Create the study and run the search. The objective returns a score where higher is
# better, hence direction="maximize". The sampler is seeded so that a fresh
# "Restart & Run All" explores the same sequence of hyperparameters.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Print best parameters
print("Best parameters:", study.best_params)


In [None]:

# Best trial of the study. Its value is the mean cross-validated score returned by
# the objective — the default regressor score (R^2), not a classification accuracy.
trial = study.best_trial
print('Best mean CV R^2: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Preview the test features that will be fed to the final model.
X_test.head()

In [None]:
# Refit a model with the best hyperparameters on the full training set.
# The encoded frames are used so the final model sees exactly the feature
# representation the hyperparameters were tuned on.
best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params, random_state=42, enable_categorical=True)
best_model.fit(X_train_encoded, y_train)

# Predict on the test set; the model was trained on the log-scale target.
predictions_log = best_model.predict(X_test_encoded)

# Undo the target transformation. SalePrice was transformed with np.log1p, whose
# exact inverse is np.expm1 (np.exp would overshoot every prediction by 1).
# Assumes the log1p branch of the skewness check was taken.
predictions = np.expm1(predictions_log)

# Prepare submission file
submission = pd.DataFrame({"Id": df_test["Id"], "SalePrice": predictions})

# Save to CSV (without index)
submission.to_csv("submission.csv", index=False)

print("Submission file saved as submission.csv")

# Last expression of the cell, so the preview is actually displayed.
submission.head()
