# HOUSE PRICE PREDICTION 🏠💲

## About Dataset
- SalePrice: The property's sale price in dollars
- MSSubClass: The building class
- MSZoning: The general zoning classification
- LotFrontage: Linear feet of street connected to property
- LotArea: Lot size in square feet
- Street: Type of road access
- Alley: Type of alley access
- LotShape: General shape of property
- LandContour: Flatness of the property
- Utilities: Type of utilities available
- LotConfig: Lot configuration
- LandSlope: Slope of property
- Neighborhood: Physical locations within Ames city limits
- Condition1: Proximity to main road or railroad
- Condition2: Proximity to main road or railroad (if a second is present)
- BldgType: Type of dwelling
- HouseStyle: Style of dwelling
- OverallQual: Overall material and finish quality
- OverallCond: Overall condition rating
- YearBuilt: Original construction date
- YearRemodAdd: Remodel date
- RoofStyle: Type of roof
- RoofMatl: Roof material
- Exterior1st: Exterior covering on house
- Exterior2nd: Exterior covering on house (if more than one material)
- MasVnrType: Masonry veneer type
- MasVnrArea: Masonry veneer area in square feet
- ExterQual: Exterior material quality
- ExterCond: Present condition of the material on the exterior
- Foundation: Type of foundation
- BsmtQual: Height of the basement
- BsmtCond: General condition of the basement
- BsmtExposure: Walkout or garden level basement walls
- BsmtFinType1: Quality of basement finished area
- BsmtFinSF1: Type 1 finished square feet
- BsmtFinType2: Quality of second finished area (if present)
- BsmtFinSF2: Type 2 finished square feet
- BsmtUnfSF: Unfinished square feet of basement area
- TotalBsmtSF: Total square feet of basement area
- Heating: Type of heating
- HeatingQC: Heating quality and condition
- CentralAir: Central air conditioning
- Electrical: Electrical system
- 1stFlrSF: First Floor square feet
- 2ndFlrSF: Second floor square feet
- LowQualFinSF: Low quality finished square feet (all floors)
- GrLivArea: Above grade (ground) living area square feet
- BsmtFullBath: Basement full bathrooms
- BsmtHalfBath: Basement half bathrooms
- FullBath: Full bathrooms above grade
- HalfBath: Half baths above grade
- BedroomAbvGr: Number of bedrooms above basement level
- KitchenAbvGr: Number of kitchens
- KitchenQual: Kitchen quality
- TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
- Functional: Home functionality rating
- Fireplaces: Number of fireplaces
- FireplaceQu: Fireplace quality
- GarageType: Garage location
- GarageYrBlt: Year garage was built
- GarageFinish: Interior finish of the garage
- GarageCars: Size of garage in car capacity
- GarageArea: Size of garage in square feet
- GarageQual: Garage quality
- GarageCond: Garage condition
- PavedDrive: Paved driveway
- WoodDeckSF: Wood deck area in square feet
- OpenPorchSF: Open porch area in square feet
- EnclosedPorch: Enclosed porch area in square feet
- 3SsnPorch: Three season porch area in square feet
- ScreenPorch: Screen porch area in square feet
- PoolArea: Pool area in square feet
- PoolQC: Pool quality
- Fence: Fence quality
- MiscFeature: Miscellaneous feature not covered in other categories
- MiscVal: Dollar value of miscellaneous feature
- MoSold: Month Sold
- YrSold: Year Sold
- SaleType: Type of sale
- SaleCondition: Condition of sale

## Importing Dataset, Required Libraries and Functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

# Suppress every FutureWarning and scikit-learn's ConvergenceWarning so that
# cell output is not flooded while models are being compared.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)

# Display settings: no cap on printed columns/rows, no fixed output width,
# and floats rendered with three decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load both Kaggle files and stack them so that every preprocessing step is
# applied to train and test rows alike (test rows are later recognised by
# their missing SalePrice).
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. reset_index() still materialises the per-file row labels as an
# "index" column, exactly as before.
df = pd.concat([df_train, df_test], ignore_index=False).reset_index()

# 1. Exploratory Data Analysis

## 1.1 Data Overview

In [None]:
df_train.shape, df_test.shape

In [None]:
df = df.drop("index", axis=1)

In [None]:
def analyze_df(dataframe):
    """
    Print a quick structural overview of a DataFrame.

    Parameters
    ------
    dataframe: dataframe
            DataFrame to summarise.

    Returns
    ------
        None
            Shape, first/last three rows, dtypes, unique counts, NaN counts
            and selected quantiles of the numeric columns are printed.
    """
    print(f"Shape: {dataframe.shape}")
    print("------------------------------ Head ------------------------------")
    print(dataframe.head(3))
    print("------------------------------ Tail ------------------------------")
    print(dataframe.tail(3))
    print("------------------------------ Types ------------------------------")
    print(dataframe.dtypes)
    print("------------------------------ Unique Values -----------------------------")
    print(dataframe.nunique())
    print("------------------------------ NaN Values -----------------------------")
    print(dataframe.isnull().sum())
    print("------------------------------ Quantiles ------------------------------")
    # Quantiles only exist for numeric data; calling quantile() on a frame that
    # still holds object columns raises TypeError in pandas >= 2.0.
    numeric = dataframe.select_dtypes(include="number")
    if numeric.shape[1] == 0:
        print("No numeric columns.")
    else:
        print(numeric.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
    

analyze_df(df)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Split the columns of a DataFrame into categorical, numeric and
    high-cardinality ("cardinal") categorical groups.

    Parameters
    ------
    dataframe: dataframe
            DataFrame whose columns are classified.
    cat_th: int, optional
            A non-object column with fewer unique values than this is
            treated as categorical.
    car_th: int, optional
            An object column with more unique values than this is
            treated as cardinal.

    Returns
    ------
        cat_cols: list
                Object columns that are not cardinal, followed by the
                numeric-but-categorical columns.
        num_cols: list
                Non-object columns that are not numeric-but-categorical.
        cat_but_car: list
                Object columns classified as cardinal.

    Notes
    ------
        cat_cols + num_cols + cat_but_car covers every column exactly once.
        The counts of each group are printed before returning.
    """
    object_cols = []
    numeric_cols = []
    unique_counts = {}
    for name in dataframe.columns:
        series = dataframe[name]
        unique_counts[name] = series.nunique()
        if series.dtypes == "O":
            object_cols.append(name)
        else:
            numeric_cols.append(name)

    num_but_cat = [name for name in numeric_cols if unique_counts[name] < cat_th]
    cat_but_car = [name for name in object_cols if unique_counts[name] > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in numeric_cols if name not in num_but_cat]

    n_rows, n_vars = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_vars}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car


cat_cols, num_cols, cat_but_car = grab_col_names(df)

## 1.2 Categorical Variable Analysis

In [None]:
def cat_summary(dataframe, column, plot=False):
    """
    Print the frequency and percentage share of each class in ``column``.

    The share is computed against the total number of rows. When ``plot``
    is true a seaborn count plot of the column is shown as well.
    """
    counts = dataframe[column].value_counts()
    table = pd.DataFrame({column: counts, "Ratio": 100 * counts / len(dataframe)})
    print(table)

    if not plot:
        return

    sns.countplot(x=dataframe[column], data=dataframe, palette="husl")
    plt.xticks(fontsize=8)
    plt.show()

        
for col in cat_cols:
    cat_summary(df, col, True)

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for every class of ``categorical_col``."""
    group_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": group_means})
    print(summary, end="\n\n\n")


for col in cat_cols:
    target_summary_with_cat(df, "SalePrice", col)

## 1.3 Numerical Variable Analysis

In [None]:
df[num_cols].hist(bins=40, figsize=(20,20))
plt.show()

In [None]:
np.log1p(df['SalePrice']).hist(bins=50)
plt.show()

## 1.4 Correlation Matrix

In [None]:
def high_correlated_cols(dataframe, plot=False, corr_th=0.70):
    """
    List columns whose absolute correlation with an earlier column exceeds
    ``corr_th``.

    Parameters
    ------
    dataframe: dataframe
            Data to analyse; only numeric/boolean columns take part.
    plot: bool, optional
            When true, draw an annotated heatmap of the correlation matrix.
    corr_th: float, optional
            Absolute-correlation threshold above which a column is reported.

    Returns
    ------
        drop_list: list
                Column names with at least one above-threshold entry in the
                strict upper triangle of the absolute correlation matrix.
    """
    # corr() no longer drops non-numeric columns silently in pandas >= 2.0,
    # so select the numeric ones explicitly.
    corr = dataframe.select_dtypes(include=["number", "bool"]).corr()
    cor_matrix = corr.abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    upper_mask = np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool_)
    upper_triangle_matrix = cor_matrix.where(upper_mask)
    drop_list = [col for col in upper_triangle_matrix.columns
                 if (upper_triangle_matrix[col] > corr_th).any()]

    if plot:
        # Plot the matrix of the frame that was passed in, not of a global.
        f, ax = plt.subplots(figsize=[20, 16])
        ax.set_title("Correlation Matrix", fontsize=20)
        sns.heatmap(corr, annot=True, fmt=".2f", ax=ax, cmap="Spectral")
        plt.show()

        print("###################### High Correlated Columns ######################")

    return drop_list


high_correlated_cols(df, plot=True)

# 2. Data Pre-processing & Feature Engineering

## 2.1 Outliers

In [None]:
def outlier_thresholds(dataframe, variable, q1=0.10, q3=0.90):
    """
    Return ``(low_limit, up_limit)`` for ``variable``.

    The limits lie 1.5 times the inter-quantile range (between the ``q1``
    and ``q3`` quantiles) below the lower and above the upper quantile.
    """
    lower_q, upper_q = (dataframe[variable].quantile(q) for q in (q1, q3))
    whisker = 1.5 * (upper_q - lower_q)
    return lower_q - whisker, upper_q + whisker

In [None]:
def check_outlier(dataframe, column):
    """
    Report whether ``column`` holds any value outside its outlier limits.

    The limits come from ``outlier_thresholds(dataframe, column)``.

    Returns
    ------
        bool
            True when at least one value is above the upper or below the
            lower limit, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, column)
    outside = (dataframe[column] > up_limit) | (dataframe[column] < low_limit)
    # Test the mask itself. Calling .any(axis=None) on the selected rows asks
    # whether those rows contain a truthy value, which is False for an outlier
    # row made up only of zeros/empty values.
    return bool(outside.any())
    
    
for col in num_cols:
    if col != "SalePrice":
        print(col, check_outlier(df, col))

In [None]:
def replace_with_thresholds(dataframe, variable):
    """
    Cap ``variable`` in place at the limits from ``outlier_thresholds``.

    Values below the lower limit are set to it first; values above the
    upper limit are then set to the upper limit.
    """
    floor, ceiling = outlier_thresholds(dataframe, variable)

    too_low = dataframe[variable] < floor
    dataframe.loc[too_low, variable] = floor

    too_high = dataframe[variable] > ceiling
    dataframe.loc[too_high, variable] = ceiling


for col in num_cols:
    if col != "SalePrice":
        replace_with_thresholds(df, col)

In [None]:
for col in num_cols:
    if col != "SalePrice":
        print(col, check_outlier(df, col))

## 2.2 Missing Values

In [None]:
def missing_values(dataframe, na_name=False):
    """
    Print the count and percentage of missing values per affected column.

    Returns the list of columns containing missing values when ``na_name``
    is true, otherwise None.
    """
    null_counts = dataframe.isnull().sum()
    na_columns = [col for col in dataframe.columns if null_counts[col] > 0]

    # Per-column totals, largest first, and the same as a percentage of rows.
    n_missing = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (n_missing / dataframe.shape[0] * 100).sort_values(ascending=False)

    report = pd.concat([n_missing, np.round(ratio, 2)], axis=1, keys=['n_missing', 'ratio'])
    print(report)

    return na_columns if na_name else None
    
    
missing_values(df)

In [None]:
# In these columns a null means the feature is absent from the house, so it
# is recorded as the explicit category "None" rather than imputed.
none_cols = ["Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu",
           "GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"]

# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# that chained form acts on an intermediate object and leaves df unchanged
# when pandas copy-on-write is active.
df[none_cols] = df[none_cols].fillna("None")

In [None]:
missing_values(df)

In [None]:
def missing_impute(data, num_method="median", cat_length=20, target="SalePrice"):
    """
    Impute missing values and return the imputed copy.

    Parameters
    ------
    data: dataframe
            Data to impute; the frame passed in is not modified.
    num_method: str, optional
            "mean" or "median" for numeric columns. Any other value leaves
            numeric columns untouched.
    cat_length: int, optional
            Object columns with at most this many unique values (NaN
            included) are filled with their mode.
    target: str, optional
            Column restored to its original values after imputation, so its
            missing entries stay missing.

    Returns
    ------
        data: dataframe
                Imputed DataFrame.
    """
    na_variables = [col for col in data.columns if data[col].isnull().sum() > 0]

    # Keep the untouched target so it can be put back after imputation.
    target_val = data[target]

    print("BEFORE IMPUTATION")
    print(data[na_variables].isnull().sum(), "\n\n")

    def _fill_categorical(column):
        # Mode-fill low-cardinality object columns; skip all-NaN columns,
        # for which no mode exists.
        if column.dtype != "O" or len(column.unique()) > cat_length:
            return column
        modes = column.mode()
        return column if modes.empty else column.fillna(modes[0])

    def _fill_numeric(column):
        # Fill with the column's own statistic. Passing a Series indexed by
        # column names to fillna would align on row labels and fill nothing.
        if not pd.api.types.is_numeric_dtype(column):
            return column
        fill_value = column.mean() if num_method == "mean" else column.median()
        return column.fillna(fill_value)

    data = data.apply(_fill_categorical, axis=0)

    if num_method in ["mean", "median"]:
        data = data.apply(_fill_numeric, axis=0)

    data[target] = target_val

    print(" Imputation method is 'MODE' for categorical variables.")
    print(" Imputation method is '" + num_method.upper() + "' for numeric variables. \n")
    print("AFTER IMPUTATION")
    print(data[na_variables].isnull().sum(), "\n\n")

    return data


df = missing_impute(df, num_method="median", cat_length=17)

## 2.3 Rare Analysis

In [None]:
def rare_analyser(dataframe, target, categorical_columns):
    """
    Print, for each categorical column, the count, share and target mean of
    every class so that rare classes can be spotted.

    Parameters
    ------
        dataframe: pd.DataFrame
                   DataFrame containing the data.
        target: str
                The name of the target column.
        categorical_columns: list
                  Names of the categorical columns to analyse.
    Returns
    ------
        None
    """
    # Iterate the argument; the global cat_cols must not leak in here.
    for col in categorical_columns:
        value_counts = dataframe[col].value_counts()
        ratio = value_counts / len(dataframe)
        target_mean = dataframe.groupby(col)[target].mean()

        print(f"{col}:", len(value_counts))

        rare_df = pd.DataFrame({"COUNT": value_counts,
                                "RATIO": ratio,
                                "TARGET_MEAN": target_mean})

        print(rare_df, end="\n\n\n")
        
        
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
# Fold sparse classes into broader groups, chosen from the class proportions.
# Rules are applied in order; each one relabels the listed classes of a column.
merge_rules = [
    ("ExterCond", ["Fa", "Po"], "FaPo"),
    ("ExterCond", ["Ex", "Gd"], "Ex"),
    ("LotShape", ["IR1", "IR2", "IR3"], "IR"),
    ("GarageQual", ["Fa", "Po"], "FaPo"),
    ("GarageQual", ["Ex", "Gd", "TA"], "ExGd"),
    ("BsmtFinType2", ["GLQ", "ALQ"], "RareExcellent"),
    ("BsmtFinType2", ["BLQ", "LwQ", "Rec"], "RareGof"),
]

for column, members, merged_label in merge_rules:
    df[column] = np.where(df[column].isin(members), merged_label, df[column])

def rare_encoder(dataframe, rare_perc):
    """
    Return a copy of ``dataframe`` in which, for every object column, classes
    whose share of rows is below ``rare_perc`` are relabelled 'Rare'.

    Columns without such classes are left as they are; the input frame is
    not modified.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    for name in encoded.columns:
        if encoded[name].dtypes != 'O':
            continue

        shares = encoded[name].value_counts() / n_rows
        rare_labels = shares[shares < rare_perc].index
        if len(rare_labels) == 0:
            continue

        encoded[name] = np.where(encoded[name].isin(rare_labels), 'Rare', encoded[name])

    return encoded


# rare_encoder works on a copy and returns it, so the result has to be
# assigned back for the 'Rare' grouping to take effect.
df = rare_encoder(df, 0.01)

## 2.4 Feature Extraction

In [None]:
# Columns reused by several derived features below.
living_area = df["GrLivArea"]
first_floor = df["1stFlrSF"]
year_sold = df["YrSold"]

# Overall evaluation: quality and condition ratings added together.
df["Overall"] = df[["OverallQual", "OverallCond"]].sum(axis=1)

# Garage.
# NOTE(review): GarageQual/GarageCond appear to hold text labels at this
# point, in which case the row-wise sum joins the labels — confirm intent.
df["TotalGarageQual"] = df[["GarageQual", "GarageCond"]].sum(axis=1)
df["NEW_1st*GrLiv"] = first_floor * living_area
df["NEW_Garage*GrLiv"] = df["GarageArea"] * living_area

# Total floor area (first + second floor).
df["NEW_TotalFlrSF"] = first_floor + df["2ndFlrSF"]

# Total finished basement area.
df["NEW_TotalBsmtFin"] = df["BsmtFinSF1"] + df["BsmtFinSF2"]

# Porch and deck area.
df["NEW_PorchArea"] = (
    df["OpenPorchSF"]
    + df["EnclosedPorch"]
    + df["ScreenPorch"]
    + df["3SsnPorch"]
    + df["WoodDeckSF"]
)

# Total house area (floors + basement).
df["NEW_TotalHouseArea"] = df["NEW_TotalFlrSF"] + df["TotalBsmtSF"]

# Bathrooms: a half bath counts as 0.5 in the combined figure.
df["NEW_TotalFullBath"] = df["BsmtFullBath"] + df["FullBath"]
df["NEW_TotalHalfBath"] = df["BsmtHalfBath"] + df["HalfBath"]
df["NEW_TotalBath"] = df["NEW_TotalFullBath"] + df["NEW_TotalHalfBath"] * 0.5

# Grading: quality multiplied by condition.
df["NEW_OverallGrade"] = df["OverallQual"] * df["OverallCond"]

# Ages measured at the time of sale.
df["NEW_HouseAge"] = year_sold - df["YearBuilt"]
df["NEW_RestorationAge"] = year_sold - df["YearRemodAdd"]

In [None]:
df[cat_cols].nunique()

In [None]:
# Columns removed before encoding.
drop_list = [
    "GarageYrBlt", "Street", "Utilities", "Alley", "LandSlope", "LandContour",
    "Heating", "PoolQC", "MiscFeature", "Neighborhood", "LotFrontage", "MasVnrArea",
]
df = df.drop(columns=drop_list)

In [None]:
df.head()

## 2.5 Encoding

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
binary_cols = [col for col in df.columns if df[col].dtypes == "O" and len(df[col].unique()) == 2]
binary_cols

In [None]:
def label_encoder(dataframe, binary_col):
    """
    Replace ``binary_col`` in place with integer codes from a freshly fitted
    ``LabelEncoder`` and return the same DataFrame.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe


for col in binary_cols:
    df = label_encoder(df, col)

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """
    Return ``dataframe`` with ``categorical_cols`` expanded into dummy columns.

    ``drop_first`` is forwarded to ``pd.get_dummies`` to omit the first level
    of every encoded column.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)


df = one_hot_encoder(df, cat_cols, drop_first=True)

# 3. Modelling

In [None]:
train_df = df[df['SalePrice'].notnull()]
test_df = df[df['SalePrice'].isnull()]

In [None]:
train_df.shape, test_df.shape

In [None]:
y = train_df['SalePrice']
X = train_df.drop(["SalePrice"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)

In [None]:
# Replace spaces in column names with underscores on every frame used for
# modelling.
# NOTE(review): X_train / X_test were split from X before this rename and are
# not renamed here — confirm that is acceptable for the grid search.
for frame in (train_df, test_df, X):
    frame.columns = frame.columns.str.replace(" ", "_")

In [None]:
# Baseline comparison: every candidate is scored with 5-fold cross-validated
# RMSE (square root taken per fold, then averaged across folds).
models = [
    ('LR', LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("ElasticNet", ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('GBM', GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor(objective='reg:squarederror')),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=False)),
]

for name, regressor in models:
    neg_mse = cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")
    fold_rmse = np.sqrt(-neg_mse)
    rmse = np.mean(fold_rmse)
    print(f"RMSE: {round(rmse, 4)} ({name})")

In [None]:
df['SalePrice'].mean()

In [None]:
df['SalePrice'].std()

In [None]:
lgbm_model = LGBMRegressor(force_col_wise=True)
rmse = np.mean(np.sqrt(-cross_val_score(lgbm_model, X, y, cv=5, scoring="neg_mean_squared_error")))
rmse

In [None]:
lgbm_params = {"learning_rate": [0.01, 0.1],
               "n_estimators": [500, 1500]
              }

lgbm_gs = GridSearchCV(lgbm_model,
                            lgbm_params,
                            cv=3,
                            n_jobs=-1,
                            verbose=True).fit(X_train, y_train)

In [None]:
lgbm_gs.best_params_

In [None]:
lgbm_gs.best_score_

In [None]:
final_model = lgbm_model.set_params(**lgbm_gs.best_params_).fit(X, y)

rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=5, scoring="neg_mean_squared_error")))
rmse

## 3.1 Feature Importance

In [None]:
def plot_importance(model, features, num=None, save=False):
    """
    Draw a horizontal bar chart of the most important features of ``model``.

    Parameters
    ------
    model: fitted estimator
            Must expose ``feature_importances_``.
    features: dataframe
            Feature frame whose columns correspond to the importances.
    num: int, optional
            Number of top features to show; all features when omitted.
    save: bool, optional
            When true the figure is also written to 'importances.png'.

    Returns
    ------
        None
    """
    # Resolve the default at call time from the frame actually passed in,
    # rather than from a global evaluated when the function is defined.
    if num is None:
        num = len(features.columns)

    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features, palette="husl")
    plt.title('Feature Importance')
    plt.tight_layout()

    # Save before show(): once the figure has been shown, savefig may write
    # an empty canvas.
    if save:
        plt.savefig('importances.png')
    plt.show()
        
model = LGBMRegressor()
model.fit(X, y)


plot_importance(model, X, num=20)

In [None]:
# Fit a LightGBM regressor with default settings on all labelled rows.
model = LGBMRegressor().fit(X, y)

# Predict the unlabelled rows using every column except the (empty) target.
test_features = test_df.drop(columns=["SalePrice"])
prediction = model.predict(test_features)

# Assemble the submission table: integer Id plus predicted price.
submit_dict = {"Id": test_df["Id"], "SalePrice": prediction}
submit_df = pd.DataFrame(submit_dict).astype({"Id": "int32"})
submit_df.to_csv("house_price_predictions.csv", index=False)

In [None]:
submit_df.head()