In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Custom functions

# Read data

In [None]:
# read data from csv
# keep_default_na=False is passed so pandas does not apply its default list of
# NaN markers while parsing.
train_csv_path = '/kaggle/input/home-data-for-ml-course/train.csv'
data_df = pd.read_csv(train_csv_path, keep_default_na = False)
print(data_df.shape)

# The shape printed above still includes 'Id'; the column is removed from here on.
data_df = data_df.drop('Id', axis = 'columns')

# Preview and summary are displayed; null counts and dtypes are written to CSV.
display(data_df.head())
data_df.isnull().sum().to_csv("count_na.csv")
data_df.dtypes.to_csv("data_types.csv")
display(data_df.describe())

# Preprocess data

In [None]:
# Work on a copy so the frame loaded above (data_df) is left as it was read.
data_processed = data_df.copy()

# identify correct data type for each column
# Maps column name -> type label. The preprocessing functions further down
# select their columns by label:
#   'category' -> create_binary_columns (one-hot encoding)
#   'numeric'  -> process_numeric_columns and scaling
#   'ordinal'  -> process_ordinal_columns
# The 'id' label matches none of those filters, and the 'Id' column itself has
# already been dropped from data_df.
# NOTE: key order matters - it determines the order in which category columns
# are handed to the one-hot encoder.
column_types_dict = {
        'Id': 'id',
        'MSSubClass': 'category',
        'MSZoning': 'category',
        'LotFrontage': 'numeric',
        'LotArea': 'numeric',
        'Street': 'category',
        'Alley': 'category',
        'LotShape': 'ordinal',
        'LandContour': 'category',
        'Utilities': 'ordinal',
        'LotConfig': 'category',
        'LandSlope': 'ordinal',
        'Neighborhood': 'category',
        'Condition1': 'category',
        'Condition2': 'category',
        'BldgType': 'category',
        'HouseStyle': 'category',
        'OverallQual': 'numeric',
        'OverallCond': 'numeric',
        'YearBuilt': 'numeric',
        'YearRemodAdd': 'numeric',
        'RoofStyle': 'category',
        'RoofMatl': 'category',
        'Exterior1st': 'category',
        'Exterior2nd': 'category',
        'MasVnrType': 'category',
        'MasVnrArea': 'numeric',
        'ExterQual': 'ordinal',
        'ExterCond': 'ordinal',
        'Foundation': 'category',
        'BsmtQual': 'ordinal',
        'BsmtCond': 'ordinal',
        'BsmtExposure': 'ordinal',
        'BsmtFinType1': 'ordinal',
        'BsmtFinSF1': 'numeric',
        'BsmtFinType2': 'ordinal',
        'BsmtFinSF2': 'numeric',
        'BsmtUnfSF': 'numeric',
        'TotalBsmtSF': 'numeric',
        'Heating': 'category',
        'HeatingQC': 'ordinal',
        'CentralAir': 'category',
        'Electrical': 'category',
        '1stFlrSF': 'numeric',
        '2ndFlrSF': 'numeric',
        'LowQualFinSF': 'numeric',
        'GrLivArea': 'numeric',
        'BsmtFullBath': 'numeric',
        'BsmtHalfBath': 'numeric',
        'FullBath': 'numeric',
        'HalfBath': 'numeric',
        'BedroomAbvGr': 'numeric',
        'KitchenAbvGr': 'numeric',
        'KitchenQual': 'ordinal',
        'TotRmsAbvGrd': 'numeric',
        'Functional': 'ordinal',
        'Fireplaces': 'numeric',
        'FireplaceQu': 'ordinal',
        'GarageType': 'category',
        'GarageYrBlt': 'numeric',
        'GarageFinish': 'ordinal',
        'GarageCars': 'numeric',
        'GarageArea': 'numeric',
        'GarageQual': 'ordinal',
        'GarageCond': 'ordinal',
        'PavedDrive': 'ordinal',
        'WoodDeckSF': 'numeric',
        'OpenPorchSF': 'numeric',
        'EnclosedPorch': 'numeric',
        '3SsnPorch': 'numeric',
        'ScreenPorch': 'numeric',
        'PoolArea': 'numeric',
        'PoolQC': 'ordinal',
        'Fence': 'ordinal',
        'MiscFeature': 'category',
        'MiscVal': 'numeric',
        'MoSold': 'numeric',
        'YrSold': 'numeric',
        'SaleType': 'category',
        'SaleCondition': 'category',
        'SalePrice': 'numeric',
}

## binary columns for categories

In [None]:
# create binary columns from category columns
def create_binary_columns(
    column_types_dict,
    df,
    encoder = None,
):
    """One-hot encode every column labelled 'category'.

    Parameters
    ----------
    column_types_dict : dict
        Maps column name to its type label; the columns labelled 'category'
        are the ones encoded. All of them must be present in ``df``.
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    encoder : optional
        Already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out``. When omitted, a new ``OneHotEncoder`` is
        fitted on the category columns of ``df``.

    Returns
    -------
    tuple
        ``(encoder, encoded_df)``: the encoder that was used, and ``df``
        without the category columns followed by the indicator columns.
    """
    category_col = [col for col, col_type in column_types_dict.items() if col_type == 'category']

    if encoder is None:
        one_hot_enc = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
        one_hot_enc.fit(df[category_col])
    else:
        one_hot_enc = encoder

    binary_values = one_hot_enc.transform(df[category_col])
    binary_columns = one_hot_enc.get_feature_names_out(category_col)
    # Carry over df's index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign (and NaN-pad) rows whenever df
    # has been filtered, shuffled or otherwise re-indexed.
    binary_df = pd.DataFrame(binary_values, columns = binary_columns, index = df.index)

    # merge original data with binary columns
    df = df.drop(columns = category_col)
    df = pd.concat([df, binary_df], axis = 1)

    return one_hot_enc, df

# No encoder is passed, so one is fitted on the training frame; it is kept in
# one_hot_encoder so the test data can later be encoded with the same columns.
one_hot_encoder, data_processed = create_binary_columns(column_types_dict, data_processed)

## numeric columns

In [None]:
# process numeric columns
def process_numeric_columns(
    column_types_dict,
    df,
    imputers = None
):
    """Coerce non-numeric 'numeric' columns to numbers and impute the gaps.

    Only columns labelled 'numeric' that are present in ``df`` and whose dtype
    is neither float nor int are touched. Their values are converted with
    ``pd.to_numeric(errors='coerce')``, so anything unparsable becomes NaN,
    and the NaNs are then filled by a per-column imputer.

    Parameters
    ----------
    column_types_dict : dict
        Maps column name to its type label.
    df : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    imputers : dict, optional
        Already-fitted imputers keyed by column name. A column found here is
        transformed with the stored imputer; otherwise a new ``SimpleImputer``
        is fitted on ``df`` and added to this dict.

    Returns
    -------
    tuple
        ``(imputers, df)``.
    """
    # Build the dict per call: a `{}` default is created once and would be
    # shared (and keep growing) across every call that omits the argument.
    if imputers is None:
        imputers = {}

    num_col = [col for col, col_type in column_types_dict.items() if col_type == 'numeric']

    for col in num_col:
        if col not in df.columns:
            continue
        # Columns already stored as float/int are left exactly as they are.
        if (df[col].dtype == float) or (df[col].dtype == int):
            continue

        df[col] = pd.to_numeric(df[col], errors = 'coerce')
        values = np.array(df[col]).reshape(-1,1)

        # fill in missing values
        if col in imputers:
            imputer = imputers[col]
        else:
            if col == 'MasVnrArea':
                # MasVnrArea gaps are filled with 0.0 rather than the mean
                # (presumably a missing value means no veneer - confirm).
                imputer = SimpleImputer(
                    missing_values=np.nan,
                    strategy='constant',
                    fill_value = 0.0
                )
            else:
                imputer = SimpleImputer(
                    missing_values=np.nan,
                    strategy='mean',
                )
            imputer.fit(values)
            imputers[col] = imputer

        # transform() returns shape (n, 1); flatten it to a plain 1-D column.
        df[col] = np.asarray(imputer.transform(values)).ravel()

    return imputers, df
    
# No imputers are passed, so they are fitted on the training frame; num_imputers
# keeps them so the test data can be filled with the same statistics.
num_imputers, data_processed = process_numeric_columns(column_types_dict, data_processed)

## ordinal columns

In [None]:
# process ordinal columns
def process_ordinal_columns(
    column_types_dict,
    df,
    ordinal_encoders = None,
):
    """Replace each 'ordinal' column with numeric codes in a fixed order.

    For every column labelled 'ordinal' that is present in ``df``, values are
    mapped to their position in that column's list in ``ordinal_translation``.
    Values not in the list become NaN and are then filled with the most
    frequent code.

    Parameters
    ----------
    column_types_dict : dict
        Maps column name to its type label.
    df : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    ordinal_encoders : dict, optional
        Already-fitted transformers keyed by column name. A column found here
        is transformed with the stored object; otherwise a new transformer is
        fitted on ``df`` and added to this dict.

    Returns
    -------
    tuple
        ``(ordinal_encoders, df)``. Transformers fitted by this function are
        ``Pipeline`` objects (encoder followed by imputer), so the fill value
        learned during fitting is reused on later frames.
    """
    # Build the dict per call: a `{}` default is created once and would be
    # shared across every call that omits the argument.
    if ordinal_encoders is None:
        ordinal_encoders = {}

    ordinal_col = [col for col, col_type in column_types_dict.items() if col_type == 'ordinal']

    ordinal_translation = {
        "LotShape": ['Reg', 'IR1', 'IR2', 'IR3'],
        'Utilities': ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
        'LandSlope': ['Gtl', 'Mod', 'Sev'],
        'ExterQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
        'ExterCond': ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
        'BsmtQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
        'BsmtCond': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
        'BsmtExposure': ['Gd', 'Av', 'Mn', 'No', 'NA'],
        'BsmtFinType1': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'],
        'BsmtFinType2': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'],
        'HeatingQC': ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
        'KitchenQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
        'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
        'FireplaceQu': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
        'GarageFinish': ['Fin', 'RFn', 'Unf', 'NA'],
        'GarageQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
        'GarageCond': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
        'PavedDrive': ['Y', 'P', 'N'],
        'PoolQC': ['Ex', 'Gd', 'TA', 'Fa', 'NA'],
        'Fence': ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA'],
    }

    for col in ordinal_col:
        if col not in df.columns:
            continue

        values = np.array(df[col]).reshape(-1,1)

        if col in ordinal_encoders:
            encoder = ordinal_encoders[col]
        else:
            # Fit encoder and NaN-imputer together so the most frequent code
            # is learned once and reused, instead of being re-estimated from
            # whichever frame is being transformed.
            encoder = Pipeline([
                ('encode', OrdinalEncoder(
                    categories = [ordinal_translation[col]],
                    handle_unknown = "use_encoded_value",
                    unknown_value = np.nan
                )),
                ('impute', SimpleImputer(strategy = 'most_frequent')),
            ]).fit(values)
            ordinal_encoders[col] = encoder

        encoded = encoder.transform(values)

        if not isinstance(encoder, Pipeline):
            # A bare encoder was supplied by the caller, so no fitted imputer
            # exists for it; fill NaNs from the current frame as before.
            encoded = SimpleImputer(strategy = 'most_frequent').fit_transform(encoded)

        # transform() returns shape (n, 1); flatten it to a plain 1-D column.
        df[col] = np.asarray(encoded).ravel()

    return ordinal_encoders, df
    
# No encoders are passed, so they are fitted on the training frame; the returned
# dict is kept so the test data can be encoded with the same objects.
ordinal_encoders, data_processed = process_ordinal_columns(column_types_dict, data_processed)

# Other feature engineering

In [None]:
# feature Engineering
def feature_engineering(column_types_dict, df):
    """Add the HouseAge feature and keep SalePrice as the last column.

    Parameters
    ----------
    column_types_dict : dict
        Maps column name to its type label. It is updated in place with
        ``'HouseAge': 'numeric'``.
    df : pandas.DataFrame
        Frame with ``YrSold`` and ``YearBuilt`` columns. ``HouseAge``
        (``YrSold - YearBuilt``) is added to it in place.

    Returns
    -------
    tuple
        ``(column_types_dict, frame)``. If ``df`` has a ``SalePrice`` column,
        ``frame`` is a new frame with that column moved to the end; otherwise
        it is ``df`` itself.
    """
    df["HouseAge"] = df["YrSold"] - df['YearBuilt']
    column_types_dict["HouseAge"] = 'numeric'

    if 'SalePrice' not in df.columns:
        return column_types_dict, df

    # Pick the final column order in a single selection. Assigning SalePrice
    # into a column subset afterwards is chained assignment and triggers
    # pandas' SettingWithCopyWarning.
    feature_cols = [col for col in df.columns if col != 'SalePrice']
    return column_types_dict, df[feature_cols + ['SalePrice']]

# Adds HouseAge to the training frame and registers it as 'numeric' in
# column_types_dict, so the later numeric steps pick it up.
column_types_dict, data_processed = feature_engineering(column_types_dict, data_processed)

# Scaling

In [None]:
def scaling(
    column_types_dict,
    df,
    scalers = {}
):
    """Standardise every 'numeric' column of ``df`` except SalePrice.

    ``df`` is modified in place and also returned. For each affected column a
    scaler stored in ``scalers`` under the column name is reused; if there is
    none, a ``StandardScaler`` is fitted on ``df`` and stored there.

    Returns ``(scalers, df)``.
    """
    # NOTE: the `{}` default is a single dict created when the function is
    # defined, so calls that omit `scalers` all share (and add to) the same
    # dict, including scalers fitted by an earlier such call.
    numeric_names = [
        name for name, kind in column_types_dict.items()
        if kind in ['numeric']
    ]

    for name in numeric_names:
        # The target column is left in its original units.
        if name == 'SalePrice' or name not in df.columns:
            continue

        column_2d = np.array(df[name]).reshape(-1,1)
        if name not in scalers:
            scalers[name] = StandardScaler().fit(column_2d)
        df[name] = scalers[name].transform(column_2d)

    return scalers, df

# No scalers are passed, so they are fitted on the training frame; num_scalers
# holds them keyed by column name.
num_scalers, data_processed = scaling(column_types_dict, data_processed)

In [None]:
# Inspect the fully processed training frame, then write the frame, its
# per-column null counts and its dtypes to CSV files.
display(data_processed.head())
display(data_processed.describe())
data_processed.isnull().sum().to_csv('processed_data_count_na.csv')
data_processed.to_csv('processed_data.csv')
data_processed.dtypes.to_csv("processed_data_types.csv")

# Exploratory Analysis

## correlations

In [None]:
# Columns labelled numeric or ordinal are the ones included in the correlation
# matrix; the one-hot indicator columns have no entry in column_types_dict
# and are therefore left out.
num_ordinal_col = [
    name for name in data_processed.columns
    if column_types_dict.get(name) in ('numeric', 'ordinal')
]

num_df = data_processed[num_ordinal_col]

# Diverging palette centred on zero so the sign of a correlation is visible.
corr_fig, corr_ax = plt.subplots(figsize=(10,8))
sns.heatmap(
    num_df.corr(),
    cmap = sns.diverging_palette(240, 10, n = 20),
    center = 0,
    ax = corr_ax
)
plt.show()

## distribution of numeric values

In [None]:
# One histogram per column in num_ordinal_col (the numeric/ordinal column list
# built for the correlation heatmap).
num_ordinal_df = data_processed[num_ordinal_col]
num_ordinal_df.hist(figsize=(20,20))
plt.show()

## distribution of sales prices by category

In [None]:
# Category columns are taken from data_df (the frame before one-hot encoding),
# where they still hold their original labels.
cat_col = [
    name for name in data_df.columns
    if column_types_dict.get(name) == 'category'
]

# One box plot of SalePrice per category column, laid out on a 10 x 3 grid.
box_fig = plt.figure(figsize = (20,60))
for position, col in enumerate(cat_col, start = 1):
    box_ax = box_fig.add_subplot(10, 3, position)
    sns.boxplot(
        data = data_df,
        x = data_df[col],
        y = data_df['SalePrice'],
        ax = box_ax
    )
plt.show()

# Modeling

In [None]:
# Every column except the target is a feature.
X_columns = [name for name in data_processed.columns if name != 'SalePrice']
X_train = data_processed[X_columns]
Y_train = data_processed['SalePrice']

## LASSO

In [None]:
# Lasso regression with alpha chosen by 10-fold cross-validated grid search,
# scored by negated RMSE.
lasso = Lasso(max_iter = 1000000)

parameters = {'alpha':[0.001, 0.0015, 0.002, 0.00225]}

lasso_clf = GridSearchCV(
    estimator = lasso,
    param_grid = parameters,
    scoring = 'neg_root_mean_squared_error',
    cv = 10
)
lasso_clf = lasso_clf.fit(X = X_train, y = Y_train)

In [None]:
# Best alpha and its cross-validated score. The scorer is a negated RMSE, so
# values closer to zero are better.
print(lasso_clf.best_params_)
print(lasso_clf.best_score_)

## Random Forest

In [None]:
# A fixed random_state makes the forest deterministic, so the grid-search
# scores and the predictions made from rf_clf are the same on every
# Restart & Run All.
rf = RandomForestRegressor(random_state = 0)

# `parameters` is rebound here; it previously held the Lasso grid.
parameters = {
    'n_estimators':[50, 100, 200],
    'max_depth': [8,10],
}

# 10-fold cross-validated grid search, scored by negated RMSE.
rf_clf = GridSearchCV(
    estimator = rf,
    param_grid = parameters,
    scoring = 'neg_root_mean_squared_error',
    cv = 10
).fit(X = X_train, y = Y_train)

In [None]:
# Best forest settings and their cross-validated score. The scorer is a negated
# RMSE, so values closer to zero are better.
print(rf_clf.best_params_)
print(rf_clf.best_score_)

# Load test data

In [None]:
# Read the test set with the same keep_default_na=False setting used for the
# training data.
test_df = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv',
                      keep_default_na= False)
print(test_df.shape)
# Keep the ids aside for the submission file before dropping the column.
id_df = test_df['Id']
test_df = test_df.drop(columns = ['Id'])

In [None]:
# Placeholder target column; it is dropped again below once the preprocessing
# functions have run (presumably added so the test frame has the same columns
# as the training frame - confirm whether it is still needed).
test_df['SalePrice'] = 100.0

# Apply the transformers fitted on the training data; the returned transformer
# objects are not needed again, hence `_`.
# NOTE: this cell overwrites test_df, so it is not safe to run twice without
# reloading the test data first.
_ , test_df = create_binary_columns(
    column_types_dict,
    test_df,
    encoder = one_hot_encoder
)

_ , test_df = process_numeric_columns(
    column_types_dict,
    test_df,
    imputers = num_imputers
)

_, test_df = process_ordinal_columns(
    column_types_dict,
    test_df,
    ordinal_encoders = ordinal_encoders
)

test_df = test_df.drop(columns = ['SalePrice'])

# Write the processed test frame and its per-column null counts to CSV.
test_df.to_csv('test_df_processed.csv')
test_df.isnull().sum().to_csv("count_na_test.csv")

display(test_df)

In [None]:
# Add HouseAge to the test frame, as was done for the training frame.
_, test_df = feature_engineering(column_types_dict, test_df)
# Pass the scalers fitted on the training data explicitly. Omitting them only
# produced training-consistent scaling because scaling()'s default dict is
# shared between calls.
_, test_df = scaling(column_types_dict, test_df, scalers = num_scalers)

# Predict test data

In [None]:
# Predict with the grid-searched random forest.
prediction = rf_clf.predict(test_df)
# No inverse transform is applied: scaling() skips SalePrice, so the model was
# trained on unscaled prices and num_scalers has no "SalePrice" entry.
print(prediction)

In [None]:
# Build the two-column submission frame: predictions first, then the ids saved
# when the test data was loaded (matched on row index), with Id placed first.
prediction_df = (
    pd.DataFrame({"SalePrice": prediction})
    .assign(Id = id_df)
    [['Id', "SalePrice"]]
)
display(prediction_df)
prediction_df.to_csv('submission.csv', index = False)