# House Price EDA and Prediction

![](https://i.imgur.com/1Ok0S6w.png)

## Initial Code and Data Import

### To Do
- Actual EDA
    - Review EDA lesson
- Feature engineering
    - Mutual information
    - PCA
    - k-Means Clustering
    - Encoding
        - Look at other options here

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.cluster import KMeans

import os
# Print the full path of every file found under the Kaggle input directory so
# the available data files show up in the cell output.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Load the labelled training rows (features plus the SalePrice target).
TRAIN_CSV_PATH = '../input/house-prices-advanced-regression-techniques/train.csv'
house_data = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)

## EDA

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns. `describe` is a method: without the call parentheses the cell only
# echoes the bound-method object instead of computing the statistics.
house_data.describe()

In [None]:
# (row count, column count) of the training frame.
house_data.shape

## Predictions

In [None]:
# Separate the regression target from the predictor columns.
y = house_data['SalePrice']
X = house_data.drop(columns=['SalePrice'])

In [None]:
# Show the predictor column names available for feature engineering.
X.columns

In [None]:
# Hand-crafted features. New columns are appended in a fixed order because the
# column lists built later follow the order of `X.columns`.
lot_area = X['LotArea']

# Combined category label: plain string concatenation, no separator.
X['Topography'] = X['LotConfig'] + X['LandContour']

# Lot area per unit of street frontage (NaN wherever LotFrontage is missing).
X['Geometry'] = lot_area / X['LotFrontage']

# Basement + both floors + garage, reused by the two ratios below.
indoor_sqft = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'] + X['GarageArea']
X['TotalIndoorSqFt'] = indoor_sqft
X['HouseToYardRatio'] = indoor_sqft / lot_area
# The +1 keeps the denominator non-zero for rows whose PoolArea is 0.
X['HouseToPoolRatio'] = indoor_sqft / (X['PoolArea'] + 1)

X['Value'] = X['OverallCond'] * X['OverallQual']
X['Condition'] = X['Condition1'] + X['ExterCond']

# Note: computed as (deck + open porch + 1) divided by the lot area.
seating_sqft = X['WoodDeckSF'] + X['OpenPorchSF'] + 1
X['YardToSeatingAreaRatio'] = seating_sqft / lot_area

X['Meh'] = X['Fireplaces'] * X['TotRmsAbvGrd']

In [None]:
# Distinct fireplace counts present in the training data.
X['Fireplaces'].unique()

In [None]:
# Partition the columns by dtype, walking the dtype table once per list.
column_dtypes = X.dtypes

# Text columns with fewer than 50 distinct values are one-hot encoded later.
categorical_cols = [
    name
    for name, dtype in column_dtypes.items()
    if dtype == "object" and X[name].nunique() < 50
]

# Integer and float columns are passed through numeric imputation.
numerical_cols = [
    name
    for name, dtype in column_dtypes.items()
    if dtype in ['int64', 'float64']
]

In [None]:
# Numeric columns: fill missing values using SimpleImputer's 'constant'
# strategy (no explicit fill_value is supplied).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list through its matching transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Gradient-boosted regressor used as the final estimator.
model = XGBRegressor(
    random_state=42,
    n_estimators=350,
    max_depth=3,
    learning_rate=0.01,
    booster='dart',
)

In [None]:
# Cluster the preprocessed rows and expose the cluster id as a categorical
# feature. random_state and n_init are pinned so that re-running the cell
# yields the same cluster labels (an unseeded KMeans relabels on every run).
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
X["Cluster"] = kmeans.fit_predict(preprocessor.fit_transform(X))
X["Cluster"] = X["Cluster"].astype("category")

# `numerical_cols` / `categorical_cols` were computed before "Cluster" existed
# and ColumnTransformer drops unlisted columns by default, so `preprocessor`
# would silently discard the new feature. The model therefore gets its own
# column transformer that additionally one-hot encodes the cluster id.
model_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('cluster', OneHotEncoder(handle_unknown='ignore'), ['Cluster']),
    ])

pipeline = Pipeline(steps=[('preprocessor', model_preprocessor),
                           ('model', model)
                           ])

# Hold out a validation split (default test size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)

# Evaluate the model. The root is taken explicitly because the `squared`
# argument of mean_squared_error is deprecated and removed in newer
# scikit-learn releases.
score = np.sqrt(mean_squared_error(y_test, preds))
print('RMSE:', score)

## Model Optimization

In [None]:
# parameters = {
#     'max_depth': [3, 5, 10, None],
#     'n_estimators': [100, 200, 300, 400, 500],
#     'learning_rate': [0.01, 0.1, 0.5],
#     'booster' : ['gbtree', 'gblinear', 'dart']
# }

# rv = RandomizedSearchCV(model,
#                         param_distributions=parameters,
#                         n_iter=25,
#                         cv=5,
#                         n_jobs=-1,
#                         random_state=42)

# param_X = preprocessor.fit_transform(X)
# rv.fit(param_X, y)

# rv.best_params_, rv.best_score_

## Submission

In [None]:
# Load the unlabelled competition rows and rebuild the same engineered
# features, in the same order, that were added to the training frame.
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
test['Topography'] = test['LotConfig'] + test['LandContour']
test['Geometry'] = test['LotArea'] / test['LotFrontage']
test['TotalIndoorSqFt'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF'] + test['GarageArea']
test['HouseToYardRatio'] = test['TotalIndoorSqFt'] / test['LotArea']
test['HouseToPoolRatio'] = test['TotalIndoorSqFt'] / (test['PoolArea'] + 1)
test['Value'] = test['OverallCond'] * test['OverallQual']
test['Condition'] = test['Condition1'] + test['ExterCond']
test['YardToSeatingAreaRatio'] =  (test['WoodDeckSF'] + test['OpenPorchSF'] + 1) / test['LotArea']
test['Meh'] = test['Fireplaces'] * test['TotRmsAbvGrd']

# Train on every labelled row before touching the test set. Running this
# first also means that a `preprocessor` shared with the pipeline is fitted on
# the full training frame, i.e. the same data k-means was fitted on, so the
# transformed test matrix has the feature layout k-means expects.
pipeline.fit(X, y)

# Assign each test row to one of the clusters learned on the training data.
# Calling fit_predict here would refit k-means on the test set and produce
# cluster ids that have no relation to the training clusters.
test["Cluster"] = kmeans.predict(preprocessor.transform(test))
test["Cluster"] = test["Cluster"].astype("category")

preds = pipeline.predict(test)

# Write the submission file: one predicted SalePrice per test Id.
submission = pd.DataFrame({'Id': test.Id, 'SalePrice': preds})
submission.to_csv('submission.csv', index=False)