In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns

import re

# Select seaborn's 'whitegrid' style for the notebook's plots.
sns.set_style(style='whitegrid')
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

In [2]:
from imblearn.over_sampling import SMOTE

# Dataset Description

## File descriptions

**train.csv** - the training set

**test.csv** - the test set

**data_description.txt** - full description of each column, originally prepared by Dean De Cock but lightly edited to match the column names used here

**sample_submission.csv** - a benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms

## Data fields

Here's a brief version of what you'll find in the data description file.

- **SalePrice:** the property's sale price in dollars. This is the target variable that you're trying to predict.
- **MSSubClass:** The building class
- **MSZoning:** The general zoning classification
- **LotFrontage:** Linear feet of street connected to property
- **LotArea:** Lot size in square feet
- **Street:** Type of road access
- **Alley:**  Type of alley access
- **LotShape:**  General shape of property
- **LandContour:**  Flatness of the property
- **Utilities:**  Type of utilities available
- **LotConfig:**  Lot configuration
- **LandSlope:**  Slope of property
- **Neighborhood:**  Physical locations within Ames city limits
- **Condition1:**  Proximity to main road or railroad
- **Condition2:**  Proximity to main road or railroad (if a second is present)
- **BldgType:**  Type of dwelling
- **HouseStyle:**  Style of dwelling
- **OverallQual:**  Overall material and finish quality
- **OverallCond:**  Overall condition rating
- **YearBuilt:**  Original construction date
- **YearRemodAdd:**  Remodel date
- **RoofStyle:**  Type of roof
- **RoofMatl:**  Roof material
- **Exterior1st:**  Exterior covering on house
- **Exterior2nd:**  Exterior covering on house (if more than one material)
- **MasVnrType:**  Masonry veneer type
- **MasVnrArea:**  Masonry veneer area in square feet
- **ExterQual:**  Exterior material quality
- **ExterCond:**  Present condition of the material on the exterior
- **Foundation:**  Type of foundation
- **BsmtQual:**  Height of the basement
- **BsmtCond:**  General condition of the basement
- **BsmtExposure:**  Walkout or garden level basement walls
- **BsmtFinType1:**  Quality of basement finished area
- **BsmtFinSF1:**  Type 1 finished square feet
- **BsmtFinType2:**  Quality of second finished area (if present)
- **BsmtFinSF2:**  Type 2 finished square feet
- **BsmtUnfSF:**  Unfinished square feet of basement area
- **TotalBsmtSF:**  Total square feet of basement area
- **Heating:**  Type of heating
- **HeatingQC:**  Heating quality and condition
- **CentralAir:**  Central air conditioning
- **Electrical:**  Electrical system
- **1stFlrSF:**  First Floor square feet
- **2ndFlrSF:**  Second floor square feet
- **LowQualFinSF:**  Low quality finished square feet (all floors)
- **GrLivArea:**  Above grade (ground) living area square feet
- **BsmtFullBath:**  Basement full bathrooms
- **BsmtHalfBath:**  Basement half bathrooms
- **FullBath:**  Full bathrooms above grade
- **HalfBath:**  Half baths above grade
- **BedroomAbvGr:**  Number of bedrooms above basement level
- **KitchenAbvGr:**  Number of kitchens
- **KitchenQual:**  Kitchen quality
- **TotRmsAbvGrd:**  Total rooms above grade (does not include bathrooms)
- **Functional:**  Home functionality rating
- **Fireplaces:**  Number of fireplaces
- **FireplaceQu:**  Fireplace quality
- **GarageType:**  Garage location
- **GarageYrBlt:**  Year garage was built
- **GarageFinish:**  Interior finish of the garage
- **GarageCars:**  Size of garage in car capacity
- **GarageArea:**  Size of garage in square feet
- **GarageQual:**  Garage quality
- **GarageCond:**  Garage condition
- **PavedDrive:**  Paved driveway
- **WoodDeckSF:**  Wood deck area in square feet
- **OpenPorchSF:**  Open porch area in square feet
- **EnclosedPorch:**  Enclosed porch area in square feet
- **3SsnPorch:**  Three season porch area in square feet
- **ScreenPorch:**  Screen porch area in square feet
- **PoolArea:**  Pool area in square feet
- **PoolQC:**  Pool quality
- **Fence:**  Fence quality
- **MiscFeature:**  Miscellaneous feature not covered in other categories
- **MiscVal:**  Dollar value of the miscellaneous feature
- **MoSold:**  Month Sold
- **YrSold:**  Year Sold
- **SaleType:**  Type of sale
- **SaleCondition:**  Condition of sale

In [3]:
# Read the competition's train and test splits from disk.
df_train = pd.read_csv("data/competition/train.csv")
df_test = pd.read_csv("data/competition/test.csv")

# Stack the two splits row-wise (train rows first, then test rows) so they can
# be inspected together.
# NOTE(review): each split keeps its own row labels, so index labels repeat in
# df_combine (info() reports "2919 entries, 0 to 1458"). Prefer positional
# access, or reset the index, before relying on label-based lookups.
df_combine = pd.concat((df_train, df_test), axis=0)

In [4]:
# Print a structural summary of the combined frame: index range, column names,
# non-null counts and dtypes. Useful here for spotting sparsely populated
# columns (e.g. Alley and LotFrontage have fewer non-null entries than rows).
df_combine.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallCond  

In [79]:
# Preview the first rows of the combined frame (these come from the train
# split, which was concatenated first, so SalePrice is populated).
df_combine.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [103]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. SalePrice has a lower count than the other columns because it is
# only present for the train rows.
df_combine.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,2919.0,2919.0,2433.0,2919.0,2919.0,2919.0,2919.0,2919.0,2896.0,2918.0,...,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,1460.0
mean,1460.0,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,...,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737,180921.19589
std,842.787043,42.517628,23.344905,7886.996359,1.409947,1.113131,30.291442,20.894344,179.334253,455.610826,...,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,730.5,20.0,59.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129975.0
50%,1460.0,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.5,...,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,2189.5,70.0,80.0,11570.0,7.0,6.0,2001.0,2004.0,164.0,733.0,...,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,2919.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


In [80]:
# List the distinct sale years present across the combined train+test rows.
df_combine['YrSold'].unique()

array([2008, 2007, 2006, 2009, 2010])

In [100]:
# Area / square-footage measurements to leave out of the reduced frame.
drop_columns = [
    'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
]

# Additional columns to leave out.
# NOTE(review): these look like the columns with many missing values
# (e.g. Alley, LotFrontage) — confirm the selection criterion.
drop_test = [
    'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
    'LotFrontage', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'GarageType', 'GarageYrBlt', 'GarageFinish',
    'GarageQual', 'GarageCond',
]

# Remove both groups of columns from the combined train+test frame in one call.
# NOTE(review): this rebinds df_test — which previously held the raw test split
# read from test.csv — to a reduced copy of df_combine (train AND test rows).
df_test = df_combine.drop(columns=drop_columns + drop_test)

In [102]:
# Report every remaining column that still contains missing values, together
# with its missing-value count; fully populated columns are skipped.
for column in df_test.columns:
    n_missing = df_test[column].isnull().sum()
    if n_missing == 0:
        continue
    print(f'{column} : {n_missing}')

# Cell output: number of columns left after the drops.
len(df_test.columns)

MSZoning : 4
Utilities : 2
Exterior1st : 1
Exterior2nd : 1
Electrical : 1
BsmtFullBath : 2
BsmtHalfBath : 2
KitchenQual : 1
Functional : 2
GarageCars : 1
SaleType : 1
SalePrice : 1459


50