# 1/ Import Libraries

In [None]:
# Data Manipulation, Import, Export
import numpy as np 
import pandas as pd 

# Data Missing Value Treatment
from sklearn.impute import SimpleImputer

# Data Preprocessing
# Data Transformation - Categorical
from sklearn.preprocessing import OneHotEncoder

# Data Transformation - Numerical
from sklearn.preprocessing import StandardScaler

# Train and Test Split
from sklearn.model_selection import train_test_split

# Build Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Evaluate Model
from sklearn.metrics import mean_squared_error  # use squared = False for RMSE

# 2/ Import Data
- train (build model) : /kaggle/input/house-prices-advanced-regression-techniques/train.csv
- test (predict and submit) : /kaggle/input/house-prices-advanced-regression-techniques/test.csv

In [None]:
# Load the competition files. The competition's test.csv is bound to `predict`
# so it is not confused with the hold-out split made later by train_test_split.
train = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
predict = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)

In [None]:
# Preview the first rows of the training frame as loaded, before any columns are dropped
train.head()

In [None]:
# Drop Categorical Features
# The same categorical columns are removed from both the training frame and
# the prediction frame. The list is written once so the two drops cannot
# drift apart when a column name is added or removed.
dropped_categorical_columns = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]
train = train.drop(columns=dropped_categorical_columns)
predict = predict.drop(columns=dropped_categorical_columns)

In [None]:
# Load the competition's example submission file and preview it.
# It is only inspected here; no later visible cell reads `sample_submission`.
sample_submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_submission.head()

## Check Data

In [None]:
# train
# Print the column names, dtypes and non-null counts of the training frame
train.info()

In [None]:
# predict
# Print the column names, dtypes and non-null counts of the prediction frame
predict.info()

- How many Categorical Features?
- How many Numerical Features?
- Ids and Target feature?

# 3/ Divide Data into X and y

In [None]:
# Train
# Features: every remaining column except the Id column and the target
X = train.drop(columns=['Id', 'SalePrice'])
# Target: the SalePrice column
y = train['SalePrice']

In [None]:
# Predict
# Feature columns of the prediction frame: everything except Id
X_predict = predict.drop(columns=['Id'])

# 4/ Identify Numerical and Categorical Features

In [None]:
# dtypes treated as numerical: the listed integer and float widths only
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
# Labels of the columns of X whose dtype is in `numerics`
numerical_features = X.select_dtypes(include=numerics).columns.values
numerical_features

In [None]:
# Labels of the columns of X whose dtype is NOT in `numerics`.
# NOTE(review): presumably empty, since the categorical columns were dropped
# right after loading -- confirm from the cell output.
categorical_features = X.select_dtypes(exclude = numerics).columns.values
categorical_features

# 6/ Identify and Treat Missing Values

In [None]:
# Columns of X that contain at least one missing value
X.columns[X.isna().any()]

In [None]:
# Number of missing values in each column of X
X.isna().sum()

## Impute Missing Values - Numerical

In [None]:
# Declare Simple Imputer
# Missing values will be replaced by the column median
num_impute = SimpleImputer(strategy = 'median')

In [None]:
# Learn the fill value for each numerical column of X
# (the median, since num_impute uses strategy = 'median').
# NOTE(review): this fits on every row of X, before the train_test_split
# further down, so rows that later form the hold-out set also contribute
# to the medians.
num_impute.fit(X[numerical_features])

In [None]:
# Transform Train
# Replace missing values in X's numerical columns with the values learned by num_impute
X[numerical_features] = num_impute.transform(X[numerical_features])

In [None]:
# Transform Predict
# Reuse the imputer fitted on X so the prediction set is filled with the same values
X_predict[numerical_features] = num_impute.transform(X_predict[numerical_features])

## Impute Missing Values - Categorical

In [None]:
# Declare Simple Imputer
#cat_impute = SimpleImputer(strategy = 'constant', fill_value = 'missing')

# Fit the imputer on the categorical features
#cat_impute.fit(X[categorical_features])

# Transform Train
#X[categorical_features] = cat_impute.transform(X[categorical_features])

# Transform Predict
#X_predict[categorical_features] = cat_impute.transform(X_predict[categorical_features])

### Check Again

In [None]:
# Columns of X that still contain a missing value after imputation
X.columns[X.isna().any()]

# 7/ Transform - Numerical

In [None]:
# Standardise the numerical columns with a StandardScaler
num_scale = StandardScaler()

# Learn the scaling statistics from X and scale X in one step
X[numerical_features] = num_scale.fit_transform(X[numerical_features])

# Scale the prediction set with the statistics learned from X
X_predict[numerical_features] = num_scale.transform(X_predict[numerical_features])

In [None]:
# Preview the first rows of X after imputation and scaling
X.head()

# 8/ Encode Categorical

In [None]:
# Declare Categorical Encoder
# cat_encoder = OneHotEncoder(handle_unknown = 'ignore')

# Get Stats
# cat_encoder.fit_transform(X[categorical_features])

# Transform Train
#X[categorical_features] = cat_encoder.transform(X[categorical_features])

# Transform Predict
# X_predict[categorical_features] = cat_encoder.transform(X_predict[categorical_features])

# 9/ Build Data for train and test

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# 10/ Build Model

In [None]:
# Random forest of 10 trees, each limited to depth 6; random_state fixes the seed
rf = RandomForestRegressor(n_estimators = 10, max_depth = 6, random_state = 0)

## Fit

In [None]:
# Train the forest on the training split only; the test split stays unseen
rf.fit(X_train, y_train)

## Predict

In [None]:
# train
# Predictions on the rows the model was fitted on
train_predict = rf.predict(X_train)
train_predict

In [None]:
# test
# Predictions on the held-out split
test_predict = rf.predict(X_test)
test_predict

# 11/ Eval

In [None]:
# Train RMSE
# mean_squared_error takes (y_true, y_pred). The square root is taken
# explicitly because the `squared` keyword was deprecated in scikit-learn 1.4
# and removed in 1.6, so passing it fails on current releases.
np.sqrt(mean_squared_error(y_train, train_predict))

In [None]:
# Test RMSE
# mean_squared_error takes (y_true, y_pred). The square root is taken
# explicitly because the `squared` keyword was deprecated in scikit-learn 1.4
# and removed in 1.6, so passing it fails on current releases.
np.sqrt(mean_squared_error(y_test, test_predict))

# 12/ Predict and Submit

In [None]:
# Predict
# Predictions for the prediction set, using the model fitted on X_train
predictions = rf.predict(X_predict)
predictions

In [None]:
# Create Submission
# Pair each Id from the prediction frame with its predicted SalePrice
submission_columns = {'Id': predict['Id'], 'SalePrice': predictions}
rf_sub = pd.DataFrame(submission_columns)
rf_sub.head()

In [None]:
# Export Submission
# index = False keeps the DataFrame index out of the written CSV
rf_sub.to_csv('rf_sub_num.csv', index = False)