In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the kc_house_data.csv dataset into a pandas DataFrame and preview the
# first five rows with df.head().
# The file is located by searching /kaggle/input for its name rather than by
# reusing a loop variable left over from another cell, so this cell does not
# depend on which directory os.walk() happened to visit last.
DATA_FILENAME = 'kc_house_data.csv'
csv_path = next(
    (
        os.path.join(root, DATA_FILENAME)
        for root, _subdirs, files in os.walk('/kaggle/input')
        if DATA_FILENAME in files
    ),
    None,
)
if csv_path is None:
    raise FileNotFoundError(f"{DATA_FILENAME} not found under /kaggle/input")
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Display column dtypes and non-null counts using the df.info() method.
# Author's observation from the output: there are no null values, and all
# features are numeric (int64, float64) except the `date` feature, which is object.
df.info()

In [None]:
# Display summary statistics of the df columns using the df.describe() method.
df.describe()

In [None]:
# Derive model features from the raw columns.

# Sale year: the first four characters of the `date` string.
df['year'] = df['date'].str[:4].astype(int)

# Age of the house in the year it was sold.
df['age'] = df['year'] - df['yr_built']

# Years since the last renovation. A yr_renovated of 0 would otherwise yield
# values around 2014, so such rows fall back to the construction year.
# NOTE(review): assumes 0 is the dataset's "never renovated" marker -- confirm.
# The column name keeps its original spelling because other cells reference it.
last_work_year = df['yr_renovated'].where(df['yr_renovated'] > 0, df['yr_built'])
df['last_renvonted'] = df['year'] - last_work_year

# Whole-bathroom count: round up only when the fractional part exceeds 0.5.
# Computed numerically instead of by parsing str(x), which mis-read one-digit
# fractions (e.g. 2.6 -> "6") and failed on values without a decimal point.
whole_baths = np.floor(df['bathrooms'])
df['baths'] = (whole_baths + (df['bathrooms'] - whole_baths > 0.5)).astype(int)

df.head()

In [None]:
# Drop the six columns listed below (id, date, zipcode, yr_built,
# yr_renovated, bathrooms) and preview the result.
unused_columns = ['id', 'date', 'zipcode', 'yr_built', 'yr_renovated', 'bathrooms']
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Display a histogram (40 bins each) of the columns in df on a 20x15 figure.
df.hist(bins=40, figsize=(20,15))
plt.show()

In [None]:
# Display the pairwise correlation between the columns of df as an annotated heatmap.
np.random.seed(0)  # seeds NumPy's global random number generator
sns.set_theme()
fig, ax = plt.subplots(figsize=(20, 8))
# Draw on the axes created above (explicitly, instead of via the current-axes default).
sns.heatmap(df.corr(), annot=True, linewidths=0.3, cmap='RdYlGn', ax=ax)

In [None]:
# Correlation of every column with price, strongest first. The first entry of
# the sorted series is skipped positionally (expected to be price itself).
corr_matrix = df.corr()
price_corr = corr_matrix["price"].sort_values(ascending=False)
price_corr.iloc[1:]

In [None]:
# Display a scatter matrix of the columns listed in `attributes`
# (currently price and last_renvonted).
attributes = ['price', 'last_renvonted']
scatter_matrix(df[attributes], figsize=(12, 8))

In [None]:

# Hold out 20% of the rows as a validation set; random_state fixes the shuffle.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"The len of train df {len(train_df)}")
# Label corrected: this line reports the validation split, not the training split.
print(f"The len of val df {len(val_df)}")

In [None]:
# Separate the target column (price) from the feature columns in both splits.
y_train, y_val = train_df['price'], val_df['price']
X_train = train_df.drop(columns='price')
X_val = val_df.drop(columns='price')
# Feature names, captured while X_train is still a DataFrame.
cols = X_train.columns

In [None]:
# Standardise the features. The scaler's mean and variance are learned from the
# training split only and then re-used for the validation split; refitting on
# the validation data would put the two splits on different scales and leak
# validation statistics into the preprocessing.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_val = scalar.transform(X_val)

In [None]:
# Grid search helper for fine-tuning hyperparameters
def GridSearch(param_grid, reg, cv_n=5):
    """Build an unfitted GridSearchCV for ``reg``.

    Args:
        param_grid: Hyperparameter grid passed to GridSearchCV.
        reg: Estimator to tune.
        cv_n: Value passed as GridSearchCV's ``cv`` argument (default 5).

    Returns:
        A GridSearchCV scored with negative mean squared error that also
        records training scores. The caller is responsible for calling ``fit``.
    """
    return GridSearchCV(
        reg,
        param_grid,
        cv=cv_n,
        scoring='neg_mean_squared_error',
        return_train_score=True,
    )
# display scores
def display_scores(predictions, y_true=None):
    """Print the MSE, RMSE and R2 of ``predictions`` against the true targets.

    Args:
        predictions: Predicted target values.
        y_true: True target values. When omitted, the notebook-level ``y_val``
            is used, so existing ``display_scores(predictions)`` calls behave
            as before.

    Returns:
        None. The three metrics are printed.
    """
    if y_true is None:
        # Backward-compatible default: score against the validation targets.
        y_true = y_val
    mse = mean_squared_error(y_true, predictions)
    print("MSE:", mse)
    rmse = np.sqrt(mse)
    print("RMSE:", rmse)
    r2 = r2_score(y_true, predictions)
    print("R2:", r2)

In [None]:
# Fine-tune a Decision Tree Regressor: grid-search max_features and max_depth
# using the GridSearch helper.
param_grid = [{'max_features': [2, 4, 6, 8, 10], 'max_depth': [3, 4, 5]}]
# A fixed random_state pins the tree's random feature sub-sampling, so
# re-running this cell on its own reproduces the same scores.
decision_tree_reg = DecisionTreeRegressor(random_state=42)
grid_search = GridSearch(param_grid, decision_tree_reg)
grid_search.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the grid search; displaying it shows the
# selected model and its hyperparameters.
decision_tree_model = grid_search.best_estimator_
decision_tree_model

In [None]:
# Cross-validated RMSE for every hyperparameter combination that was tried.
# The search scores with negative MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

In [None]:
# Feature importances reported by the best estimator of the grid search.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Pair each importance with its feature name and list the pairs in descending order.
sorted(zip(feature_importances, cols), reverse=True)

In [None]:
# Predict house prices for X_val with the tuned decision tree.
predictions = decision_tree_model.predict(X_val)
predictions

In [None]:
# Print MSE, RMSE and R2 for the decision-tree predictions.
display_scores(predictions)

In [None]:
# Fine-tune a Random Forest Regressor: grid-search n_estimators and
# max_features using the GridSearch helper.
param_grid = [
    {'n_estimators': [10, 30, 40], 'max_features': [8, 10, 12]},
]
# A fixed random_state pins the forest's bootstrap and feature sampling, so
# re-running this cell on its own reproduces the same scores.
random_forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearch(param_grid, random_forest_reg)
grid_search.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the random-forest grid search.
random_forest_model = grid_search.best_estimator_

In [None]:
# Cross-validated RMSE for every hyperparameter combination that was tried.
# The search scores with negative MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

In [None]:
# Feature importances reported by the best estimator of the grid search.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Pair each importance with its feature name and list the pairs in descending order.
sorted(zip(feature_importances, cols), reverse=True)

In [None]:
# Predict house prices for X_val with the tuned random forest.
predictions = random_forest_model.predict(X_val)

In [None]:
# Print MSE, RMSE and R2 for the random-forest predictions.
display_scores(predictions)

In [None]:
# Fine-tune a Gradient Boosting Regressor: grid-search n_estimators,
# max_features and max_depth using the GridSearch helper.
param_grid = [
    {'n_estimators': [10, 30, 40], 'max_features': [8, 10], 'max_depth': [3, 4, 5]},
]
# A fixed random_state pins the random feature sub-sampling, so re-running this
# cell on its own reproduces the same scores.
gradient_boosting_reg = GradientBoostingRegressor(random_state=42)
grid_search = GridSearch(param_grid, gradient_boosting_reg)
# Fit once: the original cell repeated this call, which re-ran the entire
# search and doubled the runtime without adding anything.
grid_search.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the gradient-boosting grid search.
gb_model = grid_search.best_estimator_

In [None]:
# Feature importances reported by the best estimator of the grid search.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances


In [None]:
# Pair each importance with its feature name and list the pairs in descending order.
sorted(zip(feature_importances, cols), reverse=True)

In [None]:
# Predict house prices for X_val with the tuned gradient-boosting model.
predictions = gb_model.predict(X_val)

In [None]:
# Print MSE, RMSE and R2 for the gradient-boosting predictions.
display_scores(predictions)