## THIS IS MY FIRST NOTEBOOK. PLEASE SHARE YOUR COMMENTS. THANK YOU!

The aim of this study is to predict the sale price of houses in King County (Washington, USA) by using three different algorithms. The dataset includes general characteristics of houses sold between May 2014 and May 2015.

# 1. Introduction

In [None]:
# ______Data Manipulation________
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")
from scipy import stats
from pandas.plotting import scatter_matrix

#______Predictive Models________
import sklearn
from sklearn import preprocessing
from sklearn.linear_model import*
from sklearn.preprocessing import*
from sklearn.ensemble import*
from sklearn.metrics import*
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Load the King County house sales data from the Kaggle input directory
data = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

In [None]:
# Number of rows and columns in the raw data
data.shape

In [None]:
# Preview the first five rows
data.head(5)

# 2. Descriptive Statistics

In [None]:
# Data types and non-null counts of every column
data.info()

In [None]:
# Convert the "date" column to pandas datetime values and preview the result
# NOTE(review): the `datetime` import below is not used in this cell;
# pd.to_datetime performs the conversion
from datetime import datetime
data["date"]=pd.to_datetime(data["date"])
data.head(5)

In [None]:
# Detection of missing data: number of null entries per column
data.isnull().sum()

In [None]:
# Number of distinct values per column
data.nunique()

In [None]:
# Remove rows that repeat an "id" value (drop_duplicates keeps the first
# occurrence by default), then check the new shape
data=data.drop_duplicates('id')
data.shape

In [None]:
# Remove the "id" and "date" columns so that only numeric house attributes
# and the price remain for the analysis below
data = data.drop(columns=["id","date"])

In [None]:
# Basic summary statistics, transposed so that each row is one variable
data.describe().T

# 3. Data Visualization

In [None]:
def _price_bin(price):
    """Return the price band: 1 above 600000, 2 for 300000..600000 inclusive, else 3."""
    if price > 600000:
        return 1
    if 300000 <= price <= 600000:
        return 2
    return 3


# Temporary helper column used to colour the location plots by price band
data['price_bins'] = data.price.map(_price_bin)

In [None]:
# Latitude / longitude of the houses in each price band
# (1: over 600000, 2: 300000 to 600000, 3: everything else)
band = data['price_bins']

lat1, long1 = data.loc[band == 1, 'lat'], data.loc[band == 1, 'long']
lat2, long2 = data.loc[band == 2, 'lat'], data.loc[band == 2, 'long']
lat3, long3 = data.loc[band == 3, 'lat'], data.loc[band == 3, 'long']

In [None]:
# Map of house locations (longitude vs. latitude) coloured by price band:
# one combined panel plus one panel per band on a single 2x2 figure.
# Every panel is drawn through its own Axes object and plt.show() is called
# only once at the end; calling it after each panel would display the figure
# before the remaining panels were drawn.
fig = plt.figure(figsize=(15, 12))
x_limits = (-122.6, -121.2)  # same longitude window for every panel

# Panel 1: all three price bands together
ax1 = fig.add_subplot(221)
ax1.scatter(long1, lat1, label='over 600.000$', color='r', marker='.')
ax1.scatter(long2, lat2, label='between 300,000 and 600,000$', color='b', marker='.', alpha=0.2)
ax1.scatter(long3, lat3, label='under 300,000$', color='g', marker='.', alpha=0.2)
ax1.set_title('House Prices / Location')
ax1.set_xlim(*x_limits)
ax1.legend()

# Panel 2: highest price band only
ax2 = fig.add_subplot(222)
ax2.scatter(long1, lat1, label='over 600.000$', color='r', marker='.')
ax2.set_title('The houses that their price is over 600.000$')
ax2.set_xlim(*x_limits)

# Panel 3: middle price band only
ax3 = fig.add_subplot(223)
ax3.scatter(long2, lat2, label='300.000-600.000$', color='b', marker='.')
ax3.set_title('The houses that their price is between 300.000$ and 600.000$')
ax3.set_xlim(*x_limits)

# Panel 4: lowest price band only
ax4 = fig.add_subplot(224)
ax4.scatter(long3, lat3, label='under 300.000$', color='g', marker='.')
ax4.set_title('The houses that their price is under 300.000$')
ax4.set_xlim(*x_limits)

plt.show()

In [None]:
# Remove the helper "price_bins" column again so it does not appear in the
# analysis below
data = data.drop(columns="price_bins")

In [None]:
# Pairwise plots of price and selected house attributes, coloured by "condition";
# corner=True draws only the lower triangle of the grid
pair_vars = ["price", "sqft_living", "grade", "sqft_above", "bathrooms", "bedrooms"]
sns.pairplot(data, x_vars=pair_vars, y_vars=pair_vars,
             hue="condition", palette="tab10", corner=True)

In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap with the
# colour scale fixed to the full [-1, 1] range
plt.figure(figsize=(16, 12))
sns.heatmap(data.corr(),vmax=1, vmin=-1,annot=True, cmap="PiYG")

In [None]:
# Data distribution: histograms of the columns (50 bins each) on a 20x5 grid
data.hist(bins=50,layout=(20,5),figsize=(20,55),xlabelsize='6',color="green")

In [None]:
# Boxplots of variables: one subplot per column, each with its own axes
data.plot(kind='box', subplots=True, layout=(20,5), sharex=False, sharey=False,figsize=(20,55))

In [None]:
# Detection of outliers: keep only the rows whose absolute z-score is below 3
# in every column, then summarise the filtered data
z_scores = stats.zscore(data)
abs_z_scores = np.abs(z_scores)
filtered = (abs_z_scores < 3).all(axis=1) # Threshold 3
data_out = data[filtered]
data_out.describe().T

After this process, as seen in the table above, the information in the "waterfront" and "yr_renovated" variables, which can significantly affect house prices, disappeared. Therefore, it was decided not to remove outliers from the data set in this study.

In [None]:
# Inspect the three houses with the most bedrooms to locate the row that has
# 33 bedrooms (this cell only displays rows; nothing is removed here)
data.sort_values('bedrooms',ascending=False).head(3)

In [None]:
# Drop the 33-bedroom record by its value instead of a hard-coded index label:
# this does not depend on the row order of the CSV and the cell can be re-run
# without raising KeyError once the row is already gone
data = data.drop(index=data.index[data["bedrooms"] == 33])

In [None]:
# Distribution of the sale price with a kernel density estimate overlaid
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in
# favour of histplot/displot — confirm against the installed seaborn version
sns.distplot(data.price, kde = True)

In [None]:
# Log transformation of "price" output variable (natural logarithm), stored in
# a new "log_price" column next to the original price
data["log_price"] = np.log(data["price"])
data.head(5)

In [None]:
# Probability (Q-Q) plot of the log-transformed price on a square canvas
plt.figure(figsize=(5, 5))
stats.probplot(data['log_price'], plot=plt)

# 4. Predictive Models

## 4.1. Linear regression

In [None]:
# Target: log-transformed price; inputs: every column except the two price columns
y = data["log_price"]
X = data.drop(columns=["price", "log_price"])

In [None]:
# Names of the input columns
cols = X.columns
cols

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Report the sizes of the training and test splits
print('The shape of X and y, respectively(train) :',X_train.shape, y_train.shape)
print('The shape of X and y, respectively(test) :',X_test.shape, y_test.shape)

In [None]:
# Normalization of input variables: the min-max scaler is fitted on the
# training split only and the same transformation is then applied to the test split
scaler = MinMaxScaler()
X_train_scl = scaler.fit_transform(X_train)
X_test_scl = scaler.transform(X_test)

In [None]:
# Fit a linear regression on the scaled training inputs, then predict the
# (log) prices of the scaled test inputs
model_1 = LinearRegression().fit(X_train_scl, y_train)
ypred = model_1.predict(X_test_scl)

In [None]:
# Test-set R2 of the linear model; it becomes the first entry of the list
# that collects one R2 value per model
linear_r2 = r2_score(y_test, ypred)
R2_model_scores = [linear_r2]

print("r2 : %0.3f" % linear_r2)

In [None]:
# 5-fold cross validation of the linear model on the scaled training data
CV_scores = cross_val_score(estimator=model_1, X=X_train_scl, y=y_train, cv=5)
cv_mean, cv_std = CV_scores.mean(), CV_scores.std()
print("Mean: %0.3f and Standard deviation: %0.3f" % (cv_mean, cv_std))

In [None]:
# Undo the log transformation so predictions and targets are back on the price scale
ypred_act = np.exp (ypred)
y_test_act = np.exp (y_test)

In [None]:
# Predicted vs. actual prices for the linear model; the green diagonal spans
# the range of actual prices and marks where prediction equals actual
price_range = [y_test_act.min(), y_test_act.max()]
plt.scatter(x=y_test_act, y=ypred_act, marker="*", color="r")
plt.plot(price_range, price_range, color='g')

plt.text(2000000, 5000000, 'R-squared = %0.2f' % linear_r2)
plt.xlabel('House prices')
plt.ylabel('Predicted prices')

## 4.2. Linear regression with SelectKBest (F-regression) feature selection

In [None]:
# For every k from 1 to the number of inputs: keep the k inputs with the
# highest f_regression score, fit a linear regression on them and record the
# test-set R2.
# NOTE(review): k is chosen by the score on the test split, so the reported
# best R2 is an optimistic estimate.
r2_scores = []
n = X_train_scl.shape[1]

for k in range(1, n + 1):
    model_kbest = SelectKBest(score_func=f_regression, k=k)
    X_kbest = model_kbest.fit_transform(X_train_scl, y_train)
    X_test_kbest = model_kbest.transform(X_test_scl)

    model_2 = LinearRegression()
    model_2.fit(X_kbest, y_train)
    ypred = model_2.predict(X_test_kbest)
    r2_scores.append(r2_score(y_test, ypred))

f_regression_r2 = np.max(r2_scores)
R2_model_scores.append(f_regression_r2)

# r2_scores[i] belongs to k = i + 1, so the argmax position is shifted by one
# to report a number of inputs rather than a list index
best_k = int(np.argmax(r2_scores)) + 1
print("Optimal input number:", best_k)
print("r2 : %0.3f" % f_regression_r2)

In [None]:
# Rank the inputs: fit a 17-feature f_regression selector on the scaled
# training data, then wrap the column names, scores and selection mask in
# one-column data frames
model_kbest = SelectKBest(f_regression, k=17).fit(X_train_scl, y_train)

cols = pd.DataFrame(X.columns)
scores = pd.DataFrame(model_kbest.scores_)
selected_features = pd.DataFrame(model_kbest.get_support())

In [None]:
# Put names, scores and the selection mask side by side in one table
feature_scores = pd.concat([cols, scores, selected_features], axis=1)
feature_scores.columns = ["Input", "Score", "Selected"]
# Sort ascending by score and show the first five rows, i.e. the lowest-scoring inputs
feature_scores.nsmallest(18, "Score").head()

## 4.3. Gradient boosting regressor

In [None]:
# Target: the untransformed price; inputs: every column except the two price columns
y = data["price"]
X = data.drop(columns=["price", "log_price"])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Gradient boosting regressor (300 trees of depth 5) fitted on the unscaled
# training inputs. random_state is fixed, like the seed of train_test_split,
# so that repeated runs give the same model and scores.
model_3 = GradientBoostingRegressor(max_depth=5, n_estimators=300, random_state=0)
model_3.fit(X_train, y_train)

ypred = model_3.predict(X_test)

In [None]:
# Test-set R2 of the gradient boosting model, added to the per-model score list
GBR_r2 = r2_score(y_test, ypred)
R2_model_scores.append(GBR_r2)
print('r2 : %0.3f' %(GBR_r2))

In [None]:
# 5-fold cross validation of the gradient boosting model on the training data
CV_scores = cross_val_score(estimator=model_3, X=X_train, y=y_train, cv=5)
cv_mean, cv_std = CV_scores.mean(), CV_scores.std()
print("Mean: %0.3f and Standard deviation: %0.3f" % (cv_mean, cv_std))

In [None]:
# Predicted vs. actual prices for gradient boosting; the green diagonal spans
# the range of actual prices and marks where prediction equals actual
price_range = [y_test.min(), y_test.max()]
plt.scatter(x=y_test, y=ypred, marker="o", color="b")
plt.plot(price_range, price_range, color='g')

plt.text(2000000, 5000000, 'R-squared = %0.2f' % GBR_r2)
plt.xlabel('House prices')
plt.ylabel('Predicted prices')

## 4.4. Random forest regressor

In [None]:
# Random forest regressor with default hyper-parameters fitted on the unscaled
# training inputs. random_state is fixed, like the seed of train_test_split,
# so that repeated runs give the same forest and scores.
model_4 = RandomForestRegressor(random_state=0)
model_4.fit(X_train, y_train)

ypred = model_4.predict(X_test)

In [None]:
# Test-set R2 of the random forest model, added to the per-model score list
RFR_r2 = r2_score(y_test, ypred)
R2_model_scores.append(RFR_r2)
print('r2 : %0.3f' %(RFR_r2))

In [None]:
# 5-fold cross validation of the random forest model on the training data
CV_scores = cross_val_score(estimator=model_4, X=X_train, y=y_train, cv=5)
cv_mean, cv_std = CV_scores.mean(), CV_scores.std()
print("Mean: %0.3f and Standard deviation: %0.3f" % (cv_mean, cv_std))

In [None]:
# Predicted vs. actual prices for the random forest; the green diagonal spans
# the range of actual prices and marks where prediction equals actual
price_range = [y_test.min(), y_test.max()]
plt.scatter(x=y_test, y=ypred, marker="x", color="y")
plt.plot(price_range, price_range, color='g')

plt.text(2000000, 5000000, 'R-squared = %0.2f' % RFR_r2)
plt.xlabel('House prices')
plt.ylabel('Predicted prices')

# 5. Conclusion

In [None]:
# Test-set R2 of the four models, in order: linear regression, linear
# regression with SelectKBest, gradient boosting, random forest.
# NOTE(review): the two linear scores were computed against log_price while
# the gradient boosting and random forest scores were computed against price,
# so the values are not measured on the same target scale.
R2_model_scores=[linear_r2, f_regression_r2, GBR_r2, RFR_r2]
print(R2_model_scores)