In [None]:
import pandas as pd 
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold
from statsmodels.api import OLS
from sklearn.preprocessing import StandardScaler
from warnings import filterwarnings
filterwarnings("ignore")
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score,mean_squared_error
!pip install xgboost
import xgboost as xgb
from xgboost import XGBRegressor
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# ***DATA IMPORT***

In [None]:
df= pd.read_csv("/kaggle/input/housesalesprediction/kc_house_data.csv",index_col=0)

In [None]:
df.head()

In [None]:
df.info()

* id :a notation for a house

* date: Date house was sold

* price: Price is prediction target

* bedrooms: Number of Bedrooms/House

* bathrooms: Number of Bathrooms/House

* sqft_living: square footage of the home

* sqft_lot: square footage of the lot

* floors :Total floors (levels) in house

* waterfront :House which has a view to a waterfront

* view: An index (0–4) rating how good the view from the property is

* condition :How good the condition is Overall

* grade: overall grade given to the housing unit, based on King County grading system

* sqft_above :square footage of house apart from basement

* sqft_basement: square footage of the basement

* yr_built :Built Year

* yr_renovated :Year when house was renovated

* zipcode:zip code

* lat: Latitude coordinate

* long: Longitude coordinate

* sqft_living15 : The average living-area square footage of the 15 closest houses

* sqft_lot15 : The average lot square footage of the 15 closest houses

In [None]:
df.isnull().values.any()

**There are no null values**

## ***ROOMS***

In [None]:
# Spearman rank correlation of bathrooms with every numeric column.
# Non-numeric columns (the raw `date` strings) are excluded first, because
# DataFrame.corr cannot convert them to floats.
df.select_dtypes(include="number").corr(method="spearman")["bathrooms"].sort_values()

In [None]:
# Spearman rank correlation of bedrooms with every numeric column.
# Non-numeric columns (the raw `date` strings) are excluded first, because
# DataFrame.corr cannot convert them to floats.
df.select_dtypes(include="number").corr(method="spearman")["bedrooms"].sort_values()

In [None]:
# Overlay price against bedrooms, bathrooms and their sum on a single axis.
# Every layer carries its own label, so the legend entries always match the
# artist they describe regardless of the order matplotlib collects handles in.
plt.figure(121,figsize=(7,7))
sns.scatterplot(x=df["bedrooms"],y=df["price"],label="bedrooms")
sns.scatterplot(x=df["bathrooms"],y=df["price"],label="bathrooms")
sns.lineplot(x=(df["bedrooms"]+df["bathrooms"]),y=df["price"],color="black",label="bedrooms+bathrooms");
plt.legend(loc="best");
plt.xlabel("Bedrooms&Bathrooms");

* Bathrooms and bedrooms show a similar relationship with price, so they can be combined into a single feature: their sum, the total number of rooms.

In [None]:
sns.heatmap(df[["bathrooms","bedrooms","price"]].corr(method="spearman"),annot=True);

In [None]:
df["rooms"]= df["bathrooms"]+df["bedrooms"]

In [None]:
# Spearman rank correlation of the combined `rooms` feature with every numeric column.
# Non-numeric columns (the raw `date` strings) are excluded first, because
# DataFrame.corr cannot convert them to floats.
df.select_dtypes(include="number").corr(method="spearman")["rooms"].sort_values()

In [None]:
sns.heatmap(df.groupby(["rooms","bathrooms","bedrooms"])[["price"]].mean());

In [None]:
sns.scatterplot(x=df["rooms"],y=df["price"]);

In [None]:
df[["rooms","price"]].corr(method="spearman")

In [None]:
sns.heatmap(df[["bathrooms","bedrooms","rooms"]].corr(method="spearman"),annot=True);

In [None]:
df.drop(["bathrooms","bedrooms"],axis=1,inplace=True)

* Combining bathrooms and bedrooms into a single `rooms` feature did not lose much information: the correlation between price and rooms has been preserved.

#### sqft_living = sqft_above + sqft_basement

In [None]:
# Compare the distribution of sqft_above + sqft_basement with sqft_living.
# histplot(kde=True, stat="density") replaces the deprecated distplot; both layers
# are labelled explicitly so the legend does not depend on artist order.
plt.figure(1,figsize=(6,6))
sns.histplot(df["sqft_above"]+df["sqft_basement"],stat="density",kde=True,label="sum of above and basement");
sns.kdeplot(df["sqft_living"],label="sq ft living");
plt.legend(loc="best");

In [None]:
sns.heatmap(df[["sqft_living","sqft_above","sqft_basement","price"]].corr(method="spearman"),annot=True);

In [None]:
# Scatter price against each of the three area measures on the same axis.
# The dict keeps the legend text next to the column it describes; insertion
# order is the plotting order.
area_columns= {"Sqft of basement":"sqft_basement",
               "Total sqft of a house":"sqft_living",
               "Sqft of above":"sqft_above"}
for column in area_columns.values():
    sns.scatterplot(x=df[column],y=df["price"]);
plt.legend(list(area_columns),loc="best");

In [None]:
df[["sqft_basement","sqft_above","sqft_living"]].corr(method="spearman")

In [None]:
# Spearman correlation of each area feature against every numeric column,
# drawn as three stacked panels of one figure.
# The matrix is computed once, on numeric columns only: the raw `date` strings
# cannot be converted to floats by DataFrame.corr.
spearman_corr= df.select_dtypes(include="number").corr(method="spearman")
panels= [("sqft_basement","Sq ft of Basement Correlation Scores","g"),
         ("sqft_above","Sq ft Above Correlation Scores","r"),
         ("sqft_living","Sq ft Living Correlation Scores","black")]

fig,axes= plt.subplots(len(panels),1,figsize=(20,15))
for ax,(column,title,color) in zip(axes,panels):
    ax.plot(spearman_corr[column].sort_values(),"--o",color=color)
    ax.set_title(title)
    ax.set_xlabel("Variables",fontsize=10)
    ax.set_ylabel("Correlation Score",fontsize=10)
fig.tight_layout();

In [None]:
df.drop(["sqft_above","sqft_basement"],axis=1,inplace=True)

In [None]:
df.head()

#### Ages of renovated and non-renovated houses

In [None]:
# Reduce the sale date to its four-digit year. It stays text here; a later cell
# casts the column to int64.
df["date"]= df["date"].astype(str).str[:4]
sale_year= df["date"].astype(int)

# Age at sale is measured from the last renovation when there was one
# (yr_renovated > 0), otherwise from the year the house was built.
reference_year= df["yr_renovated"].where(df["yr_renovated"]>0,df["yr_built"])
ages= sale_year-reference_year
df["age"]=ages

In [None]:
df.head()

In [None]:
sns.lineplot(x="age",y="view",data=df);
plt.title("View and age between correlation: " + str(df.corr(method="spearman")["view"]["age"]));

In [None]:
plt.figure(figsize=(6,6))
sns.lineplot(x=df["age"],y=df["price"]);
plt.title("Price and age between correlation: " + str(df.corr(method="spearman")["price"]["age"]));

In [None]:
df.corr(method="spearman")["yr_renovated"].sort_values()

In [None]:
df.corr(method="spearman")["yr_built"].sort_values()

In [None]:
df.corr(method="spearman")["age"].sort_values()

In [None]:
df.info()

In [None]:
df["date"]=df["date"].astype("int64")

In [None]:
df.info()

In [None]:
df.corr(method="spearman")["date"].sort_values()

In [None]:
df.groupby(["date"])[["price"]].mean()

In [None]:
df.corr(method="spearman")["condition"].sort_values()

In [None]:
df.corr(method="spearman")["view"].sort_values()

In [None]:
df.drop(["date","yr_built","yr_renovated","condition"],axis=1,inplace=True)

#### *The age variable can represent the "yr_built, yr_renovated, condition, date" variables*

In [None]:
df.head()

### *Examining locations of houses*

In [None]:
len(df["zipcode"].unique())

In [None]:
len(df["lat"].unique())

In [None]:
len(df["long"].unique())

In [None]:
sns.heatmap(df[["zipcode","lat","long","price"]].corr(method="spearman"),annot=True);

In [None]:
sns.lineplot(x=df["lat"],y=df["price"]);

### *LOCATIONS OF HOUSES*

In [None]:
plt.figure(figsize=(7,7))
sns.scatterplot(x=df["long"],y=df["lat"],hue=df["zipcode"]);

In [None]:
plt.figure(figsize=(7,7))
sns.scatterplot(x=df["long"],y=df["lat"],hue=df["price"]);

In [None]:
plt.figure(figsize=(7,7))
sns.scatterplot(x="long",y="lat",hue="waterfront",data=df,palette=["brown","blue"]);

![](https://gisgeography.com/wp-content/uploads/2020/06/Seattle-Road-Map.jpg)

In [None]:
df.head()

In [None]:
df.corr(method="spearman")["sqft_lot15"].sort_values()

## *HANDLE OUTLIERS*

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
clf= LocalOutlierFactor(n_neighbors=20)

In [None]:
pred= clf.fit_predict(df)

In [None]:
pred[0:10]

#### Let's see outliers

In [None]:
df[pred==-1].head()

In [None]:
len(df[pred==-1])

#### 1058 Values are outliers

#### Non-Outliers

In [None]:
df[pred!=-1].head()

In [None]:
len(df[pred!=-1])

In [None]:
df_negative_scores= clf.negative_outlier_factor_

In [None]:
df_negative_scores[0:5]

In [None]:
df_negatives= np.sort(df_negative_scores)

In [None]:
df_negatives[0:5]

In [None]:
df_negatives.max()

In [None]:
plt.plot(df_negatives);

#### Selecting a threshold value

In [None]:
threshold= df_negatives[500]
threshold

In [None]:
df[df_negatives==threshold]

#### Below the threshold

In [None]:
threshold_below= df[df_negatives<threshold]
threshold_below.head()

In [None]:
len(threshold_below)

In [None]:
threshold_below_index= threshold_below.index

#### Above the threshold

In [None]:
df[df_negatives>threshold].head()

In [None]:
threshold_np= np.array(df[df_negatives==threshold])

In [None]:
threshold_np

In [None]:
threshold_below_np= np.array(threshold_below)
threshold_below_np[:]= threshold_np

In [None]:
df[df_negatives<threshold]=threshold_below_np

In [None]:
threshold_below_index

In [None]:
df.loc[threshold_below_index,:].head()

In [None]:
X=df.drop("price",axis=1)
y=df[["price"]]

In [None]:
X_train,X_test,y_train,y_test= train_test_split(X,y,train_size=0.8,random_state=45)

## *MODEL*

> ### GBM

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gbm_params={"learning_rate": [0.001,0.01,0.1],
              "max_depth":  [5,15,25]}

In [None]:
gbm_grid= GridSearchCV(GradientBoostingRegressor(),gbm_params,cv=10,n_jobs=-1,scoring="r2")

In [None]:
gbm_grid.fit(X_train,y_train)

In [None]:
gbm_grid.best_params_

In [None]:
# Second, independent grid search: tunes n_estimators and subsample only (10-fold CV, R² scoring).
# The base estimator is constructed with default arguments, so the learning_rate / max_depth
# chosen by gbm_grid are NOT in effect while these two parameters are searched.
gbm_params_1={"n_estimators": [100,1000,2000],
              "subsample": [1,0.5,0.75]}
gbm_grid_1= GridSearchCV(GradientBoostingRegressor(),gbm_params_1,cv=10,n_jobs=-1,scoring="r2")
gbm_grid_1.fit(X_train,y_train)
gbm_grid_1.best_params_

In [None]:
# Final GBM with hand-picked hyperparameters. subsample < 1 trains each tree on a
# random subset of rows, so the seed is fixed to make the reported scores reproducible.
gbm_model= GradientBoostingRegressor(learning_rate=0.1,max_depth=5,n_estimators=2000,subsample=0.75,random_state=45).fit(X_train,y_train)

In [None]:
y_pred_gbm= gbm_model.predict(X_test)

In [None]:
r2_score(y_test,y_pred_gbm)

In [None]:
cross_val_score(gbm_model,X,y,cv=10,scoring="r2").mean()

> ### XGBOOST

In [None]:
# Grid-search the XGBoost learning rate and tree depth with 10-fold CV, scored by R².
xgb_params= dict(learning_rate=[0.1,0.01,0.001],
                 max_depth=[5,10,20])
xgb_grid= GridSearchCV(estimator=XGBRegressor(),
                       param_grid=xgb_params,
                       scoring="r2",
                       cv=10,
                       n_jobs=-1)
xgb_grid.fit(X_train,y_train)
xgb_grid.best_params_

In [None]:
# Second, independent grid search: tunes n_estimators and subsample only (10-fold CV, R² scoring).
# The base estimator is constructed with default arguments, so the learning_rate / max_depth
# chosen by xgb_grid are NOT in effect while these two parameters are searched.
xgb_params_1= {"n_estimators":[100,1000,2000],
               "subsample": [1,0.5,0.75]}
xgb_grid_1= GridSearchCV(XGBRegressor(),xgb_params_1,cv=10,n_jobs=-1,scoring="r2")
xgb_grid_1.fit(X_train,y_train)
xgb_grid_1.best_params_

In [None]:
# Final XGBoost regressor with hand-picked hyperparameters: build, fit on the
# training split, then predict the held-out split.
xgb_model= XGBRegressor(
    learning_rate=0.01,
    max_depth=5,
    n_estimators=2000,
)
xgb_model.fit(X_train,y_train)
y_pred_xgb= xgb_model.predict(X_test)

In [None]:
r2_score(y_test,y_pred_xgb)

In [None]:
cross_val_score(xgb_model,X,y,cv=10,scoring="r2").mean()