In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
    
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def rmse(y_test, y_pred):
    """Return the root-mean-squared error between true and predicted targets."""
    mean_sq_error = mean_squared_error(y_test, y_pred)
    return np.sqrt(mean_sq_error)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import utils

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

#Metrics: I will measure the accuracy of the model with following measures: coefficient of determination R-squared, MSE, MAE
#Methods: Linear Regression, Random Forest/Decision trees, KNN algorithm, XGBoost  


# Description of the dataset

In [None]:
# Read the NYC rolling sales CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/nyc-property-sales/nyc-rolling-sales.csv')
df.head()

In [None]:
df.shape

The dataset consists of 22 features and 84548 observations. Dataset description:
* **Borough**: The name of the borough in which the property is located.
* **Neighborhood**: Department of Finance assessors determine the neighborhood name in the course of valuing properties. 
* **Building Class Category**: to simplify identification of similar properties by broad usage (e.g. One Family Homes)
* **Tax Class at Present**: Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4), based on the use of the property.
* **Block**: A Tax Block is a sub-division of the borough on which real properties are located.
* **Lot**: A Tax Lot is a subdivision of a Tax Block and represents the property unique location.
* **Easement**: An easement is a right, such as a right of way, which allows an entity to make limited use of
another’s real property.
* **Building Class at Present**: The Building Classification is used to describe a property’s constructive use. 
* **Address**: The street address of the property as listed on the Sales File.
* **Apartment number**
* **Zip Code**: The property’s postal code
* **Residential Units**: The number of residential units at the listed property.
* **Commercial Units**: The number of commercial units at the listed property.
* **Total Units**: The total number of units at the listed property.
* **Land Square Feet**: The land area of the property listed in square feet.
* **Gross Square Feet**:  The total area of all the floors of a building as measured from the exterior surfaces of the outside walls of the building, including the land area and space within any building or structure
on the property. 
* **Year Built**: Year the structure on the property was built.
* **Tax Class at Time of Sale**:
* **Building Class at Time of Sale**
* **Sales Price**: Price paid for the property. Start date: 1 Sept 2016, End date: 31.08.2017
* **Sale Date**: Date the property sold.
 

At first let's remove some features from our dataset:

In [None]:
columns_to_drop = [
    'EASE-MENT',         # empty column
    'Unnamed: 0',        # iteration (row counter) column
    'SALE DATE',         # would only be useful for a time-series analysis
    'ADDRESS',           # too many unique values
    'APARTMENT NUMBER',  # irrelevant, 77% of the records are empty
]
df = df.drop(columns=columns_to_drop)

Let's search for duplicates and drop rows with duplicates

In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# Keep the last occurrence of each fully duplicated row, then confirm none remain.
df = df.drop_duplicates(keep='last')
df.duplicated().sum()

# Type conversion

In [None]:
df.dtypes

In [None]:
# Nominal features are stored as pandas categoricals.
nominal_columns = [
    'NEIGHBORHOOD',
    'BOROUGH',
    'BUILDING CLASS CATEGORY',
    'TAX CLASS AT PRESENT',
    'BLOCK',
    'LOT',
    'BUILDING CLASS AT PRESENT',
    'ZIP CODE',
    'BUILDING CLASS AT TIME OF SALE',
    'TAX CLASS AT TIME OF SALE',
]
df = df.astype({column: 'category' for column in nominal_columns})

# Numeric features: errors='coerce' turns every non-numeric entry into NaN.
for column in ['LAND SQUARE FEET', 'GROSS SQUARE FEET', 'SALE PRICE']:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [None]:
df.dtypes

Let's deal with missing values

In [None]:
df.isnull().sum()

We can check percentage of missing data in columns ['LAND SQUARE FEET', 'GROSS SQUARE FEET', 'SALE PRICE']

In [None]:
# Fraction of missing values per column, limited to columns that have any,
# listed in ascending order.
missing_share = df.isnull().sum() / len(df)
miss = missing_share[missing_share > 0].sort_values()
miss

In [None]:
# Summary statistics of the numeric columns.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

We have missing values in 3 feature columns. We found unexpected 0 values for LAND SQUARE FEET, GROSS SQUARE FEET, YEAR BUILT, SALE PRICE.

GROSS SQUARE FEET might be equal to zero if land is sold without any building (we will investigate that later).
> A $\$0$ sale indicates that there was a transfer of ownership without a cash consideration. 
There can be a number of reasons for a $\$0$ sale including transfers of ownership from
parents to children.

We could use the rows with a missing or zero price (exactly 22,812 samples) to predict the prices as an independent project.
YEAR BUILT can be equal to 2017, but a value of '0' is quite alarming.

In [None]:
# Rows without a usable price (missing, or $0 ownership transfers) are set aside
# as an unlabeled set; only rows with a real price are kept for modelling.
# The comparison is parenthesized because `|` and `&` bind tighter than `==`
# and `!=`; without the parentheses the masks are combined before the
# comparison is made and select the wrong rows (or raise).
no_price = df['SALE PRICE'].isna() | (df['SALE PRICE'] == 0.0)
test2 = df[no_price].drop(columns='SALE PRICE')
df = df[~no_price]
df.shape

In [None]:
# Summary statistics of the numeric columns after removing unpriced rows.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

LAND SQUARE FEET          -        24938 missing values & 8520 zero values

GROSS SQUARE FEET          -       26231 missing values & 8032 zero values

In [None]:
# Represent missing areas as 0 so they are handled together with reported zeros.
area_columns = ['LAND SQUARE FEET', 'GROSS SQUARE FEET']
df = df.fillna({column: 0 for column in area_columns})

We found 1039 samples of properties which are probably unbuilt

In [None]:
# Rows that report a land area but no gross (built) area.
land_without_building = df['LAND SQUARE FEET'].ne(0) & df['GROSS SQUARE FEET'].eq(0)
land_without_building.sum()

We will reject 3469 samples with missing (or zero values) in the following columns: LAND SQUARE FEET, GROSS SQUARE FEET, YEAR BUILT. 

In [None]:
# Rows where land area, gross area and year built are all zero.
nothing_known = (
    df['LAND SQUARE FEET'].eq(0)
    & df['GROSS SQUARE FEET'].eq(0)
    & df['YEAR BUILT'].eq(0)
)
nothing_known.sum()

We will keep samples which may represent vacant land (unbuilt, hence a zero value in GROSS SQUARE FEET and YEAR BUILT)

In [None]:
# Rows with a land area but neither a gross area nor a construction year.
vacant_land = (
    df['LAND SQUARE FEET'].ne(0)
    & df['GROSS SQUARE FEET'].eq(0)
    & df['YEAR BUILT'].eq(0)
)
vacant_land.sum()

In [None]:
# Discard rows where land area, gross area and year built are all zero.
all_zero = (
    df['LAND SQUARE FEET'].eq(0)
    & df['GROSS SQUARE FEET'].eq(0)
    & df['YEAR BUILT'].eq(0)
)
df = df[~all_zero]

LAND SQUARE FEET shouldn't be empty or equal 0

In [None]:
# Replace zero LAND SQUARE FEET with the mean of the known (non-zero) values.
# The zeros are placeholders for unknown areas, so they are left out of the
# mean; averaging over them would pull the imputed value down.
known_land = df.loc[df['LAND SQUARE FEET'] != 0, 'LAND SQUARE FEET']
mean_lqf = known_land.mean(skipna=True)
df = df.replace({'LAND SQUARE FEET': {0: mean_lqf}})

In [None]:
# Summary statistics of the numeric columns after imputing the land area.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

Let's investigate samples with YEAR BUILT equals to zero

In [None]:
# Built properties (non-zero gross area) whose construction year is recorded as 0.
built_without_year = df['GROSS SQUARE FEET'].ne(0) & df['YEAR BUILT'].eq(0)
zero_year_only = df[built_without_year]
zero_year_only.shape

In [None]:
# Impute YEAR BUILT for built properties (non-zero gross area) whose year is 0.
# Zero years are placeholders, so they are excluded from the mean, and the
# result is rounded to a whole year so a float is not written into the column.
known_years = df.loc[df['YEAR BUILT'] != 0, 'YEAR BUILT']
mean_year = int(round(known_years.mean(skipna=True)))
df.loc[(df['GROSS SQUARE FEET'] != 0) & (df['YEAR BUILT'] == 0), 'YEAR BUILT'] = mean_year

We imputed the average year into samples where we suspected it was omitted by mistake. In other cases (see table below) the year can be equal to 0 if there are no buildings on the property

In [None]:
# Remaining rows whose construction year is still 0.
zero_year = df[df['YEAR BUILT'].eq(0)]
zero_year

Let's take a closer look at RESIDENTIAL UNITS, COMMERCIAL UNITS and	TOTAL UNITS

In [None]:
# Number of sales per TOTAL UNITS value, most frequent first.
sales_per_units = df[["TOTAL UNITS", "SALE PRICE"]].groupby(['TOTAL UNITS'], as_index=False).count()
sales_per_units.sort_values(by='SALE PRICE', ascending=False)

Buildings with 1 total unit were mostly sold. We have one extremely large amount of 2261 units but the sample looks legitimate

In [None]:
# The sale with the extreme count of 2261 total units.
largest_unit = df[df['TOTAL UNITS'].eq(2261)]
largest_unit

In [None]:
# Sales recorded with zero total units.
zero_unit = df[df['TOTAL UNITS'].eq(0)]
zero_unit

I checked that in 793 rows the sum of residential and commercial units is not equal to total units. It's only 1% of whole dataset so I will leave them as they are.

In [None]:
# Share of rows whose total differs from residential + commercial units.
units_sum = df['RESIDENTIAL UNITS'] + df['COMMERCIAL UNITS']
check_units_match = df[df['TOTAL UNITS'] != units_sum]
len(check_units_match) / len(df)

In [None]:
# Summary statistics of the numeric columns after cleaning.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

# Categorical data analysis

In [None]:
df.head()

In [None]:
# Summary (count, unique, top, freq) of the non-numeric columns.
cat_df = df.select_dtypes(exclude='number')
cat_df.describe()

From the preceding table we can draw the conclusions necessary for feature selection. 
* Location is undoubtedly a very important factor that will influence the price. There are a lot of unique values of NEIGHBORHOOD, BLOCK, LOT and ZIP CODE, which would produce an enormous number of dummy features. Hence we will model the price on the basis of BOROUGH.
* Building class
> Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4),
based on the use of the property. 
The Building Classification is used to describe a property’s constructive use. 

According to the dataset description there is a correlation between building class and tax class. In order to reduce the dimensionality of the dataset we will discard some of this information.

# Categorical features - Visual analysis

**BOROUGH - a digit code for the borough the property is located in;**

They are Manhattan (1), Bronx (2), Brooklyn (3), Queens (4), and Staten Island (5).
Median of price sales is the highest in Manhattan. The most in-demand borough is Queens.

In [None]:
# Median sale price per borough. The aggregation is named as a string because
# recent pandas versions emit a deprecation warning when the NumPy callable
# np.median is passed instead.
pivot_b1 = df.pivot_table(index='BOROUGH', values='SALE PRICE', aggfunc='median')
pivot_b1.plot(kind='bar')

In [None]:
# Number of sales per borough.
pivot_b2 = df.pivot_table(index='BOROUGH', aggfunc='size')
pivot_b2.plot.bar()

### **TAX CLASS**

There are only 4 classes, so the column TAX CLASS AT PRESENT with 10 unique values must be unclean. We will use TAX CLASS AT TIME OF SALE for further analysis and prediction.
The median sale price is the highest among buildings in the 4th tax class, which constitute a small part of all sold properties.
> Class 4: Includes all other properties not included in class 1,2, and 3, such as offices, factories, warehouses, garage buildings, etc. 

In [None]:
# Median sale price per tax class at time of sale. The aggregation is named as
# a string because recent pandas versions emit a deprecation warning when the
# NumPy callable np.median is passed instead.
pivot_t1 = df.pivot_table(index='TAX CLASS AT TIME OF SALE', values='SALE PRICE', aggfunc='median')
pivot_t1.plot(kind='bar')

In [None]:
# Number of sales per tax class at time of sale.
pivot_t2 = df.pivot_table(index='TAX CLASS AT TIME OF SALE', aggfunc='size')
pivot_t2.plot.bar()

### **BUILDING CLASS CATEGORY**

By plotting histogram I noticed one outlier with respect to SALE PRICE. I would examine this sample in the next part.

In [None]:
# Median sale price per building class category. The aggregation is named as a
# string because recent pandas versions emit a deprecation warning when the
# NumPy callable np.median is passed instead.
pivot_bc1 = df.pivot_table(index='BUILDING CLASS CATEGORY', values='SALE PRICE', aggfunc='median')
pivot_bc1.plot(kind='bar')

In [None]:
# The five building class categories with the most sales.
pivot_bc2 = df.pivot_table(index='BUILDING CLASS CATEGORY', aggfunc='size')
result = pivot_bc2.sort_values(ascending=False).head(5)
result

# Numeric features analysis and visualization

In [None]:
# Summary statistics of the numeric columns before the visual analysis.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

### SALE PRICE - target variable

In [None]:
# Horizontal box plot of the target variable SALE PRICE.
boxplot_s1= df.boxplot(column=['SALE PRICE'], vert=False)

Samples with particularly high sale price seem all right. 

In [None]:
# Sales above one billion dollars.
sale_price_outliers_high = df[df['SALE PRICE'].gt(1_000_000_000)]
sale_price_outliers_high

In [None]:
# Histogram of sale prices restricted to the 0-2000 range.
plt.hist(df['SALE PRICE'], range=(0, 2000))

In [None]:
# Sales priced at 1000 or less, cheapest first.
sale_price_outliers_low = df[df['SALE PRICE'].le(1000)]
sale_price_outliers_low.sort_values(by='SALE PRICE')

In [None]:
# Keep only sales priced above 1000.
df = df[df['SALE PRICE'].gt(1000)]

We exclude samples with a SALE PRICE of $\$1000$ or less. 

### SQUARE FEET

In [None]:
# Summary statistics of the numeric columns after removing low prices.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

In [None]:
# Horizontal box plot of LAND SQUARE FEET (reuses the name boxplot_s1).
boxplot_s1= df.boxplot(column=['LAND SQUARE FEET'], vert=False)

In [None]:
# Histogram of land areas restricted to the 0-6000 square feet range.
plt.hist(df['LAND SQUARE FEET'], range=(0, 6000))

In [None]:
# Sale price against land area, zoomed in on the bulk of the data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['LAND SQUARE FEET'], df['SALE PRICE'], alpha=0.5)
ax.set_xlim(0, 10000)
ax.set_ylim(0, 3000000)

In [None]:
# Horizontal box plot of GROSS SQUARE FEET.
boxplot_s2= df.boxplot(column=['GROSS SQUARE FEET'], vert=False)

In [None]:
# Histogram of gross areas restricted to the 0-6000 square feet range.
plt.hist(df['GROSS SQUARE FEET'], range=(0, 6000))

In [None]:
# Sale price against gross area, zoomed in on the bulk of the data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['GROSS SQUARE FEET'], df['SALE PRICE'], alpha=0.5)
ax.set_xlim(0, 10000)
ax.set_ylim(0, 3000000)

### RESIDENTIAL AND COMMERCIAL UNITS

In [None]:
# Price distribution per unit count for sales below 50 units and 5,000,000.
small_sales = df['TOTAL UNITS'].lt(50) & df['SALE PRICE'].lt(5000000)
units = df[small_sales]
plt.figure(figsize=(10, 6))
sns.boxplot(data=units, x='TOTAL UNITS', y='SALE PRICE')
plt.title('Total Units vs Sale Price')
plt.show()

### Pair-wise correlations

In [None]:
# Pearson correlations between the numeric columns only. The frame still holds
# categorical columns at this point, and DataFrame.corr raises on non-numeric
# data in pandas 2.x instead of silently skipping it.
df.select_dtypes(include=[np.number]).corr(method='pearson')

# ONE-HOT ENCODING

In [None]:
df.head()

In [None]:
# Summary of the non-numeric columns that are candidates for encoding.
cat_df = df.select_dtypes(exclude='number')
cat_df.describe()

In [None]:
# Remove the categorical columns that are not used for modelling.
unused_categoricals = [
    'NEIGHBORHOOD',
    'BLOCK',
    'LOT',
    'BUILDING CLASS AT PRESENT',
    'ZIP CODE',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT TIME OF SALE',
]
df = df.drop(columns=unused_categoricals)

In [None]:
df.head()

In [None]:
# Categorical columns to one-hot encode.
s1 = ['BOROUGH', 'BUILDING CLASS CATEGORY', 'TAX CLASS AT TIME OF SALE']

categorical_part = df.loc[:, s1]
one_hot_encoded_training_set = pd.get_dummies(categorical_part)
one_hot_encoded_training_set.head()

In [None]:
# Swap the categorical columns for their dummy-variable encoding.
numeric_part = df.drop(columns=s1)
df2 = pd.concat([numeric_part, one_hot_encoded_training_set], axis=1)
df2

In [None]:
df2.describe()

Dataset without outliers

In [None]:
# Outlier-free subset: price below 50,000,000 and gross area strictly between 0 and 10,000.
price_ok = df2['SALE PRICE'].lt(50000000)
area_ok = df2['GROSS SQUARE FEET'].gt(0) & df2['GROSS SQUARE FEET'].lt(10000)
df3 = df2[price_ok & area_ok]
df3.describe()

# 1.1 DECISION TREE / RANDOM FOREST

In [None]:
 def lin_regplot(X, y, model):
     """Scatter the observations in blue and draw the model's predictions for X as a red line."""
     plt.scatter(X, y, c='blue')
     fitted = model.predict(X)
     plt.plot(X, fitted, color='red')

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree of SALE PRICE on GROSS SQUARE FEET alone (a depth of 3 was OK).
X = df2[['GROSS SQUARE FEET']].values
y = df2['SALE PRICE'].values
tree = DecisionTreeRegressor(max_depth=3).fit(X, y)

# Sort by the single feature so the prediction line is drawn left to right.
sort_idx = np.argsort(X.flatten())
lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.xlabel('GROSS SQUARE FEET')
plt.ylabel('SALE PRICE')
plt.show()

In [None]:
# Random forest on every feature of df2, evaluated on a 60/40 split.
X = df2.iloc[:, df2.columns != 'SALE PRICE'].values
y = df2['SALE PRICE'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

from sklearn.ensemble import RandomForestRegressor
# The split criterion is left at its default, which is the mean squared error.
# Spelling it as 'mse' is rejected by scikit-learn >= 1.2, and the newer name
# 'squared_error' is unknown to releases before 1.0.
forest = RandomForestRegressor(n_estimators=25,
                               random_state=1,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

### Feature importance with random forests

In [None]:
# Print the features ranked by random-forest importance, most important first.
# The labels exclude the target (it is not a column of X) and are looked up
# through `indices`, so each printed name belongs to the importance next to it.
feat_labels = df2.columns[df2.columns != 'SALE PRICE']
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

In [None]:
# Bar chart of the importances in descending order. The tick labels are built
# from the feature columns (target excluded) and reordered with `indices`, so
# every bar is labelled with its own feature.
sorted_labels = df2.columns[df2.columns != 'SALE PRICE'][indices]
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')
plt.xticks(range(X_train.shape[1]),
           sorted_labels, rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()

In [None]:
# Residuals (prediction minus truth) against predictions for both splits.
plt.scatter(y_train_pred,
            y_train_pred - y_train,
            c='black',
            marker='o',
            s=15,
            alpha=0.5,
            label='Training data')
plt.scatter(y_test_pred,
            y_test_pred - y_test,
            c='lightgreen',
            marker='s',
            s=15,
            alpha=0.7,
            label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
# Zero-residual reference line across the displayed prediction range.
plt.hlines(y=0, xmin=0, xmax=1000000, lw=2, color='red')
plt.xlim([0, 1000000])
# The residuals are in the units of SALE PRICE, so the window is sized like the
# x-axis; the previous +/-0.1 window hid practically every point.
plt.ylim([-1000000, 1000000])
plt.show()

## 1.2 DECISION TREE / RANDOM FOREST without outliers

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Same depth-3 regression tree, fitted on the outlier-free frame df3.
X = df3[['GROSS SQUARE FEET']].values
y = df3['SALE PRICE'].values
tree = DecisionTreeRegressor(max_depth=3).fit(X, y)

# Sort by the single feature so the prediction line is drawn left to right.
sort_idx = np.argsort(X.flatten())
lin_regplot(X[sort_idx], y[sort_idx], tree)
plt.xlabel('GROSS SQUARE FEET')
plt.ylabel('SALE PRICE')
plt.show()

In [None]:
# Random forest on every feature of the outlier-free frame df3 (60/40 split).
X = df3.iloc[:, df3.columns != 'SALE PRICE'].values
y = df3['SALE PRICE'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

from sklearn.ensemble import RandomForestRegressor
# The split criterion is left at its default, which is the mean squared error.
# Spelling it as 'mse' is rejected by scikit-learn >= 1.2, and the newer name
# 'squared_error' is unknown to releases before 1.0.
forest = RandomForestRegressor(n_estimators=100,
                               random_state=1,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
# Rank the features of the forest fitted above and keep the 12 most important.
# Labels come from df3 (the frame that forest was trained on), exclude the
# target, and are looked up through `indices`; indexing them by rank instead
# would simply pick the first columns of the frame.
top_n = 12
feat_labels = df3.columns[df3.columns != 'SALE PRICE']
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
important_rf = [feat_labels[i] for i in indices[:top_n]]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
# The cells that use this selection read the target from the reduced frame,
# so the target column is carried along with the selected features.
important_rf.append('SALE PRICE')
# NOTE(review): the later standardization of the reduced frame indexes it with
# every numeric column of df; verify those columns are all selected here, or
# restrict that cell to the columns that are present.

## 1.3 RANDOM FOREST WITH SELECTED FEATURES

In [None]:
# Reduced frame restricted to the selected columns.
df4 = df2.loc[:, important_rf]
df4

In [None]:
# Random forest on the reduced feature set df4, evaluated on an 80/20 split.
X = df4.iloc[:, df4.columns != 'SALE PRICE'].values
y = df4['SALE PRICE'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

from sklearn.ensemble import RandomForestRegressor
# The split criterion is left at its default, which is the mean squared error.
# Spelling it as 'mse' is rejected by scikit-learn >= 1.2, and the newer name
# 'squared_error' is unknown to releases before 1.0.
forest = RandomForestRegressor(n_estimators=12,
                               random_state=1,
                               n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

# 2.1 XGBOOST

In [None]:
import xgboost as xgb

# Gradient-boosted trees on every feature of df3, evaluated on a 60/40 split.
X = df3.iloc[:, df3.columns != 'SALE PRICE'].values
y = df3['SALE PRICE'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.15,
                          max_depth=5, alpha=10, n_estimators=25)
xg_reg.fit(X_train, y_train)

y_train_pred = xg_reg.predict(X_train)
y_test_pred = xg_reg.predict(X_test)
# `preds` is kept as a name for the test predictions; it previously came from
# a second, identical prediction pass over X_test.
preds = y_test_pred

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
# Wrap the full feature matrix and target in an xgboost DMatrix for the cross-validation below.
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [None]:
# Booster settings shared by the cross-validation and the importance plot.
params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 0.3,
    'learning_rate': 0.2,
    'max_depth': 5,
    'alpha': 10,
}

# 5-fold cross-validated RMSE, stopping early after 10 rounds without improvement.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    nfold=5,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

In [None]:
cv_results.head()

In [None]:
# Mean test RMSE of the last boosting round.
final_round_rmse = cv_results["test-rmse-mean"].tail(1)
print(final_round_rmse)

In [None]:
# The default figure size must be set before the figure is created; setting it
# after plot_importance (as before) leaves this plot at the old size.
plt.rcParams['figure.figsize'] = [15, 15]
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)
xgb.plot_importance(xg_reg)
plt.show()

In [None]:
df3

In [None]:
# Hand-picked numeric features plus the target, taken from the outlier-free frame.
selected_columns = [
    'RESIDENTIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'YEAR BUILT',
    'SALE PRICE',
]
df5 = df3[selected_columns]
df5

In [None]:
import xgboost as xgb

# Gradient-boosted trees on the hand-picked features of df5 (60/40 split).
X = df5.iloc[:, df5.columns != 'SALE PRICE'].values
y = df5['SALE PRICE'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.15,
                          max_depth=3, alpha=10, n_estimators=10)
xg_reg.fit(X_train, y_train)

y_train_pred = xg_reg.predict(X_train)
y_test_pred = xg_reg.predict(X_test)
# `preds` is kept as a name for the test predictions; it previously came from
# a second, identical prediction pass over X_test.
preds = y_test_pred

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

#### Before we move to next part we need to prepare our numerical data for KNN and LINEAR REGRESSION

## Standardization

In [None]:
# Numeric columns of df; their names drive the standardization below.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

In [None]:
# Work on a copy: a plain assignment would only alias df2, and the in-place
# standardization that follows would then silently rescale df2 as well.
df_std = df2.copy()
df_std.dtypes

In [None]:
# Standardize every numeric column of df_std (this includes the target SALE PRICE).
numeric_columns = list(numeric_data.columns)
scaler = StandardScaler()
scaled = scaler.fit_transform(df_std[numeric_columns])

for position, column in enumerate(numeric_columns):
    df_std[column] = scaled[:, position]

In [None]:
df_std

# 3. KNN

In [None]:
# Feature matrix (everything but the target) and target vector from the standardized frame.
X = df_std.drop(columns='SALE PRICE').values
y = df_std['SALE PRICE'].values

In [None]:
# Label-encode the target so that it can be passed to a classifier, then print
# how scikit-learn classifies the raw target, its integer cast and the encoded labels.
lab_enc = preprocessing.LabelEncoder()
encoded = lab_enc.fit_transform(y)
print(utils.multiclass.type_of_target(y))
print(utils.multiclass.type_of_target(y.astype('int')))
print(utils.multiclass.type_of_target(encoded))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the target is the label-encoded sale price, so the classifier
# treats every distinct price as its own class and the metrics below are
# computed on encoded labels rather than on prices. A KNeighborsRegressor on
# `y` may be what is wanted here - confirm before relying on these numbers.
# The split is seeded so the reported metrics are reproducible, like the
# random-forest and XGBoost cells.
X_train, X_test, y_train, y_test = train_test_split(X, encoded, test_size=0.20, random_state=1)
knn = KNeighborsClassifier(n_neighbors=50, p=2, metric='minkowski')
knn.fit(X_train, y_train)

y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
# Misclassification rate on the test split for each K from 30 to 49.
error = []
for k in range(30, 50):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    error.append(np.mean(knn.predict(X_test) != y_test))



In [None]:
# Error rate of the K sweep plotted against K.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(range(30, 50), error,
        color='red', linestyle='dashed',
        marker='o', markerfacecolor='blue', markersize=10)
ax.set_title('Error Rate K Value')
ax.set_xlabel('K Value')
ax.set_ylabel('Mean Error')

## SBS algorithm

In [None]:
#from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#sfs1 = SFS(knn, 
 #          k_features=3, 
  #         forward=True, 
   #        floating=False, 
    #       verbose=2,
     #      scoring='accuracy',
      #     cv=0)

#sfs1 = sfs1.fit(X, encoded)

# 4. LINEAR REGRESSION

In [None]:
# Numeric columns of df, used for the correlation heat map below.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

In [None]:
# Annotated heat map of the correlation coefficients between the numeric columns.
numeric_columns = numeric_data.columns
cm = np.corrcoef(df[numeric_columns].values.T)
sns.set(font_scale=1.5)
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 15},
    yticklabels=numeric_columns,
    xticklabels=numeric_columns,
)
hm = sns.heatmap(cm, **heatmap_options)
plt.show()

In [None]:
# Features and target for the linear models, taken from the standardized frame.
X = df_std.iloc[:, df_std.columns != 'SALE PRICE']
y = df_std['SALE PRICE'].values
# The split is seeded so the linear-model metrics are reproducible, like the
# random-forest and XGBoost cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

# Ordinary least squares on the standardized data.
linreg = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = linreg.predict(X_train), linreg.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
# Residuals (prediction minus truth) of the linear model for both splits.
train_residuals = y_train_pred - y_train
test_residuals = y_test_pred - y_test
plt.scatter(y_train_pred, train_residuals, c='blue', marker='o', label='Training data')
plt.scatter(y_test_pred, test_residuals, c='lightgreen', marker='s', label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.xlim([0, 30])
plt.ylim([-20, 20])
# Zero-residual reference line across the displayed range.
plt.hlines(y=0, xmin=0, xmax=30, lw=2, color='red')
plt.show()

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model fitted by stochastic gradient descent on the standardized data.
X = df_std.iloc[:, df_std.columns != 'SALE PRICE']
y = df_std['SALE PRICE'].values
# Split and optimizer are both seeded so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
# The loss is left at its default, the squared error. Its old name
# 'squared_loss' is rejected by scikit-learn >= 1.2, and the newer name
# 'squared_error' is unknown to releases before 1.0.
clf = SGDRegressor(alpha=0.1, max_iter=20, random_state=1) #penalty='l2')
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
from sklearn.linear_model import RANSACRegressor

# RANSAC wrapper around ordinary least squares.
ransac_options = dict(
    max_trials=10,
    min_samples=50,
    residual_threshold=5.0,
    random_state=123,
)
ransac = RANSACRegressor(LinearRegression(), **ransac_options).fit(X_train, y_train)
y_train_pred, y_test_pred = ransac.predict(X_train), ransac.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
from sklearn.linear_model import Ridge

# L2-regularized linear regression on the standardized data.
ridge = Ridge(alpha=0.00099, max_iter=1000).fit(X_train, y_train)
y_train_pred, y_test_pred = ridge.predict(X_train), ridge.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

In [None]:
from sklearn.linear_model import Lasso

# L1-regularized linear regression on the standardized data.
lasso = Lasso(alpha=0.00099, max_iter=1000).fit(X_train, y_train)
y_train_pred, y_test_pred = lasso.predict(X_train), lasso.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

## Last try with few features selected via RANDOM FOREST

In [None]:
# Numeric columns of df; their names drive the standardization of the reduced frame.
numeric_data = df.select_dtypes(include='number')
numeric_data.describe()

In [None]:
# Independent copy of the reduced frame, so the standardization below leaves df4 untouched.
df4_std=df4.copy()
df4_std

In [None]:
# Standardize the numeric columns that are present in the reduced frame.
# Only a subset of the features was selected for df4, so indexing with every
# numeric column of df would raise a KeyError as soon as one is missing.
numeric_columns = [column for column in numeric_data.columns if column in df4_std.columns]
scaler = StandardScaler()
scaler.fit(df4_std[numeric_columns])
scaled = scaler.transform(df4_std[numeric_columns])

for i, col in enumerate(numeric_columns):
    df4_std[col] = scaled[:, i]

In [None]:
df4_std

In [None]:
# Features and target from the standardized reduced frame.
X = df4_std.iloc[:, df4_std.columns != 'SALE PRICE']
y = df4_std['SALE PRICE'].values
# The split is seeded so the reported metrics are reproducible, like the
# random-forest and XGBoost cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
# Ridge regression on the reduced, standardized feature set.
ridge = Ridge(alpha=0.00099, max_iter=1000).fit(X_train, y_train)
y_train_pred, y_test_pred = ridge.predict(X_train), ridge.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('RMSE train: %.3f, test: %.3f' % (rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)))
print('MAE train: %.3f, test: %.3f' % (mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

# SUMMARY


**RANDOM FOREST (n_estimators=12)**
* MSE train: 32887397752547.805, test: 36782968316167.930
* RMSE train: 5734753.504, test: 6064896.398
* MAE train: 606061.501, test: 930501.404
* R^2 train: 0.802, test: 0.770

**XGBOOST (learning_rate = 0.15,max_depth = 3, alpha = 10, n_estimators = 10)**
* MSE train: 1868011735798.321, test: 1444052356572.113
* RMSE train: 1366752.258, test: 1201687.296
* MAE train: 455641.730, test: 443066.799
* R^2 train: 0.170, test: 0.166

**KNN (n_neighbors=50, and more probably would work)**
* MSE train: 4318301.989, test: 4536137.416
* RMSE train: 2078.052, test: 2129.821
* MAE train: 1454.516, test: 1509.331
* R^2 train: 0.131, test: 0.083

**LINEAR REGRESSION (Ridge(alpha=0.00099, max_iter=1000))**
* MSE train: 2654989.682, test: 2680561.178
* RMSE train: 1629.414, test: 1637.242
* MAE train: 1271.535, test: 1281.574
* R^2 train: 0.466, test: 0.458


We noted the best performance for the RANDOM FOREST model with selected features. The second best is RIDGE REGRESSION, for which we tried two models (with 63 features and with 12) with similar results. 
We could avoid overfitting by reducing the number of variables or by preparing the data in a different way, e.g. by examining outliers or using transformations. 