# Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, Binarizer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet

In [None]:
# Read the Amsterdam housing-price CSV into a DataFrame and display it
csv_path = '../input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv'
house = pd.read_csv(filepath_or_buffer=csv_path)
house

# Data Preprocessing

## Dataset Overview
|  Column |              Comment              |
|:-------:|:---------------------------------:|
| Address | Residential address               |
|   Zip   | Residential Zip code              |
|  Price  | Residential price in Euros        |
|   Area  | Residential area in square meters |
|   Room  | Number of rooms at residence      |
|   Lon   | Longitude coordinate              |
|   Lat   | Latitude coordinate               |

## Process Missing Values

In [None]:
# Count the missing (Null / NaN) entries in every column of 'house'
house.isna().sum()

In [None]:
# Fill missing values with the median of each numeric column.
# numeric_only=True keeps the median restricted to numeric columns: 'house'
# also holds text columns (e.g. 'Address'), for which a median is undefined
# and which pandas >= 2.0 refuses to reduce (TypeError) unless told to skip.
house.fillna(house.median(numeric_only=True), inplace=True)
house

In [None]:
# Check Dtype of 'house': print the per-column summary (dtype and non-null count)
house.info()

## Extract road names from address

In [None]:
# Extract road names.
# 'Road' holds the space-separated tokens of the text before the first comma
# in 'Address'; 'Road_Extract' concatenates only the purely alphabetic tokens,
# which drops house numbers and any token containing digits or punctuation.
house['Road'] = house['Address'].str.split(',', expand=True)[0].str.split(' ')

# Build the column in a single pass and address it by name. This avoids the
# hard-coded column positions (8 and 9), which break as soon as the CSV layout
# changes, and avoids writing strings into an empty, dtype-less Series.
house['Road_Extract'] = house['Road'].apply(
    lambda tokens: ''.join(token for token in tokens if token.isalpha())
)

house

In [None]:
# Compare which column has the fewest unique values.
columns_names = ['Address', 'Zip', 'Road_Extract']

for column in columns_names:
    # dropna=False counts NaN as its own value, exactly like len(unique())
    unique_count = house[column].nunique(dropna=False)
    print("Length of {0}: {1}".format(column, unique_count))

### **Comment**
The preprocessed column (`Road_Extract`) has the fewest unique values.  
Too many distinct categories can cause overfitting in prediction.  
Therefore, choosing the column with the fewest unique values is the more efficient option.

In [None]:
# Drop the columns that are no longer needed, promote 'Road_Extract' to be
# the new 'Road' column, renumber the index, then fix the column order.
house = (
    house
    .drop(columns=['Unnamed: 0', 'Address', 'Zip', 'Road'])
    .rename(columns={'Road_Extract': 'Road'})
    .reset_index(drop=True)
)
house = house[['Road', 'Area', 'Room', 'Lat', 'Lon', 'Price']]
house

## Check Distribution

In [None]:
# Check distribution of Area, Room and Price (one subplot per column)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) when upgrading.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))

for ax, column in zip((ax1, ax2, ax3), ('Area', 'Room', 'Price')):
    sns.distplot(house[column], ax=ax)
    ax.set_title('Distribution of {}'.format(column))

plt.suptitle('Distribution of features', fontweight='bold')
# tight_layout must be *called*; the bare attribute reference was a no-op
plt.tight_layout()
plt.show()

### **Comment**
Overall, as you can see, most of the features have skewed distributions.  
So, if we apply Standard Scaling, we may get better results.  
The distributions peak at about 100 square meters for Area and 3 rooms for Room.

# Create Datasets

In [None]:
# Apply Standard Scaling to Area and Room, using one scaler per feature
area_scaler, room_scaler = StandardScaler(), StandardScaler()

# The scalers expect 2-D input, hence the reshape to a single column
area_column = house['Area'].to_numpy().reshape(-1, 1)
room_column = house['Room'].to_numpy().reshape(-1, 1)
area_n = area_scaler.fit_transform(area_column)
room_n = room_scaler.fit_transform(room_column)

# Place the scaled features at column positions 3 and 4
house.insert(loc=3, column='Area_Scaled', value=area_n)
house.insert(loc=4, column='Room_Scaled', value=room_n)

house

In [None]:
# Check distribution of Area_Scaled, Room_Scaled (one subplot per column)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

for ax, column in zip((ax1, ax2), ('Area_Scaled', 'Room_Scaled')):
    sns.distplot(house[column], ax=ax)
    ax.set_title('Distribution of {}'.format(column))

plt.suptitle('Distribution of features scaled', fontweight='bold')
# tight_layout must be *called*; the bare attribute reference was a no-op
plt.tight_layout()
plt.show()

In [None]:
# Standard Scaling doesn't change the shape of the distributions.
# Converting with log1p may be a better idea.

area_n, room_n = np.log1p(house['Area']), np.log1p(house['Room'])

# Place the log-transformed features at column positions 5 and 6
house.insert(loc=5, column='Area_Log', value=area_n)
house.insert(loc=6, column='Room_Log', value=room_n)

house

In [None]:
# Check distribution of Area_Log, Room_Log (one subplot per column)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

for ax, column in zip((ax1, ax2), ('Area_Log', 'Room_Log')):
    sns.distplot(house[column], ax=ax)
    ax.set_title('Distribution of {}'.format(column))

plt.suptitle('Distribution of features applied of log', fontweight='bold')
# tight_layout must be *called*; the bare attribute reference was a no-op
plt.tight_layout()
plt.show()

### **Comment**
We plotted the distributions of the original, Standard Scaled and Log Scaled features.  
The last plot (Log scaling) shows the best distribution, i.e. the one closest to a Normal Distribution.

In [None]:
# Keep only the columns needed for training (the 'Price' label included)
training_columns = ['Road', 'Area_Log', 'Room_Log', 'Price', 'Lat', 'Lon']
house_train = house[training_columns]
house_train

In [None]:
# One-Hot encode the frame with pandas' default column selection
house_train = pd.get_dummies(data=house_train)
house_train

In [None]:
# Split the frame into the label y ('Price') and the feature matrix X (the rest)
y = house_train['Price']
X = house_train.drop(columns='Price')

print('Shape of X: ', X.shape)
print('Shape of y: ', y.shape)

In [None]:
# Hold out 20% of the rows as the test set; random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

print('Shape of X_train: ', X_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of y_test: ', y_test.shape)

# Regression

## Logistic Regression

In [None]:
# Baseline model: fit on the training split, then score on the hold-out split.
# NOTE(review): LogisticRegression is a classifier, so every distinct price is
# treated as a separate class. LinearRegression (already imported) is the usual
# baseline for a continuous target - confirm which model was intended.

lr_reg = LogisticRegression(solver='liblinear')
lr_reg.fit(X_train, y_train)
lr_preds = lr_reg.predict(X_test)

lr_mse = mean_squared_error(y_test, lr_preds)
lr_rmse = np.sqrt(lr_mse)

# The RMSE slot must receive lr_rmse; passing lr_mse twice printed the MSE
# under the RMSE label.
print('MSE : {0:.3f}, RMSE : {1:.3f}'.format(lr_mse, lr_rmse))
print('Variance score : {0:.3f}'.format(r2_score(y_test, lr_preds)))

## Ridge

In [None]:
# Ridge regression (alpha=10) evaluated with 5-fold cross validation
ridge = Ridge(alpha=10)
neg_mse_scores = cross_val_score(
    estimator=ridge,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=5,
)
# The scorer returns negated MSE values, so flip the sign before the square root
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

In [None]:
# Report the per-fold scores (rounded to 2 decimals) and the mean RMSE
print('Individual Negative MSE scores of cross validation 5 times: ', neg_mse_scores.round(2))
print('Individual RMSE scores of cross validation 5 times: ', rmse_scores.round(2))
print('Average of RMSE scores of cross validation 5 times : {0:.3f} '.format(avg_rmse))

# Report
**Summary**  
* The dataset has few features available for training.
* With a wider variety of features, I could have obtained better results.

**Comment**  
1. I tried to reduce the number of unique values of 'Address' (919 --> 759)
- Length of Address: 919
- Length of Zip: 834
- Length of Road_Extract: 759

2. I also tried to bring the features closer to a Normal Distribution by comparing 3 methods (original, Standard Scaling, Log scaling).
3. I made a Ridge model to regularize the features.  
Compared with Logistic Regression, the RMSE score decreased from about 568553 to 376525.286 (the value 323252587891.892 reported for Logistic Regression is its MSE; the RMSE is its square root).  
Still, the evaluation scores of my models are poor. I need to improve them.