# Predykcja cen nieruchomości oparta na modelu maszyny wektorów nośnych

## Wstęp

Dokumentacja będzie skupiała się na rozwiązaniu problemu przewidywania cen nieruchomości z perspektywy potencjalnego kupującego oraz sprzedającego.


A house's value is more than just its location and square footage. Just as many traits make up a person, many features make up a house, and an informed buyer or seller would want to know every aspect that contributes to its value.

We are going to take advantage of all of the feature variables available to us and use them to analyze and predict house prices.

We are going to break everything into logical steps that allow us to ensure the cleanest, most realistic data for our model to make accurate predictions from.

1. Load Data and Packages
2. Analyzing the Target Variable (`medv`, the median home value)
3. Multivariable Analysis
4. Impute Missing Data and Clean Data
5. Feature Transformation/Engineering
6. Modeling and Predictions

# 1. Load Data and Packages

In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
# Gradient-boosting libraries are currently disabled
#import xgboost as xgb
#import lightgbm as lgb
# Statistics helpers
from scipy.stats import skew
from scipy import stats
# NOTE(review): `scipy.stats.stats` is a deprecated private namespace in recent
# SciPy releases; the public path is `scipy.stats.pearsonr` - consider switching.
from scipy.stats.stats import pearsonr
from scipy.stats import norm
from collections import Counter
# scikit-learn estimators, model-selection utilities and scalers
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries above).
warnings.filterwarnings('ignore')
# Apply a global seaborn theme to all subsequent plots
sns.set(style='white', context='notebook', palette='deep')
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [None]:
# Read the training and test sets from the local boston-housing directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./boston-housing/train.csv", "./boston-housing/test.csv")
)

In [None]:
# Report the shape of both frames while the 'ID' column is still present
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

# Keep the identifiers around before the column is removed
train_ID, test_ID = train['ID'], test['ID']

# 'ID' carries no predictive information, so remove it from both frames in place
train.drop(columns="ID", inplace=True)
test.drop(columns="ID", inplace=True)

# Report the shapes again, now without the 'ID' column
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(test.shape))

In [None]:
# Preview the first five rows of the training frame
train.head(5)

In [None]:
# Preview the first five rows of the test frame
test.head(5)

In [None]:
# Summary statistics of the target column
train.medv.describe()

In [None]:
# Histogram of the target ('medv') with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it is kept here
# because histplot/displot offer no direct equivalent of `fit=` - confirm the
# installed seaborn version still provides it.
sns.distplot(train['medv'] , fit=norm);

# Fit a normal distribution to obtain the parameters shown in the legend
(mu, sigma) = norm.fit(train['medv'])
# Raw string: '\m' and '\s' are invalid escape sequences in a regular literal
# (DeprecationWarning, SyntaxWarning on newer Pythons); the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Median Value distribution')

# Probability plot of the target on a separate figure
fig = plt.figure()
res = stats.probplot(train['medv'], plot=plt)
plt.show()

# Shape statistics of the target distribution
print("Skewness: %f" % train['medv'].skew())
print("Kurtosis: %f" % train['medv'].kurt())

# 3. Multivariable Analysis

In [None]:
# Pairwise correlations of all training columns, drawn as an annotated heatmap
corrmat = train.corr()
f, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    corrmat,
    annot=True,
    fmt='.2f',
    square=True,
    center=0,
    vmin=-1,
    vmax=1,
    ax=ax,  # the axes just created, which is also the current axes
);

In [None]:
# The 14 columns with the largest absolute correlation to 'medv'
# ('medv' itself is part of the correlation matrix, so it is included).
cols = corrmat.abs().nlargest(14, 'medv')['medv'].index
most_corr = pd.DataFrame({'Most Correlated Features': cols})
most_corr

**Zmienna `black`**

In [None]:
# Scatter with regression fit: 'black' against the target
sns.jointplot(data=train, x='black', y='medv', kind='reg');

In [None]:
# Box plot of the target for each distinct value of 'black'
var = 'black'
data = train[['medv', var]]
f, ax = plt.subplots(figsize=(25, 12))
fig = sns.boxplot(data=data, x=var, y="medv", ax=ax)
# Fix the y-range so the box plots in this notebook share one scale
fig.axis(ymin=0, ymax=52);

**Fatalne dane**

In [None]:
# Remove the 'black' column from both frames (in place)
for frame in (train, test):
    frame.drop(columns=['black'], inplace=True)

In [None]:
# Also remove the river indicator 'chas' - judged equally unhelpful
for frame in (train, test):
    frame.drop(columns=['chas'], inplace=True)

**Dane o najwyższej wartości bezwzględnej korelacji**

In [None]:
# Scatter with regression fit: 'lstat' against the target
sns.jointplot(data=train, x='lstat', y='medv', kind='reg');
# Row count before any outliers are removed
print(len(train))

In [None]:
# Manual outlier removal: rows with medv above 49.99 and lstat above 8.
# Filtering with the negated mask matches dropping those index labels because
# the frame's index is unique at this point.
is_outlier = (train['medv'] > 49.99) & (train['lstat'] > 8)
train = train[~is_outlier].reset_index(drop=True)

In [None]:
# Same 'lstat' plot after the outlier rows were removed
sns.jointplot(data=train, x='lstat', y='medv', kind='reg');
# Pearson correlation between 'lstat' and the target, then the new row count
print(pearsonr(train['lstat'], train['medv']))
print(len(train))

In [None]:
# Scatter with regression fit: 'rm' against the target
sns.jointplot(data=train, x='rm', y='medv', kind='reg');

In [None]:
# Room size can legitimately vary, so outlying observations are not removed here
# train = train.drop(train[(train['medv'] > 8 * train['rm'] - 8)].index).reset_index(drop=True)

In [None]:
# sns.jointplot(x=train['rm'], y=train['medv'], kind='reg');
# print(pearsonr(train['rm'], train['medv']));
# print(train.shape[0]);

In [None]:
# Scatter with regression fit: 'ptratio' against the target
sns.jointplot(data=train, x='ptratio', y='medv', kind='reg');
# A single town may contain many regions with different characteristics;
# the ptratio value is shared by every region of a town - poor and rich alike.
# Even so, a strong negative correlation between the two quantities is visible,
# and it is hard to justify discarding any observations.

In [None]:
# Box plot of the target for each distinct value of 'ptratio'
var = 'ptratio'
data = train[['medv', var]]
f, ax = plt.subplots(figsize=(25, 12))
fig = sns.boxplot(data=data, x=var, y="medv", ax=ax)
# Fix the y-range so the box plots in this notebook share one scale
fig.axis(ymin=0, ymax=52);

In [None]:
# Scatter with regression fit: 'indus' against the target
sns.jointplot(data=train, x='indus', y='medv', kind='reg');

In [None]:
# Box plot of the target for each distinct value of 'indus'
var = 'indus'
data = train[['medv', var]]
f, ax = plt.subplots(figsize=(25, 12))
fig = sns.boxplot(data=data, x=var, y="medv", ax=ax)
# Fix the y-range so the box plots in this notebook share one scale
fig.axis(ymin=0, ymax=52);

In [None]:
# Scatter with regression fit: 'tax' against the target
sns.jointplot(data=train, x='tax', y='medv', kind='reg');

In [None]:
# Box plot of the target for each distinct value of 'tax'
var = 'tax'
data = train[['medv', var]]
f, ax = plt.subplots(figsize=(25, 12))
fig = sns.boxplot(data=data, x=var, y="medv", ax=ax)
# Fix the y-range so the box plots in this notebook share one scale
fig.axis(ymin=0, ymax=52);

# Uzupełnianie danych - nasze dane są kompletne

In [None]:
# Apply log(1 + x) to the target column via numpy's log1p.
# NOTE: this overwrites 'medv' in place, so re-running the cell transforms the
# column a second time.
train["medv"] = np.log1p(train["medv"])

# Plot the distribution of the transformed target with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it is kept here
# because histplot/displot offer no direct equivalent of `fit=`.
sns.distplot(train['medv'] , fit=norm);

# Fit a normal distribution to obtain the parameters shown in the legend
(mu, sigma) = norm.fit(train['medv'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: '\m' and '\s' are invalid escape sequences in a regular literal
# (DeprecationWarning, SyntaxWarning on newer Pythons); the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
# The plotted column is log1p(medv); the previous 'SalePrice' title was a
# leftover that did not match this dataset's target.
plt.title('log1p(Median Value) distribution')

# Probability plot of the transformed target on a separate figure
fig = plt.figure()
res = stats.probplot(train['medv'], plot=plt)
plt.show()

# Transformed target as a plain array
y_train = train.medv.values

# Shape statistics after the transform
print("Skewness: %f" % train['medv'].skew())
print("Kurtosis: %f" % train['medv'].kurt())

In [None]:
# Remember how many rows each frame has so the stacked frame can be split later
ntrain, ntest = len(train), len(test)

# Stack train on top of test with a fresh 0..n-1 index, then remove the target
# column so only the feature columns remain.
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['medv'])
)

print("Train data size is : {}".format(train.shape))
print("Test data size is : {}".format(test.shape))
print("Combined dataset size is : {}".format(all_data.shape))

In [None]:
# Columns whose dtype is anything other than "object"
is_numeric = all_data.dtypes != "object"
numeric_feats = is_numeric[is_numeric].index


def _column_skew(column):
    """Return the skewness of one column, ignoring missing values."""
    return skew(column.dropna())


# Skewness per numeric column, largest first
skewed_feats = all_data[numeric_feats].apply(_column_skew).sort_values(ascending=False)
skewness = pd.DataFrame({'Positive skewed Features' :skewed_feats})
skewness.head(11)

In [None]:
from scipy.special import boxcox1p

# Keep only the features whose skewness exceeds 0.8: boolean-frame indexing
# turns the entries that fail the comparison into NaN, and dropna removes them.
skewness = skewness[skewness > 0.8].dropna()

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    # Box-Cox transform of (1 + x) with the fixed lambda, then shift by one
    all_data[feat] = boxcox1p(all_data[feat], lam) + 1

In [None]:
def _column_skew(column):
    """Return the skewness of one column, ignoring missing values."""
    return skew(column.dropna())


# Recompute the skewness of every numeric column after the transform, largest first
skewed_feats = all_data[numeric_feats].apply(_column_skew).sort_values(ascending=False)
skewness = pd.DataFrame({'Positive skewed Features' :skewed_feats})
skewness.head(11)

In [None]:
# Split the stacked feature frame back apart: the first ntrain rows are the
# training features, the remainder the test features.
train, test = all_data[:ntrain], all_data[ntrain:]