In [None]:
#Import bibliotek
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Boston housing data that ships with scikit-learn into a DataFrame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release — confirm the pinned version.
from sklearn.datasets import load_boston

boston_dataset = load_boston()
feature_names = boston_dataset.feature_names
boston = pd.DataFrame(data=boston_dataset.data, columns=feature_names)

In [None]:
# Preview the first rows of the feature table
boston.head()

In [None]:
# Add the prediction target (MEDV) to the frame as a new column
boston['MEDV'] = boston_dataset.target

Kolumny zbioru danych:
* CRIM - per capita crime rate by town
* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS - proportion of non-retail business acres per town.
* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* NOX - nitric oxides concentration (parts per 10 million)
* RM - average number of rooms per dwelling
* AGE - proportion of owner-occupied units built prior to 1940
* DIS - weighted distances to five Boston employment centres
* RAD - index of accessibility to radial highways
* TAX - full-value property-tax rate per \$10,000
* PTRATIO - pupil-teacher ratio by town
* B - $1000(Bk - 0.63)^2$ where Bk is the proportion of blacks by town
* LSTAT - % lower status of the population
* MEDV - Median value of owner-occupied homes in \$1000's

In [None]:
# Count the missing values in each column
boston.isnull().sum()

In [None]:
# Basic summary statistics for every column of the Boston dataset
boston.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = boston.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, cmap='twilight_shifted')

In [None]:
# Show only the strong correlations (|r| >= CORR_THRESHOLD) between *different*
# columns so that collinear feature pairs stand out; every other cell is zeroed.
CORR_THRESHOLD = 0.75

corr_abs = boston.corr().abs()
# Hide the self-correlations by position instead of comparing floats to 1:
# this is robust to rounding and does not hide perfectly collinear pairs.
is_diagonal = np.eye(len(corr_abs), dtype=bool)
boston_filtr = corr_abs.mask((corr_abs < CORR_THRESHOLD) | is_diagonal, 0)

plt.figure(figsize=(20, 10))
sns.heatmap(boston_filtr,  annot=True)


Ze względu na silną korelację odrzucam zmienne NOX i RAD podczas dalszej analizy.

In [None]:
# Drop the strongly correlated features NOX and RAD.
# errors='ignore' keeps the cell idempotent: because `boston` is rebound here,
# re-running the cell would otherwise raise KeyError for the missing columns.
col_out = ['NOX', 'RAD']
boston = boston.drop(columns=col_out, errors='ignore')

**Regresja liniowa**

In [None]:
# Split into features / target and then into training and test sets.
# The target is selected by name so the split does not depend on column order.
X = boston.drop(columns='MEDV')
y = boston['MEDV']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Fit the linear regression model
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)

from sklearn.metrics import r2_score
# R2 on the training data
y_pred_linear_train = lm.predict(X_train)
r2_score_linear_train = r2_score(y_train, y_pred_linear_train)
# R2 on the test data
y_pred_linear_test = lm.predict(X_test)
r2_score_linear_test = r2_score(y_test, y_pred_linear_test)
# Root mean squared error (test set)
from sklearn.metrics import mean_squared_error as mse
rmse_linear = np.sqrt(mse(y_test, y_pred_linear_test))
# Mean absolute error (test set)
from sklearn.metrics import mean_absolute_error as mae
mae_linear = mae(y_test, y_pred_linear_test)

print('R2 - treningowe: ', r2_score_linear_train)
print('R2 - testowe: ', r2_score_linear_test)
print("RMSE: ", rmse_linear)
print("MAE: ", mae_linear)

# Fit plot on the training data; reuses the predictions computed above instead
# of calling lm.predict(X_train) a second time.
plt.scatter(y_train, y_pred_linear_train)
plt.xlabel("Rzeczywiste")
plt.ylabel("Predykowane")
plt.title("Wykres dopasowania")
plt.show()

**Krzywe uczenia - regresja liniowa**

In [None]:
def learning_curves(model, X, y, test_size=0.3, random_state=7):
    """Plot RMSE learning curves for ``model`` on a train/validation split.

    The data are split once; a copy of ``model`` is then refitted on the first
    1, 2, ..., n rows of the training part, and the RMSE on those rows and on
    the whole validation part is recorded after every fit.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Only a deep copy is fitted, so the caller's estimator (and any fitted
        state it carries) is left untouched.
    X, y : features and target; must support ``[:i]`` row slicing.
    test_size : float, default 0.3
        Fraction of the data held out for validation.
    random_state : int or None, default 7
        Seed for the shuffled split so the figure is reproducible; pass None
        for a different split on every call.

    Returns
    -------
    None. The curves are drawn on the current matplotlib axes.
    """
    import copy

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state)
    estimator = copy.deepcopy(model)
    train_errors, test_errors = [], []
    # "+ 1" so the last point is trained on the complete training set
    for i in range(1, len(X_train) + 1):
        estimator.fit(X_train[:i], y_train[:i])
        train_errors.append(mse(y_train[:i], estimator.predict(X_train[:i])))
        test_errors.append(mse(y_test, estimator.predict(X_test)))
    # x axis = number of training rows used for the corresponding fit
    sizes = np.arange(1, len(train_errors) + 1)
    plt.plot(sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="Zestaw uczący")
    plt.plot(sizes, np.sqrt(test_errors), "b-", linewidth=3, label="Zestaw walidacyjny")
    plt.xlabel("Rozmiar zbioru uczącego")
    plt.ylabel("RMSE")
    plt.legend(frameon=True)
    
# Draw the learning curves for the linear regression model
learning_curves(lm, X, y)

**Regresja grzbietowa**

In [None]:
from sklearn.linear_model import Ridge, Lasso

# Ridge regression with alpha=1, solved with the Cholesky solver
ridge_reg = Ridge(alpha=1, solver = "cholesky")
ridge_reg.fit(X_train,y_train)
pred_ridge = ridge_reg.predict(X_test)

# Test-set metrics: mean squared error, mean absolute error,
# coefficient of determination and root mean squared error
mse_ridge = mse(y_test, pred_ridge)
ridge_metrics = {
    'MSE:': mse_ridge,
    'MAE:': mae(y_test, pred_ridge),
    'R2:': ridge_reg.score(X_test,y_test),
    'RMSE:': np.sqrt(mse_ridge),
}
for metric_label, metric_value in ridge_metrics.items():
    print(metric_label, metric_value)

**Regresja metodą LASSO**

In [None]:
# LASSO regression with alpha=0.1
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train,y_train)
pred_lasso = lasso_reg.predict(X_test)

# Test-set metrics: mean squared error, mean absolute error,
# coefficient of determination and root mean squared error
mse_lasso = mse(y_test, pred_lasso)
lasso_metrics = {
    'MSE:': mse_lasso,
    'MAE:': mae(y_test, pred_lasso),
    'R2:': lasso_reg.score(X_test,y_test),
    'RMSE:': np.sqrt(mse_lasso),
}
for metric_label, metric_value in lasso_metrics.items():
    print(metric_label, metric_value)

**Regresor maszyny wektorów nośnych**

Porównanie dla różnych wartości kernel, C oraz gamma

In [None]:
from sklearn import svm
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# First configuration: linear kernel, C=0.01.
# The features are standardised inside the pipeline before they reach the SVR.
svm_reg = Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='linear',C=0.01))])
svm_reg.fit(X_train, y_train)
y_pred = svm_reg.predict(X_test)
# Mean squared error (computed once, reused for the RMSE below)
mse_svr = mse(y_test, y_pred)
print('MSE:', mse_svr)
# Mean absolute error
print('MAE:', mae(y_test, y_pred))
# Coefficient of determination
print('R2:', svm_reg.score(X_test,y_test))
# Root mean squared error — this line used to be printed under the label 'MSE:'
print('RMSE:', np.sqrt(mse_svr))

In [None]:
# Second configuration: linear kernel, C=100.
# The features are standardised inside the pipeline before they reach the SVR.
svm_reg = Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='linear',C=100))])
svm_reg.fit(X_train, y_train)
y_pred = svm_reg.predict(X_test)
# Mean squared error (computed once, reused for the RMSE below)
mse_svr = mse(y_test, y_pred)
print('MSE:', mse_svr)
# Mean absolute error
print('MAE:', mae(y_test, y_pred))
# Coefficient of determination
print('R2:', svm_reg.score(X_test,y_test))
# Root mean squared error — this line used to be printed under the label 'MSE:'
print('RMSE:', np.sqrt(mse_svr))

In [None]:
# Third configuration: RBF kernel, C=0.02, gamma=0.1 (features standardised first)
svr_steps = [('scaler', StandardScaler()), ('svr', SVR(kernel='rbf',C=0.02, gamma=0.1))]
svm_reg = Pipeline(svr_steps)
svm_reg.fit(X_train, y_train)
y_pred = svm_reg.predict(X_test)

# Test-set metrics: mean squared error, mean absolute error,
# coefficient of determination and root mean squared error
mse_svr = mse(y_test, y_pred)
svr_metrics = {
    'MSE:': mse_svr,
    'MAE:': mae(y_test, y_pred),
    'R2:': svm_reg.score(X_test,y_test),
    'RMSE:': np.sqrt(mse_svr),
}
for metric_label, metric_value in svr_metrics.items():
    print(metric_label, metric_value)

In [None]:
# Fourth configuration: RBF kernel, C=2, gamma=10 (features standardised first)
svr_steps = [('scaler', StandardScaler()), ('svr', SVR(kernel='rbf',C=2, gamma=10))]
svm_reg = Pipeline(svr_steps)
svm_reg.fit(X_train, y_train)
y_pred = svm_reg.predict(X_test)

# Test-set metrics: mean squared error, mean absolute error,
# coefficient of determination and root mean squared error
mse_svr = mse(y_test, y_pred)
svr_metrics = {
    'MSE:': mse_svr,
    'MAE:': mae(y_test, y_pred),
    'R2:': svm_reg.score(X_test,y_test),
    'RMSE:': np.sqrt(mse_svr),
}
for metric_label, metric_value in svr_metrics.items():
    print(metric_label, metric_value)

Dobór najlepszych parametrów - GridSearchCV oraz RandomizedSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
def fit_model(x, v, search_seed=7):
    """Tune an SVR with a grid search and a randomized search; return the better one.

    The data are split 80/20 with random_state=7 (the same settings as the
    notebook's main train/test split) and only the training part is used for
    the searches. The two searches are compared on their cross-validated score
    (``best_score_``) rather than on the test set, so the held-out rows never
    influence which model is selected.

    Parameters
    ----------
    x : feature matrix
    v : target values
    search_seed : int or None, default 7
        Seed for the randomized search so its sampled candidates are
        reproducible; pass None for a different sample on every call.

    Returns
    -------
    The ``best_estimator_`` (refitted on the training part) of the search with
    the higher cross-validated score; the randomized search wins ties.
    """
    # The test part is deliberately discarded here: selection must not see it.
    X_train, _, y_train, _ = train_test_split(x, v, test_size=0.2, random_state=7)
    parameters = {'kernel':('linear', 'rbf'),'gamma':(np.logspace(-3, 3, 10)),'C':[1, 10, 20, 30, 40, 50]}
    svr = SVR()
    grid = GridSearchCV(svr, parameters)
    grid.fit(X_train, y_train)
    rand = RandomizedSearchCV(svr, parameters, random_state=search_seed)
    rand.fit(X_train, y_train)
    # NOTE(review): the randomized search samples from the same grid, so with
    # identical CV folds it should not beat the exhaustive search; it is kept
    # as a cheaper cross-check of the grid result.
    if grid.best_score_ > rand.best_score_:
        return grid.best_estimator_
    return rand.best_estimator_

# Tune the SVR, then report the chosen hyper-parameters and the test-set R2
reg = fit_model(X,y)
best_params = reg.get_params()
print(best_params)
r2_best = reg.score(X_test,y_test)
print('R2:', r2_best)

In [None]:
# Predict the test set with the estimator returned by fit_model
y_pred = reg.predict(X_test)

In [None]:
# Compare the predicted values with the actual ones on the test set
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Rzeczywiste")
ax.set_ylabel("Predykowane")
ax.set_title("Wykres dopasowania")
plt.show()

In [None]:
# Test-set metrics for the tuned model: mean squared error, mean absolute
# error, coefficient of determination and root mean squared error
mse_best = mse(y_test, y_pred)
best_metrics = {
    'MSE:': mse_best,
    'MAE:': mae(y_test, y_pred),
    'R2:': r2_score(y_test,y_pred),
    'RMSE:': np.sqrt(mse_best),
}
for metric_label, metric_value in best_metrics.items():
    print(metric_label, metric_value)