<a href="https://colab.research.google.com/github/serivan/mldmlab/blob/master/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression

## Import libraries

The following libraries are required:
 - pandas
 - numpy
 - scikit-learn
 - matplotlib (pyplot)
 - seaborn
 - xgboost

In [None]:
import pandas as pd
from sklearn.datasets import load_boston
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix
from sklearn.base import TransformerMixin, RegressorMixin
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

In [None]:
# Set the default figure size (width, height in inches) for the plots below.
sns.set(rc={'figure.figsize':(11.7,8.27)})

## Import dataset

This is a sample dataset bundled with scikit-learn. It contains information about various houses in Boston, described through a set of different parameters.

In [None]:
# Load the Boston housing dataset object that the following cells inspect.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and, as far as
# I know, removed in 1.2 — this cell and its import only run on older releases;
# confirm the pinned scikit-learn version.
boston = load_boston()

In [None]:
# List the fields exposed by the loaded dataset object.
print(boston.keys())

In [None]:
# Print the description text that ships with the dataset.
print(boston.DESCR)

In [None]:
# Working table: one column per dataset feature, with the regression target
# appended as the last column under the name 'PRICE'.
df = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
).assign(PRICE=boston.target)

In [None]:
# Render the table as the cell output.
# NOTE(review): `display` is not imported here — presumably the notebook
# (IPython) built-in; confirm if this is ever run as a plain script.
display(df)

## Dataset preprocessing


In [None]:
# Number of missing values in each column (shown as the cell output).
df.isna().sum()

In [None]:
# Show pandas' summary statistics for the table (cell output).
df.describe()

In [None]:
# Distribution of the target column, split into 50 bins.
plt.hist(x=df["PRICE"], bins=50)
plt.xlabel(xlabel="Prezzi in 1000$")
plt.show()

In [None]:
# Pairwise column correlations, rounded to two decimals so the cell
# annotations in the heatmap stay readable.
correlation_matrix = df.corr().round(decimals=2)
sns.heatmap(correlation_matrix, annot=True)

In [None]:
# Scatter each selected feature against the price, one subplot per feature.
features = ['LSTAT', 'RM', 'NOX', 'ZN']
target = df['PRICE']

# Grid size is computed once, as integers: plt.subplot() rejects float row /
# column counts, which is what the previous `len(features)/2` produced.
# A near-square layout keeps the 2x2 arrangement for four features and still
# has enough cells for any other number of features.
n_cols = max(1, int(np.ceil(np.sqrt(len(features)))))
n_rows = max(1, int(np.ceil(len(features) / n_cols)))

for i, col in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)  # subplot indices are 1-based
    plt.scatter(df[col], target, marker='o')
    plt.xlabel(col)
    plt.ylabel('Prezzi in 1000$')

## Custom functions

In [None]:
def plot_results(plot_type: str, 
                 res_list : list, 
                 title='', 
                 fontsize = 20, 
                 save_fig=False, 
                 xlabel = '', 
                 ylabel = '', 
                 xticks=None, 
                 yticks=None,
                 grid = '', 
                 legend = True):
    """Draw one or more data series on a single new matplotlib figure.

    Args:
        plot_type: One of ``'scatter'``, ``'scatter-line'`` (line with markers)
            or ``'vertical_hist'`` (horizontal bars, first entry on top).
        res_list: Series to draw. Each entry is either ``[y, label]`` (x is
            taken as ``0..len(y)-1``) or ``[x, y, label]``.
        title: Figure title; also used as the file name when saving.
        fontsize: Font size for title, axis labels and ticks (the legend uses
            ``fontsize - 4``).
        save_fig: When true, write the figure to ``./results/fig/<title>.png``.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        xticks: Optional tick labels for the x axis, placed at 0, 1, 2, ...
        yticks: Optional tick labels for the y axis, placed at 0, 1, 2, ...
        grid: Axis argument forwarded to ``ax.grid`` (``'x'``, ``'y'`` or
            ``'both'``); an empty string draws no grid.
        legend: Whether to show a legend (only drawn if some series is labelled).

    Raises:
        ValueError: If ``plot_type`` is not supported, or an entry of
            ``res_list`` does not have 2 or 3 elements.
    """
    # Imported locally because the notebook's import cell does not provide `os`.
    import os

    supported_types = ('scatter', 'scatter-line', 'vertical_hist')
    if plot_type not in supported_types:
        # Fail loudly instead of silently producing an empty figure.
        raise ValueError(
            'Unsupported plot_type {0!r}; expected one of {1}'.format(plot_type, supported_types))

    fig, ax = plt.subplots()

    for entry in res_list:
        # Inspect every entry on its own rather than assuming they all look
        # like the first one.
        if len(entry) == 2:
            y, label = entry
            x = np.arange(len(y))
        elif len(entry) == 3:
            x, y, label = entry
        else:
            raise ValueError(
                'Each res_list entry must be [y, label] or [x, y, label], got {0} elements'.format(len(entry)))

        if plot_type == 'scatter':
            ax.scatter(x, y, label=label)
        elif plot_type == 'scatter-line':
            ax.plot(x, y, label=label)
            ax.scatter(x, y)
        else:  # 'vertical_hist'
            ax.barh(x, y, align='center')

    if grid != '':
        ax.grid(axis=grid)
    ax.set_title(title, fontsize=fontsize)
    ax.tick_params(labelsize=fontsize)
    if legend:
        # Bars are drawn without labels; skip the legend when there is nothing
        # to list, which avoids matplotlib's "no artists with labels" warning.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(fontsize=fontsize-4, bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_xlabel(xlabel, fontsize = fontsize)
    ax.set_ylabel(ylabel, fontsize= fontsize)

    if xticks is not None and len(xticks) > 0:
        ax.set_xticks(np.arange(len(xticks)))
        ax.set_xticklabels(xticks)

    if yticks is not None and len(yticks) > 0:
        ax.set_yticks(np.arange(len(yticks)))
        ax.set_yticklabels(yticks)

    if plot_type == 'vertical_hist':
        ax.invert_yaxis()  # labels read top-to-bottom

    if save_fig and title != '': 
        folder_path = './results/fig'
        # Create the output folder on demand; a no-op when it already exists.
        os.makedirs(folder_path, exist_ok=True)
        fig.savefig(os.path.join(folder_path, '{0}.png'.format(title)))
    elif save_fig:
        # The title doubles as the file name, so an untitled figure cannot be saved.
        print("L'immagine ha bisogno di un titolo per essere salvata")

In [None]:
def eval_model(X : pd.DataFrame, 
               Y : pd.Series, 
               scaler : TransformerMixin, 
               regressor : RegressorMixin, 
               cols_to_drop : "list | None" = None):
    """Fit ``scaler`` + ``regressor`` on a train split and report test errors.

    The data is split 80/20 with a fixed random seed, a pipeline made of
    ``scaler`` followed by ``regressor`` is fitted on the training part, MAE
    and RMSE on the test part are printed, and actual vs. predicted values
    (ordered by actual value) are drawn with ``plot_results``.

    Args:
        X: Feature table.
        Y: Target values aligned with the rows of ``X``.
        scaler: Transformer placed first in the pipeline.
        regressor: Estimator placed last in the pipeline.
        cols_to_drop: Columns of ``X`` to exclude before splitting; ``None``
            (the default) or an empty list keeps every column.

    Returns:
        A tuple ``(X_test, Y_test, pipe)``: the held-out features, the
        held-out targets and the fitted pipeline.
    """
    X = X.drop([] if cols_to_drop is None else cols_to_drop, axis=1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=4993)
    pipe  = make_pipeline(scaler, regressor)
    pipe.fit(X_train, Y_train)

    # Predict once and reuse the result for both metrics and the plot.
    Y_pred = pipe.predict(X_test)
    print(f'MAE on test set : {mean_absolute_error(Y_test, Y_pred)}')
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated / removed in recent scikit-learn.
    print(f'rMSE on test set : {np.sqrt(mean_squared_error(Y_test, Y_pred))}')

    # Order both series by the actual value so the scatter reads left to right
    # from the cheapest to the most expensive sample.
    comparison = pd.DataFrame({'actual': np.ravel(Y_test), 'pred': np.ravel(Y_pred)})
    comparison = comparison.sort_values(by='actual')
    plot_results('scatter', [[comparison['actual'], 'actual'], [comparison['pred'], 'pred']])
    return X_test, Y_test, pipe

## Regression Models

In [None]:
# Separate the regression target from the feature columns.
y = df['PRICE']
X = df.drop(columns='PRICE')
    

In [None]:
# Linear regression on standardized features, with the listed columns
# excluded from the feature table.
scaler = StandardScaler()
regressor = LinearRegression()
X_test, Y_test, pipe = eval_model(
    X=X,
    Y=y,
    scaler=scaler,
    regressor=regressor,
    cols_to_drop=['CRIM', 'ZN', 'CHAS', 'NOX', 'AGE', 'DIS', 'RAD', 'TAX', 'B'],
)

In [None]:
# Bucket the continuous prices into four classes so the regression errors can
# be read as a confusion matrix: 0 -> <10, 1 -> [10, 30), 2 -> [30, 40), 3 -> >=40.
price_bins = [10, 30, 40]
Y_pred_test_class = np.digitize(pipe.predict(X_test), price_bins)
Y_true_test_class = np.digitize(Y_test, price_bins)
# Pin the label set: otherwise a class missing from the test split shrinks the
# matrix and the four tick labels set below no longer line up with its cells.
cf_matrix = confusion_matrix(Y_true_test_class, Y_pred_test_class, labels=[0, 1, 2, 3])
ax = plt.subplot()
# Draw on `ax` explicitly and print the counts as integers.
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']); ax.yaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']);

In [None]:
# Extra-trees ensemble (100 trees) on standardized features, all columns kept.
scaler = StandardScaler()
regressor = ExtraTreesRegressor(n_estimators=100)
X_test, Y_test, pipe = eval_model(
    X=X,
    Y=y,
    scaler=scaler,
    regressor=regressor,
    cols_to_drop=[],
)

In [None]:
# Bucket the continuous prices into four classes so the regression errors can
# be read as a confusion matrix: 0 -> <10, 1 -> [10, 30), 2 -> [30, 40), 3 -> >=40.
price_bins = [10, 30, 40]
Y_pred_test_class = np.digitize(pipe.predict(X_test), price_bins)
Y_true_test_class = np.digitize(Y_test, price_bins)
# Pin the label set: otherwise a class missing from the test split shrinks the
# matrix and the four tick labels set below no longer line up with its cells.
cf_matrix = confusion_matrix(Y_true_test_class, Y_pred_test_class, labels=[0, 1, 2, 3])
ax = plt.subplot()
# Draw on `ax` explicitly and print the counts as integers.
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']); ax.yaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']);

In [None]:
# Random forest (100 trees) on standardized features, all columns kept.
scaler = StandardScaler()
regressor = RandomForestRegressor(n_estimators=100)
X_test, Y_test, pipe = eval_model(
    X=X,
    Y=y,
    scaler=scaler,
    regressor=regressor,
    cols_to_drop=[],
)

In [None]:
# Bucket the continuous prices into four classes so the regression errors can
# be read as a confusion matrix: 0 -> <10, 1 -> [10, 30), 2 -> [30, 40), 3 -> >=40.
price_bins = [10, 30, 40]
Y_pred_test_class = np.digitize(pipe.predict(X_test), price_bins)
Y_true_test_class = np.digitize(Y_test, price_bins)
# Pin the label set: otherwise a class missing from the test split shrinks the
# matrix and the four tick labels set below no longer line up with its cells.
cf_matrix = confusion_matrix(Y_true_test_class, Y_pred_test_class, labels=[0, 1, 2, 3])
ax = plt.subplot()
# Draw on `ax` explicitly and print the counts as integers.
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']); ax.yaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']);

In [None]:
# Gradient-boosted trees (XGBoost, 100 estimators, squared-error objective)
# on standardized features; no columns are excluded.
scaler = StandardScaler()
regressor = XGBRegressor(objective='reg:squarederror', n_estimators=100)
X_test, Y_test, pipe = eval_model(
    X=X,
    Y=y,
    scaler=scaler,
    regressor=regressor,
)

In [None]:
# Bucket the continuous prices into four classes so the regression errors can
# be read as a confusion matrix: 0 -> <10, 1 -> [10, 30), 2 -> [30, 40), 3 -> >=40.
price_bins = [10, 30, 40]
Y_pred_test_class = np.digitize(pipe.predict(X_test), price_bins)
Y_true_test_class = np.digitize(Y_test, price_bins)
# Pin the label set: otherwise a class missing from the test split shrinks the
# matrix and the four tick labels set below no longer line up with its cells.
cf_matrix = confusion_matrix(Y_true_test_class, Y_pred_test_class, labels=[0, 1, 2, 3])
ax = plt.subplot()
# Draw on `ax` explicitly and print the counts as integers.
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']); ax.yaxis.set_ticklabels(['<10', '10-30', '30-40', '>40']);