# Bucur Robert - Adrian
# Bogdan Gheorghe - Nicolae
## Grupa 10LF381

# Laborator 9
## Modele de regresie

# Setul de date Boston Housing

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.metrics        import mean_absolute_error, mean_squared_error, median_absolute_error

from sklearn.tree           import DecisionTreeRegressor
from sklearn.linear_model   import LinearRegression
from sklearn.linear_model   import Lasso
from sklearn.ensemble       import RandomForestRegressor
from sklearn.ensemble       import GradientBoostingRegressor

## Definirea listelor hiperparametrilor

In [2]:
parameters_dtr = {'criterion': ['mse', 'friedman_mse', 'mae'], 'max_depth': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
dtr = DecisionTreeRegressor()

parameters_linear = {'normalize': [True, False], 'fit_intercept': [True, False]}
linear = LinearRegression()

parameters_gbr = {'loss': ['ls', 'lad', 'huber', 'quantile'], 'criterion': ['mse', 'friedman_mse', 'mae'], 'random_state': [0, None]}
gbr = GradientBoostingRegressor()

parameters_rfr = {'criterion': ['mse', 'mae'], 'max_depth': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'random_state': [0, None]}
rfr = RandomForestRegressor()

parameters_lasso = {'alpha': [0.0, 1.0, 2.0], 'random_state': [0, None], 'normalize': [True, False]}
lasso = Lasso()

## Incarcarea datelor

In [3]:
housing = pd.read_csv('../data/boston/housing.csv', sep=';')
print('Housing shape:', housing.shape, '\n')

X_housing = housing.iloc[:, :-1]
y_housing = housing.iloc[:, -1]

housing.head()

Housing shape: (506, 14) 



Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
housing_df = pd.DataFrame()

## Primul Model DecisionTreeRegressor

In [5]:
grid_search_dtr_housing = GridSearchCV(estimator=dtr, param_grid=parameters_dtr, scoring='neg_mean_squared_error', cv=3)
grid_scores_dtr_housing = cross_validate(estimator=grid_search_dtr_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_grid_housing = pd.DataFrame(grid_scores_dtr_housing)
df_grid_housing = df_grid_housing.mean()
df_grid_housing['Model_name'] = 'DecisionTreeRegressor'
df_grid_housing['Search_strategy'] = 'GridSearch'

df_grid_housing = pd.Series(df_grid_housing)
housing_df = housing_df.append(df_grid_housing, ignore_index=True)

In [6]:
randomized_search_dtr_housing = RandomizedSearchCV(estimator=dtr, param_distributions=parameters_dtr, random_state=0, scoring='neg_mean_squared_error', n_iter=5, cv=3)
randomized_scores_dtr_housing = cross_validate(estimator=randomized_search_dtr_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_random_housing = pd.DataFrame(randomized_scores_dtr_housing)
df_random_housing = df_random_housing.mean()
df_random_housing['Model_name'] = 'DecisionTreeRegressor'
df_random_housing['Search_strategy'] = 'RandomizedSearch'

df_random_housing = pd.Series(df_random_housing)
housing_df = housing_df.append(df_random_housing, ignore_index=True)

## Al doilea model LinearRegression

In [7]:
grid_search_linear_housing = GridSearchCV(estimator=linear, param_grid=parameters_linear, scoring='neg_mean_squared_error', cv=3)
grid_scores_linear_housing = cross_validate(estimator=grid_search_linear_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_grid_housing = pd.DataFrame(grid_scores_linear_housing)
df_grid_housing = df_grid_housing.mean()
df_grid_housing['Model_name'] = 'LinearRegression'
df_grid_housing['Search_strategy'] = 'GridSearch'

df_grid_housing = pd.Series(df_grid_housing)
housing_df = housing_df.append(df_grid_housing, ignore_index=True)

In [8]:
randomized_search_linear_housing = RandomizedSearchCV(estimator=linear, param_distributions=parameters_linear, random_state=0, scoring='neg_mean_squared_error', n_iter=5, cv=3)
randomized_scores_linear_housing = cross_validate(estimator=randomized_search_linear_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_random_housing = pd.DataFrame(randomized_scores_linear_housing)
df_random_housing = df_random_housing.mean()
df_random_housing['Model_name'] = 'LinearRegression'
df_random_housing['Search_strategy'] = 'RandomizedSearch'

df_random_housing = pd.Series(df_random_housing)
housing_df = housing_df.append(df_random_housing, ignore_index=True)

## Al treilea model Lasso

In [9]:
grid_search_lasso_housing = GridSearchCV(estimator=lasso, param_grid=parameters_lasso, scoring='neg_mean_squared_error', cv=3)
grid_scores_lasso_housing = cross_validate(estimator=grid_search_lasso_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_grid_housing = pd.DataFrame(grid_scores_lasso_housing)
df_grid_housing = df_grid_housing.mean()
df_grid_housing['Model_name'] = 'Lasso'
df_grid_housing['Search_strategy'] = 'GridSearch'

df_grid_housing = pd.Series(df_grid_housing)
housing_df = housing_df.append(df_grid_housing, ignore_index=True)

In [10]:
randomized_search_lasso_housing = RandomizedSearchCV(estimator=lasso, param_distributions=parameters_lasso, random_state=0, scoring='neg_mean_squared_error', n_iter=5, cv=3)
randomized_scores_lasso_housing = cross_validate(estimator=randomized_search_lasso_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_random_housing = pd.DataFrame(randomized_scores_lasso_housing)
df_random_housing = df_random_housing.mean()
df_random_housing['Model_name'] = 'Lasso'
df_random_housing['Search_strategy'] = 'RandomizedSearch'

df_random_housing = pd.Series(df_random_housing)
housing_df = housing_df.append(df_random_housing, ignore_index=True)

## Al patrulea model RandomForestRegressor

In [11]:
grid_search_rfr_housing = GridSearchCV(estimator=rfr, param_grid=parameters_rfr, scoring='neg_mean_squared_error', cv=3)
grid_scores_rfr_housing = cross_validate(estimator=grid_search_rfr_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_grid_housing = pd.DataFrame(grid_scores_rfr_housing)
df_grid_housing = df_grid_housing.mean()
df_grid_housing['Model_name'] = 'RandomForestRegressor'
df_grid_housing['Search_strategy'] = 'GridSearch'

df_grid_housing = pd.Series(df_grid_housing)
housing_df = housing_df.append(df_grid_housing, ignore_index=True)

In [12]:
randomized_search_rfr_housing = RandomizedSearchCV(estimator=rfr, param_distributions=parameters_rfr, random_state=0, scoring='neg_mean_squared_error', n_iter=5, cv=3)
randomized_scores_rfr_housing = cross_validate(estimator=randomized_search_rfr_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_random_housing = pd.DataFrame(randomized_scores_rfr_housing)
df_random_housing = df_random_housing.mean()
df_random_housing['Model_name'] = 'RandomForestRegressor'
df_random_housing['Search_strategy'] = 'RandomizedSearch'

df_random_housing = pd.Series(df_random_housing)
housing_df = housing_df.append(df_random_housing, ignore_index=True)

## Al cincilea model GradientBoostingRegressor

In [13]:
grid_search_gbr_housing = GridSearchCV(estimator=gbr, param_grid=parameters_gbr, scoring='neg_mean_squared_error', cv=3)
grid_scores_gbr_housing = cross_validate(estimator=grid_search_gbr_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_grid_housing = pd.DataFrame(grid_scores_gbr_housing)
df_grid_housing = df_grid_housing.mean()
df_grid_housing['Model_name'] = 'GradientBoostingRegressor'
df_grid_housing['Search_strategy'] = 'GridSearch'

df_grid_housing = pd.Series(df_grid_housing)
housing_df = housing_df.append(df_grid_housing, ignore_index=True)

In [14]:
randomized_search_gbr_housing = RandomizedSearchCV(estimator=gbr, param_distributions=parameters_gbr, random_state=0, scoring='neg_mean_squared_error', n_iter=5, cv=3)
randomized_scores_gbr_housing = cross_validate(estimator=randomized_search_gbr_housing, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'), X=X_housing, y=y_housing, cv=5, return_train_score=True)

df_random_housing = pd.DataFrame(randomized_scores_gbr_housing)
df_random_housing = df_random_housing.mean()
df_random_housing['Model_name'] = 'GradientBoostingRegressor'
df_random_housing['Search_strategy'] = 'RandomizedSearch'

df_random_housing = pd.Series(df_random_housing)
housing_df = housing_df.append(df_random_housing, ignore_index=True)

## Raportul negativ

In [15]:
housing_df

Unnamed: 0,Model_name,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error
0,DecisionTreeRegressor,GridSearch,0.736851,0.00281,-6.085792,-67.937407,-4.512992,-5.038441,-43.537211,-4.030576
1,DecisionTreeRegressor,RandomizedSearch,0.132935,0.003336,-6.97536,-94.536722,-5.03,-6.478858,-85.085462,-4.29
2,LinearRegression,GridSearch,0.156283,0.003691,-4.047265,-35.919581,-2.950764,-3.295527,-22.761287,-2.360447
3,LinearRegression,RandomizedSearch,0.124743,0.002645,-4.047265,-35.919581,-2.950764,-3.295527,-22.761287,-2.360447
4,Lasso,GridSearch,0.345759,0.002279,-4.450778,-40.638336,-3.252412,-3.761285,-28.770287,-2.754841
5,Lasso,RandomizedSearch,0.164918,0.00237,-4.450778,-40.638336,-3.252412,-3.761285,-28.770287,-2.754841
6,RandomForestRegressor,GridSearch,33.270992,0.019217,-5.106941,-51.97134,-3.891323,-4.32091,-34.816084,-3.25273
7,RandomForestRegressor,RandomizedSearch,4.179109,0.013105,-6.955332,-94.09942,-4.9888,-6.479089,-85.061333,-4.288
8,GradientBoostingRegressor,GridSearch,45.862256,0.003185,-3.116205,-20.992965,-2.209966,-1.21698,-4.967903,-0.800447
9,GradientBoostingRegressor,RandomizedSearch,8.876134,0.002486,-3.032793,-21.173227,-2.110591,-1.378999,-8.13059,-0.695231


## Raportul pozitiv

In [16]:
def highlight_max(df):
    is_max = df == df.max()
    return ['background-color: red' if value else '' for value in is_max]

In [17]:
def highlight_min(df):
    is_min = df == df.min()
    return ['background-color: green' if value else '' for value in is_min]

In [18]:
housing_df_abs = pd.concat([housing_df.iloc[:,:2], housing_df.iloc[:,2:].abs()], axis=1, sort=False)
housing_df_abs.style.highlight_max(color='red').highlight_min(color='green')

Unnamed: 0,Model_name,Search_strategy,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error
0,DecisionTreeRegressor,GridSearch,0.736851,0.00281,6.085792,67.937407,4.512992,5.038441,43.537211,4.030576
1,DecisionTreeRegressor,RandomizedSearch,0.132935,0.003336,6.97536,94.536722,5.03,6.478858,85.085462,4.29
2,LinearRegression,GridSearch,0.156283,0.003691,4.047265,35.919581,2.950764,3.295527,22.761287,2.360447
3,LinearRegression,RandomizedSearch,0.124743,0.002645,4.047265,35.919581,2.950764,3.295527,22.761287,2.360447
4,Lasso,GridSearch,0.345759,0.002279,4.450778,40.638336,3.252412,3.761285,28.770287,2.754841
5,Lasso,RandomizedSearch,0.164918,0.00237,4.450778,40.638336,3.252412,3.761285,28.770287,2.754841
6,RandomForestRegressor,GridSearch,33.270992,0.019217,5.106941,51.97134,3.891323,4.32091,34.816084,3.25273
7,RandomForestRegressor,RandomizedSearch,4.179109,0.013105,6.955332,94.09942,4.9888,6.479089,85.061333,4.288
8,GradientBoostingRegressor,GridSearch,45.862256,0.003185,3.116205,20.992965,2.209966,1.21698,4.967903,0.800447
9,GradientBoostingRegressor,RandomizedSearch,8.876134,0.002486,3.032793,21.173227,2.110591,1.378999,8.13059,0.695231


## Randarea fisierului html

In [20]:
housing_html = housing_df.style.render()
housing_abs_html = housing_df_abs.style.highlight_max(color='red').highlight_min(color='green').render()

text_file = open('../html_files/housing.html', 'w')
text_file.write('<br>')
text_file.write('<h1><center> Dataset: Boston Housing <center></h1>')
text_file.write('<br><br>')
text_file.write(housing_html)
text_file.write('<br><br><br>')
text_file.write(housing_abs_html)
text_file.close()