## Assignment 4: Linear Regression Task

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler

In [2]:
# Base URL of the course data directory; dataset file names are appended to it.
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [3]:
# Load the white-wine quality table; the file is semicolon-delimited.
data = pd.read_csv(
    DATA_PATH + "winequality-white.csv",
    sep=";",
)

In [4]:
# (rows, columns) of the loaded frame.
data.shape

(4898, 12)

In [5]:
# Repeats the shape check from the previous cell; the recorded value is noted inline.
data.shape  # (4898, 12)

(4898, 12)

In [6]:
# Print per-column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [7]:
# Features are every column except the target; the target is the "quality" score.
X = data.drop(columns="quality")
y = data["quality"].values

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [8]:
# Learn the scaling statistics on the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Ordinary least squares on the standardized training features.
# fit() returns the estimator itself, so the name is bound to the fitted model.
linreg = LinearRegression().fit(X_train_scaled, y_train)
linreg

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [10]:
# Fitted weights, in the column order of X (the model was fit on scaled features).
linreg.coef_

array([ 9.78219223e-02, -1.92259947e-01, -1.83224449e-04,  5.38164096e-01,
        8.12724353e-03,  4.21804406e-02,  1.43040227e-02, -6.65720472e-01,
        1.50036006e-01,  6.20533605e-02,  1.29533447e-01])

In [16]:
# Train/test mean squared error of the linear regression.
# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the printed
# values are unchanged, but the canonical order keeps the call correct if the
# metric is ever swapped for an asymmetric one.
print(
    "MSE Training:", mean_squared_error(y_train, linreg.predict(X_train_scaled)), "\n",
    "MSE Test:", mean_squared_error(y_test, linreg.predict(X_test_scaled)),
)

MSE Training: 0.5580606489803572 
 MSE Test: 0.5842473102404544


In [33]:
# Rank features by the magnitude of their linear-regression weight
# (largest absolute coefficient first).
linreg_coef = (
    pd.DataFrame({"coef": linreg.coef_}, index=X.columns)
    .assign(abs_coef=lambda frame: frame["coef"].abs())
    .sort_values(by="abs_coef", ascending=False)
)

linreg_coef

Unnamed: 0,coef,abs_coef
density,-0.66572,0.66572
residual sugar,0.538164,0.538164
volatile acidity,-0.19226,0.19226
pH,0.150036,0.150036
alcohol,0.129533,0.129533
fixed acidity,0.097822,0.097822
sulphates,0.062053,0.062053
free sulfur dioxide,0.04218,0.04218
total sulfur dioxide,0.014304,0.014304
chlorides,0.008127,0.008127


In [44]:
# Lasso (L1-penalized regression) with a fixed regularization strength.
# fit() returns the estimator itself, so the name is bound to the fitted model.
lasso1 = Lasso(alpha=0.01, random_state=17).fit(X_train_scaled, y_train)
lasso1

0,1,2
,alpha,0.01
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,17
,selection,'cyclic'


In [45]:
# Train/test mean squared error of the fixed-alpha Lasso.
# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the printed
# values are unchanged, but the canonical order keeps the call correct if the
# metric is ever swapped for an asymmetric one.
print(
    "MSE Training:", mean_squared_error(y_train, lasso1.predict(X_train_scaled)), "\n",
    "MSE Test:", mean_squared_error(y_test, lasso1.predict(X_test_scaled)),
)

MSE Training: 0.5637869195669825 
 MSE Test: 0.5736627127525901


In [46]:
# Rank features by the magnitude of their Lasso weight (largest first);
# coefficients shrunk to zero end up at the bottom of the table.
lasso1_coef = (
    pd.DataFrame({"coefs": lasso1.coef_}, index=X.columns)
    .assign(coefs_abs=lambda frame: frame["coefs"].abs())
    .sort_values(by="coefs_abs", ascending=False)
)

lasso1_coef

Unnamed: 0,coefs,coefs_abs
alcohol,0.322425,0.322425
residual sugar,0.256363,0.256363
density,-0.235492,0.235492
volatile acidity,-0.188479,0.188479
pH,0.067277,0.067277
free sulfur dioxide,0.043088,0.043088
sulphates,0.029722,0.029722
chlorides,-0.002747,0.002747
fixed acidity,-0.0,0.0
citric acid,-0.0,0.0


In [47]:
# Candidate regularization strengths: 200 log-spaced values from 1e-6 to 1e2.
alphas = np.logspace(start=-6, stop=2, num=200)

# Select alpha by 5-fold cross-validation on the training split.
# fit() returns the estimator itself, so the name is bound to the fitted model.
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=17).fit(X_train_scaled, y_train)
lasso_cv

0,1,2
,eps,0.001
,n_alphas,'deprecated'
,alphas,array([1.0000...00000000e+02])
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,copy_X,True
,cv,5
,verbose,False


In [48]:
# Regularization strength chosen by the cross-validated search over `alphas`.
lasso_cv.alpha_

np.float64(0.0002833096101839324)

In [49]:
# Train/test mean squared error of the cross-validated Lasso.
# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the printed
# values are unchanged, but the canonical order keeps the call correct if the
# metric is ever swapped for an asymmetric one.
print(
    "MSE Training:", mean_squared_error(y_train, lasso_cv.predict(X_train_scaled)), "\n",
    "MSE Test:", mean_squared_error(y_test, lasso_cv.predict(X_test_scaled)),
)

MSE Training: 0.558070014187378 
 MSE Test: 0.5832976077860635


In [54]:
# Absolute Lasso weights per feature, smallest first, so the least
# influential features appear at the top of the table.
lasso_cv_coef = pd.DataFrame(
    {"coefs": np.abs(lasso_cv.coef_)},
    index=X.columns,
).sort_values(by="coefs")  # ascending order is the default

lasso_cv_coef

Unnamed: 0,coefs
citric acid,0.0
chlorides,0.006933
total sulfur dioxide,0.012969
free sulfur dioxide,0.042698
sulphates,0.060939
fixed acidity,0.093295
alcohol,0.137115
pH,0.146549
volatile acidity,0.192049
residual sugar,0.526883


In [55]:
# Random forest with default hyper-parameters; the seed fixes the bootstrap
# samples so results are reproducible.
# fit() returns the estimator itself, so the name is bound to the fitted model.
forest = RandomForestRegressor(random_state=17).fit(X_train_scaled, y_train)
forest

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [56]:
# Train/test mean squared error of the default random forest.
# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the printed
# values are unchanged, but the canonical order keeps the call correct if the
# metric is ever swapped for an asymmetric one.
print(
    "MSE Training:", mean_squared_error(y_train, forest.predict(X_train_scaled)), "\n",
    "MSE Test:", mean_squared_error(y_test, forest.predict(X_test_scaled)),
)

MSE Training: 0.05261155192532089 
 MSE Test: 0.37163775510204083


In [59]:
# Mean cross-validated MSE of the default forest on the training split.
# The scorer reports negated MSE, so the absolute value recovers the MSE;
# cv is left at cross_val_score's default.
np.abs(
    cross_val_score(
        forest,
        X_train_scaled,
        y_train,
        scoring="neg_mean_squared_error",
    )
).mean()

np.float64(0.4142003732204039)

In [None]:
# Search grid: tree depth 10..24 and number of features tried per split 6..11.
forest_params = {
    "max_depth": [*range(10, 25)],
    "max_features": [*range(6, 12)],
}

# Exhaustive 5-fold search scored by negated MSE (higher is better).
# NOTE(review): both the forest and the search request n_jobs=-1; confirm the
# nested parallelism does not oversubscribe CPUs on the target machine.
locally_best_forest = GridSearchCV(
    estimator=RandomForestRegressor(random_state=17, n_jobs=-1),
    param_grid=forest_params,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    verbose=True,
)

locally_best_forest.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 90 candidates, totalling 450 fits


0,1,2
,estimator,RandomForestR...ndom_state=17)
,param_grid,"{'max_depth': [10, 11, ...], 'max_features': [6, 7, ...]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,21
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [63]:
# Winning hyper-parameters and their mean CV score; the score is negated MSE,
# so values closer to zero are better.
locally_best_forest.best_params_ , locally_best_forest.best_score_

({'max_depth': 21, 'max_features': 6}, np.float64(-0.39773288191505934))

In [64]:
# Mean cross-validated MSE of the tuned forest on the training split.
# The scorer reports negated MSE, so the absolute value recovers the MSE;
# cv is left at cross_val_score's default.
np.abs(
    cross_val_score(
        locally_best_forest.best_estimator_,
        X_train_scaled,
        y_train,
        scoring="neg_mean_squared_error",
    )
).mean()

np.float64(0.39773288191505934)

In [66]:
# Held-out test MSE of the tuned forest (predict() delegates to the refit best estimator).
# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the printed
# value is unchanged, but the canonical order keeps the call correct if the
# metric is ever swapped for an asymmetric one.
print(mean_squared_error(y_test, locally_best_forest.predict(X_test_scaled)))

0.36572455603132475


In [69]:
# Number of input features the search was fitted on.
locally_best_forest.n_features_in_

11