In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
#Importing our dataset
dataset = pd.read_csv("winequality-red.csv", sep = ';')
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
#checking for any null values in the dataset
dataset[dataset.isnull().any(axis=1)].head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality


In [6]:
#All attributes
print(dataset.columns)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [7]:
#all target class
dataset.quality.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [8]:
#Splitting dataset to create features and labels
x = dataset.drop(columns=['quality'])
x.head()

y = dataset['quality']
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [17]:
# x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state = 0)

Linear Regression

In [9]:
#Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

param_lin = {
    'fit_intercept': [True,False],
    'normalize': [True,False]
            }

grid = GridSearchCV(LinearRegression(), param_lin, cv = 5)
grid.fit(x,y)
grid.best_params_                   #result {'fit_intercept': False, 'normalize': True}

reg_model = grid.best_estimator_    #LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=True)

print("R2 score Linear regression: ",cross_val_score(reg_model, x, y, cv = 5 , scoring='r2').mean())

R2 score Linear regression:  0.2909827920687206


Polynomial Regression

In [37]:
from sklearn.preprocessing import PolynomialFeatures
print(x.shape,y.shape)
poly = PolynomialFeatures(degree=3)            #increasing the degree of existing features

x_poly = poly.fit_transform(x)

print(x_poly.shape)                            #transformed features

poly_reg = LinearRegression()
poly_reg.fit(x_poly,y)

print("R2 score for polynomial regression: ",cross_val_score(poly_reg, x, y, cv = 5 , scoring='r2').mean())

(1599, 11) (1599,)
(1599, 364)
R2 score for polynomial regression:  0.29004162884219475


SVM regression

In [22]:
from sklearn.svm import SVR

param_svr ={
    'kernel': ['linear', 'rbf'],
    'gamma': ['auto'],
    'C': np.arange(0.5,1.5,0.25)
    }

grid_svr = GridSearchCV(SVR(), param_svr, cv = 5)
grid_svr.fit(x,y)
grid_svr.best_params_               #{'C': 1.25, 'gamma': 'auto', 'kernel': 'linear'}

{'C': 1.25, 'gamma': 'auto', 'kernel': 'linear'}

In [98]:
svr_model = grid_svr.best_estimator_    #SVR(C=1.25, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
                                        #     kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
    
print("R2 score SVR: ",cross_val_score(svr_model, x, y, cv = 5 , scoring='r2').mean())

R2 score SVR:  0.28388010696878113


Decision Tree Regression


In [92]:
from sklearn.tree import DecisionTreeRegressor

param_tree = {
    'criterion': ['mse','friedman_mse','mae'],
    'max_depth': np.arange(2,5),
    'max_features' : [ 'auto', 'sqrt', 'log2'],
    'min_samples_split' : np.arange(3,10)
     }

grid_tree = GridSearchCV(DecisionTreeRegressor(), param_tree, cv = 5)
grid_tree.fit(x,y)
grid_tree.best_params_



{'criterion': 'mse',
 'max_depth': 4,
 'max_features': 'auto',
 'min_samples_split': 8,
 'splitter': 'best'}

In [97]:
tree_reg_model = grid_tree.best_estimator_
tree_reg_model                #DecisionTreeRegressor(criterion='mse', max_depth=3, max_features='auto',
                                                      # max_leaf_nodes=None, min_impurity_decrease=0.0,
                                                      # min_impurity_split=None, min_samples_leaf=1,
                                                      # min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                      # presort=False, random_state=None, splitter='best')

print("R2 score tree regression: ",cross_val_score(tree_reg_model, x, y, cv = 5 , scoring='r2').mean())

R2 score tree regression:  0.21013860287906638
