In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Read the semicolon-separated wine CSV and preview its first rows.
df = pd.read_csv("./Wine_red.csv", delimiter=';')
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Split the frame into the response column and the remaining predictor columns.
targets = df['quality']
features = df.drop(columns='quality')

In [4]:
# Expand the raw features with a bias column and all pairwise interaction
# terms (degree 2, interaction_only=True means no squared terms).
polys = PolynomialFeatures(2, interaction_only=True)
features_engineered = polys.fit_transform(features)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is its replacement. Fall back to the old method so
# the cell still runs on installations that predate 1.0.
if hasattr(polys, "get_feature_names_out"):
    cols = polys.get_feature_names_out(features.columns)
else:
    cols = polys.get_feature_names(features.columns)

# Wrap the transformed array in a DataFrame so every column carries its name.
features_engineered = pd.DataFrame(features_engineered, columns=cols)
features_engineered.head()

Unnamed: 0,1,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,...,total sulfur dioxide density,total sulfur dioxide pH,total sulfur dioxide sulphates,total sulfur dioxide alcohol,density pH,density sulphates,density alcohol,pH sulphates,pH alcohol,sulphates alcohol
0,1.0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,...,33.9252,119.34,19.04,319.6,3.502278,0.558768,9.37932,1.9656,32.994,5.264
1,1.0,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,...,66.7856,214.4,45.56,656.6,3.18976,0.677824,9.76864,2.176,31.36,6.664
2,1.0,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,...,53.838,176.04,35.1,529.2,3.25022,0.64805,9.7706,2.119,31.948,6.37
3,1.0,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,...,59.88,189.6,34.8,588.0,3.15368,0.57884,9.7804,1.8328,30.968,5.684
4,1.0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,...,33.9252,119.34,19.04,319.6,3.502278,0.558768,9.37932,1.9656,32.994,5.264


In [5]:
# Remove constant columns (standard deviation exactly zero); they carry no
# information and would divide by zero when standardized.
lx = features_engineered.std().eq(0)
drop_cols = features_engineered.columns[lx]
features_engineered = features_engineered.drop(columns=drop_cols)

In [6]:
# z-score every column: centre on the column mean, then scale by the column
# standard deviation.
features_engineered = (
    features_engineered
    .sub(features_engineered.mean(), axis="columns")
    .div(features_engineered.std(), axis="columns")
)

In [7]:
# Preview the first rows after the constant-column drop and standardization.
features_engineered.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,total sulfur dioxide density,total sulfur dioxide pH,total sulfur dioxide sulphates,total sulfur dioxide alcohol,density pH,density sulphates,density alcohol,pH sulphates,pH alcohol,sulphates alcohol
0,-0.528194,0.961576,-1.391037,-0.453077,-0.24363,-0.466047,-0.379014,0.5581,1.28824,-0.579025,...,-0.378041,-0.318888,-0.466169,-0.469886,1.330644,-0.575015,-0.959307,-0.393794,-0.368454,-0.837776
1,-0.298454,1.966827,-1.391037,0.043403,0.223805,0.872365,0.624168,0.028252,-0.719708,0.12891,...,0.624116,0.567921,0.583123,0.535341,-0.727649,0.128665,-0.589072,0.00366,-0.756503,-0.110527
2,-0.298454,1.29666,-1.185699,-0.169374,0.096323,-0.083643,0.228975,0.134222,-0.331073,-0.048074,...,0.229248,0.210063,0.169262,0.155323,-0.32945,-0.047314,-0.587208,-0.104015,-0.616863,-0.263249
3,1.654339,-1.384011,1.483689,-0.453077,-0.264878,0.107558,0.411372,0.664069,-0.978798,-0.461036,...,0.413513,0.336563,0.157392,0.330716,-0.965278,-0.456379,-0.577888,-0.644658,-0.849597,-0.619601
4,-0.528194,0.961576,-1.391037,-0.453077,-0.24363,-0.466047,-0.379014,0.5581,1.28824,-0.579025,...,-0.378041,-0.318888,-0.466169,-0.469886,1.330644,-0.575015,-0.959307,-0.393794,-0.368454,-0.837776


In [8]:
# Linear regression estimator with scikit-learn's default settings.
lr = LinearRegression()

In [9]:
# Cross-validate the regression with the default splitter and scorer, keeping
# the per-fold training scores alongside the test scores.
results = cross_validate(
    estimator=lr,
    X=features_engineered,
    y=targets,
    return_train_score=True,
)



In [14]:
# Average the per-fold training scores.
R2_train = np.mean(results['train_score'])

In [15]:
# Average the per-fold held-out scores.
R2_test = np.mean(results['test_score'])

In [16]:
# Show the mean training score across the CV folds.
# NOTE(review): R2_test is computed above but never displayed here; compare it
# with R2_train before drawing conclusions about overfitting.
R2_train

0.4531523214096939

In [19]:
# Refit on the full dataset (fit returns the estimator itself) and label each
# learned coefficient with its feature name.
coef = pd.Series(
    data=lr.fit(features_engineered, targets).coef_,
    index=features_engineered.columns,
)

In [20]:
# Rank the fitted coefficients from largest to smallest.
coef.sort_values(ascending=False)

total sulfur dioxide                        111.950530
free sulfur dioxide density                  89.493996
sulphates                                    64.908481
chlorides density                            59.382580
citric acid density                          30.382760
residual sugar                               15.652508
fixed acidity density                        14.535287
density pH                                   12.773722
density alcohol                               6.224390
volatile acidity density                      4.301363
fixed acidity pH                              1.659589
total sulfur dioxide pH                       1.634137
pH sulphates                                  1.389410
citric acid alcohol                           1.272608
free sulfur dioxide alcohol                   1.165908
fixed acidity sulphates                       0.695004
volatile acidity alcohol                      0.643912
fixed acidity total sulfur dioxide            0.576402
volatile a

In [21]:
# Look up the fitted coefficient of the plain `alcohol` feature by label.
coef.loc['alcohol']

-5.646287046768286

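#### The model is badly overfitted: the coefficients on the standardized features are implausibly large (e.g. ≈112 for total sulfur dioxide)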