In [1]:
from scipy import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats.stats import pearsonr

In [2]:
# Read the MATLAB file 'wine.mat' with scipy.io.loadmat; every later cell
# pulls its arrays out of this `data` object by key.
data=io.loadmat('wine.mat')

In [3]:
# Echo the loaded object to see which entries the file provides (the output
# lists MATLAB header metadata alongside the 'data', 'labels' and 'testdata' arrays).
data

{'__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Tue Jan 16 17:20:24 2018',
 '__version__': '1.0',
 'data': array([[ 1.        , -0.47687637, -0.47325129, ...,  0.69463614,
         -0.70381277, -0.58602832],
        [ 1.        ,  0.75412677, -0.22873765, ..., -1.12511895,
         -1.38699752,  1.26371301],
        [ 1.        , -1.40012873,  1.54398622, ...,  0.75738632,
          0.25264589, -0.67010747],
        ..., 
        [ 1.        , -0.86156485, -0.35099447, ..., -0.18386632,
         -0.9087682 , -1.17458238],
        [ 1.        , -1.47706643, -0.59550811, ...,  2.32614071,
         -1.04540515,  0.4229215 ],
        [ 1.        , -0.32300098, -0.5343797 , ..., -1.06236878,
         -1.04540515, -1.25866153]]),
 'labels': array([[6],
        [6],
        [6],
        ..., 
        [6],
        [8],
        [5]], dtype=uint8),
 'testdata': array([[ 1.        ,  3.37000845,  0.84100951, ..., -0.93686843,
          0.79919369, -0.389843

In [4]:
# Wrap the 'data' array in a DataFrame, naming its twelve columns: the
# pre-built 'constant' column followed by the eleven wine measurements.
df = pd.DataFrame(
    data=data['data'],
    columns=[
        'constant',
        'fixed acidity',
        'volatile acidity',
        'citric acid',
        'residual sugar',
        'chlorides',
        'free sulfur dioxide',
        'total sulfur dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ],
)

In [5]:
# Number of rows in the training frame.
df.shape[0]

3249

In [6]:
# Attach the file's 'labels' array as the 'target' column.
# This mutates `df` in place, so cells that displayed `df` earlier are stale.
df['target']=data['labels']

In [7]:
# Preview the first rows to confirm the column names and the new 'target' column.
df.head()

Unnamed: 0,constant,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
0,1.0,-0.476876,-0.473251,-0.46939,1.326781,-0.323621,0.814313,1.0898,0.751055,0.694636,-0.703813,-0.586028,6
1,1.0,0.754127,-0.228738,0.878453,1.497674,-0.381332,1.209038,0.667122,-0.057235,-1.125119,-1.386998,1.263713,6
2,1.0,-1.400129,1.543986,-1.746294,0.344146,-0.323621,0.306809,0.543841,-0.097819,0.757386,0.252646,-0.670107,6
3,1.0,2.523694,-0.473251,1.09127,-0.745296,0.628617,-1.384871,-1.62238,0.984411,-0.246616,3.463614,0.338842,7
4,1.0,-0.092188,-1.023407,0.452818,0.429593,-0.121631,1.03987,1.160246,0.375656,0.820136,-0.088946,-0.754187,6


In [8]:
import statsmodels.api as sm

  from pandas.core import datetools


In [9]:
# Design matrix: the 'constant' column plus the eleven feature columns,
# in the same order the frame was built with.
X = df[[
    'constant',
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]]

# Response vector for the regression.
y = df['target']

In [10]:
# NOTE(review): X already contains the 'constant' column, and the X.head()
# output of the following cell shows no extra 'const' column after this call,
# so it appears to leave X unchanged here (statsmodels skips adding a constant
# when one is already present by default — confirm against the installed version).
X = sm.add_constant(X)

In [11]:
# Preview the design matrix to check which columns will enter the regression.
X.head()

Unnamed: 0,constant,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.0,-0.476876,-0.473251,-0.46939,1.326781,-0.323621,0.814313,1.0898,0.751055,0.694636,-0.703813,-0.586028
1,1.0,0.754127,-0.228738,0.878453,1.497674,-0.381332,1.209038,0.667122,-0.057235,-1.125119,-1.386998,1.263713
2,1.0,-1.400129,1.543986,-1.746294,0.344146,-0.323621,0.306809,0.543841,-0.097819,0.757386,0.252646,-0.670107
3,1.0,2.523694,-0.473251,1.09127,-0.745296,0.628617,-1.384871,-1.62238,0.984411,-0.246616,3.463614,0.338842
4,1.0,-0.092188,-1.023407,0.452818,0.429593,-0.121631,1.03987,1.160246,0.375656,0.820136,-0.088946,-0.754187


In [12]:
# Set up an ordinary-least-squares model of y on all columns of X (not fitted yet).
est=sm.OLS(y,X)

In [13]:
# Fit the model. Note that `est` is rebound here: from this cell on it refers
# to the object returned by fit(), not to the unfitted model built above.
est=est.fit()

In [14]:
# In-sample predictions of the fitted model.
pred_train = est.predict(X)

# Sum of squared residuals on the training rows. Python's built-in sum is
# kept so the accumulation (and therefore the printed value) is unchanged.
train_residuals = pred_train - y
print("Squared loss ERM of the model :", sum(train_residuals ** 2), sep="")

Squared loss ERM of the model :1761.41595306


In [15]:
# Build the test frame from the file's 'testdata' array, using the same column
# names (and order) as the training frame.
testdata = pd.DataFrame(
    data=data['testdata'],
    columns=[
        'constant',
        'fixed acidity',
        'volatile acidity',
        'citric acid',
        'residual sugar',
        'chlorides',
        'free sulfur dioxide',
        'total sulfur dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ],
)

# Snapshot of the column, compared against the frame again in a later cell.
# .copy() makes the snapshot independent of `testdata`: a plain column
# selection can share memory with the frame, in which case an in-place change
# to `testdata` would show up on both sides of that comparison and go unnoticed.
dup_testdata_sulphur = testdata['total sulfur dioxide'].copy()

In [16]:
# Attach the file's 'testlabels' array as the 'target' column of the test frame
# (in-place mutation of `testdata`).
testdata['target']=data['testlabels']

In [17]:
# Test-set regressors, selected in the same column order used to build X.
datapred = testdata[[
    'constant',
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]]

In [18]:
# Apply the fitted model currently bound to `est` to the test-set regressors.
predictions=est.predict(datapred)

In [19]:
# Sum of squared residuals of `predictions` against the test targets.
# Python's built-in sum is kept so the printed value is unchanged.
test_residuals = predictions - testdata['target']
print("Squared loss ERM of the test data :", sum(test_residuals ** 2), sep="")

Squared loss ERM of the test data :1753.21751754


In [20]:
# Number of coefficients in the fitted model (one per column of the design matrix).
len(est.params)

12

In [21]:
import itertools

In [22]:
# Candidate regressors for subset selection: every measurement column,
# deliberately excluding the pre-built 'constant' column.
features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

In [23]:
# Exhaustive search over every 3-feature subset: fit OLS (with an added
# intercept column) on the training frame and file each fit under its
# training sum of squared errors.
#
# Gotcha: the loop rebinds the notebook-level names `X` and `est`, so after
# this cell they refer to the LAST subset tried, not to the full model.
count = 0  # not read anywhere in this cell; kept so the name stays defined
y = df['target']
models = {}  # training SSE -> fitted results object
combi = {}   # training SSE -> list of the three column names used

for subset in itertools.combinations(features, 3):
    Xlist = list(subset)
    X = sm.add_constant(df[Xlist])
    est = sm.OLS(y, X).fit()
    pred = est.predict(X)
    # Built-in sum is kept as in the original so the float dictionary keys
    # are computed in exactly the same way.
    model_pred = sum((pred - y) ** 2)
    combi[model_pred] = Xlist
    models[model_pred] = est

In [24]:
# The fit stored under the smallest training SSE (iterating a dict yields its keys).
best_model = models[min(combi)]

In [25]:
# Coefficients of the selected 3-feature model (intercept plus one per feature).
best_model.params

const               5.826100
volatile acidity   -0.244743
sulphates           0.108471
alcohol             0.373626
dtype: float64

In [26]:
# Smallest training SSE among all 3-feature fits (min over the dict's keys).
min(combi)

1815.4584496758787

In [27]:
# Column names of the subset that achieved the smallest training SSE.
min_error_params = combi[min(combi)]

In [28]:
# Evaluate the selected 3-feature model on the test frame.
# Same preparation as during the subset search: pick the chosen columns,
# then add the intercept column.
testd = sm.add_constant(testdata[min_error_params])
pred_tuple = best_model.predict(testd)

# Sum of squared residuals on the test rows (built-in sum kept so the printed
# value is unchanged).
print(sum(pow((pred_tuple - testdata['target']), 2)))

# Pairwise Pearson correlations between the columns fed to the model.
# Stored only; this cell does not display it.
correlation = testd.corr(method='pearson')

1807.66010597


In [29]:
# Show which three feature names were selected by the subset search.
print(min_error_params)

['volatile acidity', 'sulphates', 'alcohol']


In [30]:
# Re-read the column from the test frame and count how many rows still equal
# the Series captured right after the frame was built; then show the frame's
# shape so the count can be compared with the number of rows.
current_sulphur = testdata['total sulfur dioxide']
unchanged_mask = current_sulphur == dup_testdata_sulphur
print(unchanged_mask.sum())
print(testdata.shape)

3248
(3248, 13)


In [31]:
# Pairwise correlation matrix of all test-frame columns, including 'target'.
# The 'constant' row/column comes out empty (NaN) in the output below.
testdata.corr()

Unnamed: 0,constant,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
constant,,,,,,,,,,,,,
fixed acidity,,1.0,0.208286,0.338624,-0.111655,0.287892,-0.286422,-0.32745,0.450545,-0.257599,0.296863,-0.101017,-0.068061
volatile acidity,,0.208286,1.0,-0.375563,-0.19645,0.362829,-0.353564,-0.422007,0.259082,0.250111,0.211035,-0.037626,-0.265473
citric acid,,0.338624,-0.375563,1.0,0.146684,0.052269,0.137161,0.195286,0.106016,-0.33521,0.069663,-0.007105,0.102307
residual sugar,,-0.111655,-0.19645,0.146684,1.0,-0.127144,0.385025,0.481914,0.566198,-0.255788,-0.171889,-0.346262,-0.017396
chlorides,,0.287892,0.362829,0.052269,-0.127144,1.0,-0.169628,-0.260881,0.343797,0.012459,0.407115,-0.267461,-0.192667
free sulfur dioxide,,-0.286422,-0.353564,0.137161,0.385025,-0.169628,1.0,0.712698,0.019865,-0.132966,-0.173192,-0.167406,0.06121
total sulfur dioxide,,-0.32745,-0.422007,0.195286,0.481914,-0.260881,0.712698,1.0,0.036419,-0.232116,-0.252296,-0.263111,-0.037652
density,,0.450545,0.259082,0.106016,0.566198,0.343797,0.019865,0.036419,1.0,0.00823,0.262685,-0.679593,-0.298208
pH,,-0.257599,0.250111,-0.33521,-0.255788,0.012459,-0.132966,-0.232116,0.00823,1.0,0.177647,0.117113,0.005045
