# SVM - Regression applied to wines

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import preprocessing

### Import Wine dataset and exploratory analysis

In [None]:
# Load the white-wine quality table; the file uses ';' as its field separator.
data = pd.read_csv(
    "../input/whitewine/winequalitywhite.csv",
    sep=";",
)
data.head()

In [None]:
# Show the last rows of the loaded table.
data.tail()

### Check for NAs in the data set

In [None]:
# Report the number of rows, then the count of missing values per column.
n_rows = len(data.index)
print("Numero de registros:" + str(n_rows))

# One vectorised pass gives the missing-value count of every column.
missing_per_column = data.isnull().sum()
for name, n_missing in missing_per_column.items():
    print(name + "-NAs:" + str(n_missing))

### Data types

In [None]:
# Print the dtype of every column in the table.
print(data.dtypes)

### Correlations

In [None]:
# Print a heading (Spanish for "Correlations in the dataset"), then display
# the pairwise correlation matrix of the columns.
print("Correlaciones en el dataset:")
data.corr()

In [None]:
# Draw the correlation matrix as an image with matplotlib's matshow.
plt.matshow(data.corr())

### Normalize data

In [None]:
# Min-max scale every column of the table (predictors and target alike) and
# wrap the result back into a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split further down, so test rows influence the scaling -- consider fitting
# on the training split only.
min_max_scaler = preprocessing.MinMaxScaler()
x = data.values  # plain numpy array of the raw values
x_scaled = min_max_scaler.fit_transform(x)
data_n = pd.DataFrame(
    x_scaled,
    columns=data.columns.values,
)

In [None]:
# Preview the first rows of the rescaled table.
data_n.head()

### Create train and test datasets

In [None]:
# 'alcohol' is the regression target; every other column is a predictor.
Y = ['alcohol']
data_vars = data.columns.values.tolist()
X = [col for col in data_vars if col not in Y]

# Hold out 30% of the scaled rows for testing.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_n[X],
    data_n[Y],
    test_size=0.30,
)

### Search for the best parameters for SVM

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Candidate hyper-parameters: an RBF kernel searched over gamma and C, and a
# linear kernel searched over C only.
parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5], 'C': [1, 10, 100, 1000]},
    {'kernel': ["linear"], 'C': [1, 10, 100, 1000]},
]

# Inputs for the search: the predictor columns and the target as a 1-D array.
grid_features = X_train[X]
grid_target = Y_train[Y].values.ravel()

# 5-fold cross-validated search over both grids.
clf = GridSearchCV(svm.SVR(), param_grid=parameters, cv=5)
clf.fit(grid_features, grid_target)

In [None]:
# Show the parameter combination selected by the grid search.
clf.best_params_

### Create SVM for Regression

In [None]:
# Build the RBF-kernel support vector regressor used for the final model.
# NOTE(review): C and gamma are hard-coded here rather than read from
# clf.best_params_ -- confirm they match the grid-search result.
svr_rbf = SVR(kernel="rbf",C=100, gamma=0.2)

In [None]:
# Fit the regressor on the training split; the single-column target frame is
# flattened to a 1-D array first.
svr_rbf.fit(X_train,Y_train.values.ravel())

In [None]:
# Predict the (scaled) target for the held-out test rows.
Y_predict = svr_rbf.predict(X_test)

In [None]:
# Print the model's score on the test split, labelled as R-square.
print("R-square:",svr_rbf.score(X_test,Y_test))

### Compare prediction to real data

In [None]:
# Put predicted and actual (scaled) alcohol values side by side, row for row.
data_prediction = pd.DataFrame(
    {
        'alcohol_prediction': Y_predict,
        'alcohol_real_value': Y_test.values.ravel(),
    }
)

In [None]:
# Print the table's dimensions, then display its first 20 rows.
print(data_prediction.shape)
data_prediction.head(20)

### Get MSE

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Mean squared error between actual and predicted values; both columns are
# still on the normalized scale at this point.
mean_squared_error(data_prediction['alcohol_real_value'] , data_prediction['alcohol_prediction'])

### Denormalize data

In [None]:
# Dimensions of the original, unscaled table.
data.shape

In [None]:
# Smallest alcohol value in the original, unscaled data; used below to undo
# the scaling.
min_alcohol = data['alcohol'].min()
min_alcohol

In [None]:
# Largest alcohol value in the original, unscaled data; used below to undo
# the scaling.
max_alcohol = data['alcohol'].max()
max_alcohol

In [None]:
# Undo the min-max scaling of the alcohol column:
#     original = scaled * (max - min) + min
# The multiplication and the offset must be applied in a single expression.
# The earlier two-step version re-read the scaled column in its second step,
# which overwrote the product and left the values as merely "scaled + min".
alcohol_range = max_alcohol - min_alcohol
data_prediction['d_alcohol_prediction'] = (
    data_prediction['alcohol_prediction'] * alcohol_range + min_alcohol
)
data_prediction['d_alcohol_real_value'] = (
    data_prediction['alcohol_real_value'] * alcohol_range + min_alcohol
)

In [None]:
# Preview the first rows of the prediction table.
data_prediction.head()