In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview the first rows.
WINE_CSV_PATH = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
wine_df = pd.read_csv(WINE_CSV_PATH)
wine_df.head()

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Shared widget style: let the description label take its natural width.
style = dict(description_width='initial')

In [None]:
# Report the table dimensions; the f-string evaluates the expressions inside the braces.
n_rows, n_cols = wine_df.shape
print(f"Dataset has {n_rows} rows and {n_cols} columns")

Feature Engineering

In [None]:
# Count the missing values in each column (isna is the same check as isnull).

wine_df.isna().sum()

In [None]:
# Print pandas' built-in summary of wine_df (DataFrame.info).
wine_df.info()

In [None]:
# Show the column labels as a plain Python list.
list(wine_df.columns)

In [None]:
# Separate the independent variables (X, a DataFrame) from the dependent variable
# (y, a Series holding the 'quality' column).
# The target is dropped by its exact name. A membership test against ('quality') would be a
# substring check, because parentheses without a trailing comma do not create a tuple.
X = wine_df.drop(columns='quality')
y = wine_df['quality']

In [None]:
# Preview the first five rows of the predictors and of the target.
print("\nIndependent Variables :\n\n", X.head(5))
print("\nDependent Variable (Score):\n\n", y.head(5))

Create Train and Test Sets

In [None]:
#Creating training set and testing set using test_size floater
from sklearn.model_selection import train_test_split
# Interactive control for the hold-out fraction (usually 20-30% of the rows).
# FloatSlider has no `tooltips` trait (that option belongs to ToggleButtons), so the hint is
# kept in this comment rather than passed as an unrecognised keyword. The shared `style`
# dict is applied so the description label is not clipped.
test_size = widgets.FloatSlider(
    min=0.01,
    max=0.6,
    value=0.2,
    description="Test Size :",
    style=style,
)
display(test_size)

In [None]:
# Hold out the fraction currently selected on the slider; the fixed random_state keeps the
# shuffle repeatable for a given slider value.
split_parts = train_test_split(X, y, test_size=test_size.value, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Print the first three rows of the predictors and target for each split.
for heading, features, target in (
    ("Training Set :\n----------------\n", X_train, y_train),
    ("\n\nTest Set :\n----------------\n", X_test, y_test),
):
    print(heading)
    print("X = \n", features[:3])
    print("y = \n", target[:3])

In [None]:
# Report the (rows, columns) shape of each predictor split.
for split_name, split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"Shape of {split_name} set is {split_frame.shape}")

ML Model Application

In [None]:
# Support Vector Regressor

from sklearn.svm import SVR

In [None]:
# Fit one support vector regressor per kernel on the training split, then predict the
# test split with each of them.

# Linear kernel
regressor_lin = SVR(kernel='linear')
regressor_lin.fit(X_train, y_train)
y_pred_lin = regressor_lin.predict(X_test)

# Polynomial kernel of degree 3
regressor_pol = SVR(kernel='poly', degree=3)
regressor_pol.fit(X_train, y_train)
y_pred_pol = regressor_pol.predict(X_test)

# RBF kernel with gamma fixed at 0.1
regressor_rbf = SVR(kernel='rbf', gamma=0.1)
regressor_rbf.fit(X_train, y_train)
y_pred_rbf = regressor_rbf.predict(X_test)

Metrics and evaluating the results

RMSLE (Root Mean Squared Log Error)
The RMSLE metric considers only the relative error between the predicted and the actual value, so the scale of the error is not significant. RMSE, on the other hand, increases in magnitude as the scale of the error increases. In addition, RMSLE incurs a larger penalty for underestimating the actual value than for overestimating it.

In [None]:
def rmlse(y_test, y_predic):
    """Return a score derived from the root mean squared log error.

    The error is sqrt(mean((log10(y_predic + 1) - log10(y_test + 1)) ** 2)) and the
    returned score is ``1 - error``, so higher is better and 1.0 means the
    predictions match the actual values exactly.

    Base-10 logarithms are used, so the error equals the natural-log RMSLE
    divided by ln(10).

    Parameters
    ----------
    y_test : array-like
        Actual target values.
    y_predic : array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    float
        ``1 - error``; NaN if either input contains values for which
        ``log10(value + 1)`` is undefined.
    """
    # Compare strictly by position. Without this, two pandas Series with different
    # indexes are aligned by label during the subtraction, which yields NaN (or pairs
    # up the wrong rows) instead of comparing the i-th prediction with the i-th actual.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_predic, dtype=float)

    log_diff = np.log10(predicted + 1) - np.log10(actual + 1)
    error = np.sqrt(np.mean(np.square(log_diff)))
    return 1 - error

# Print the score of each kernel's predictions against the test targets.
for label, predictions in (
    ("\n----------------------------\nRMLSE_lin Score = ", y_pred_lin),
    ("\n----------------------------\nRMLSE_pol Score = ", y_pred_pol),
    ("\n----------------------------\nRMLSE_rbf Score = ", y_pred_rbf),
):
    print(label, rmlse(y_test, predictions))


# #9 Comparing actual and predicted quality scores for the test set
# print("\nActual vs Predicted Scores \n------------------------------\n")
# error_df = pd.DataFrame({"Actual" : y_test,
#                          "Predicted" : y_pred,
#                          "Abs. Error" : np.abs(y_test - y_pred)})

# error_df

In [None]:
# Inspect the coefficients learned by the linear-kernel SVR fitted above.
regressor_lin.coef_

In [None]:
# Feature importance (coefficients are only exposed for the linear kernel): take the absolute
# value of each coefficient, label it with its column name and draw a horizontal bar chart,
# smallest magnitude first.
# NOTE(review): the features are not standardised before fitting, so these magnitudes also
# reflect each feature's scale - confirm before reading them as importance.
coef_magnitudes = pd.Series(np.abs(regressor_lin.coef_[0]), index=X.columns)
coef_magnitudes.sort_values(ascending=True).plot(kind='barh')