In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Project Description:** I am using a dataset from the UCI Machine Learning Repository to examine wine attributes vs. corresponding quality ratings from sensory data.
I plan to test the following hypotheses using linear regression:
1. Larger amounts of residual sugar lead to lower quality ratings
2. Larger amounts of sulphates lead to higher quality ratings
3. Volatile acidity is associated with higher quality rating 
4. (a) For red wines, a higher alcohol content is associated with higher quality ratings
4. (b) For white wines, a higher alcohol content is associated with lower quality ratings
5. Larger amounts of citric acid lead to higher quality ratings

**Why this is useful from a UX stance:** By identifying which attributes lead to higher or lower quality ratings, the results could be used to help restaurants more easily predict which wines customers will enjoy. Additionally, the results could be integrated into a website or app to help people find wines within their budget, favorite variety of wine, etc. This would both help people to determine which wines they're more likely to enjoy before purchasing, and help them find new wines based on similar wines they already like. 


**About the dataset:**
- I initially found this data set from the UCI Machine Learning Repository at: http://archive.ics.uci.edu/ml/datasets/Wine+Quality (citation: P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.)
- When downloading the CSV from UCI and uploading it to Kaggle, I found that the dataset already exists on Kaggle in a cleaner format. As such, while the original source is UCI Machine Learning Repository, I am using the wine-quality dataset uploaded to Kaggle by Aleix Dorca

In [None]:
#reading the wine-quality CSV into a dataframe, then previewing its first 15 rows
csv_path = "../input/wine-quality/winequality.csv"
wine_data = pd.read_csv(csv_path)
wine_data.head(n=15)

**Creating new dataframes based on each hypothesis:**
(this will help me to easily switch out the full dataframe name rather than naming individual columns from the main dataframe each time)

In [None]:
#Hypothesis 1
#columns needed: residual sugar (predictor) and quality (rating)
hyp_1_columns = ['residual sugar', 'quality']
hyp_1 = wine_data.loc[:, hyp_1_columns]
#(uncomment the next line to eyeball the frame)
#hyp_1

In [None]:
#Hypothesis 2
#columns needed: sulphates (predictor) and quality (rating)
hyp_2_columns = ['sulphates', 'quality']
hyp_2 = wine_data.loc[:, hyp_2_columns]
#(uncomment the next line to eyeball the frame)
#hyp_2

In [None]:
#Hypothesis 3
#columns needed: volatile acidity (predictor) and quality (rating)
hyp_3_columns = ['volatile acidity', 'quality']
hyp_3 = wine_data.loc[:, hyp_3_columns]
#(uncomment the next line to eyeball the frame)
#hyp_3

In [None]:
#Hypothesis 4a and 4b
#columns needed: alcohol, quality, and color (color is only used to separate red from white)
hyp_4 = wine_data.loc[:, ['alcohol', 'quality', 'color']]

#4a: keep only the rows whose color is 'red'
hyp_4_is_red = hyp_4['color'].eq('red')
hyp_4a = hyp_4.loc[hyp_4_is_red]

#4b: keep only the rows whose color is 'white'
hyp_4_is_white = hyp_4['color'].eq('white')
hyp_4b = hyp_4.loc[hyp_4_is_white]

In [None]:
#Hypothesis 5
#columns needed: citric acid (predictor) and quality (rating)
hyp_5_columns = ['citric acid', 'quality']
hyp_5 = wine_data.loc[:, hyp_5_columns]
#(uncomment the next line to eyeball the frame)
#hyp_5

**Answering Hypothesis 1: Larger amounts of residual sugar lead to lower quality ratings**

In [None]:
#importing necessary libraries

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

#building the arrays for an initial scatterplot
#column 0 of hyp_1 is residual sugar: the independent variable (predictor), plotted on the x axis
#column 1 of hyp_1 is quality: the dependent variable (response), plotted on the y axis
x = hyp_1.iloc[:, [0]].to_numpy() #list selector keeps the result 2-D (one column)
y = hyp_1.iloc[:, [1]].to_numpy()

fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(x, y, s=20)
ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Quality Rating')
plt.show()

In [None]:
#fitting a linear regression to the x / y arrays from the previous cell and drawing the best-fit line

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #predicted quality for every observed x value

fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(x, y, s=20)
ax.plot(x, y_pred, color='red') #the prediction line on top of the raw points
ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Quality Rating')
plt.show()

While there is not a perfect correlation, the prediction line does show a slight dip in quality ratings as more residual sugar is present in the various wines. 

I am curious how relevant this really is, though, so I found a method for determining the P value and R^2 value here: https://towardsdatascience.com/the-complete-guide-to-linear-regression-in-python-3d3f8f06bf8

In [None]:
#importing necessary libraries 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm

#OLS summary (coefficients, p-values, R^2) for quality regressed on residual sugar
X = hyp_1.loc[:, 'residual sugar']
y = hyp_1.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

The above results show that the p value is very low, but it is not 0. The R^2 value shows that only about 0.1% of variability in the quality ratings is due to the amount of residual sugar. Not non-existent, but low!

**Answering Hypothesis 2: Larger amounts of sulphates lead to higher quality ratings**

In [None]:
#the remaining hypotheses reuse the same recipe with different variables:
#a plain scatterplot first, then the same scatter with the fitted regression line

x = hyp_2.iloc[:, [0]].to_numpy() #sulphates, kept 2-D (one column)
y = hyp_2.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Sulphates')
ax_scatter.set_ylabel('Quality Rating')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x)

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='green') #predicted y for every value of x
ax_fit.set_xlabel('Sulphates')
ax_fit.set_ylabel('Quality Rating')
plt.show()

Once again, it looks like there is a weak correlation between the variables, which does show support for the hypothesis. 

In [None]:
#checking the p-value and R^2 for quality regressed on sulphates
#(the later hypotheses repeat this cell with the variables swapped)

X = hyp_2.loc[:, 'sulphates']
y = hyp_2.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

The p-value here is lower than in hypothesis 1, suggesting slightly stronger evidence of a relationship, but it is still not .000. Again, the r^2 value shows that only 0.1% of variance in quality rating is due to the amount of sulphates. 

**Answering Hypothesis 3: Volatile acidity is associated with higher quality ratings**

In [None]:
#scatterplot, then scatter + fitted regression line, for volatile acidity vs. quality
x = hyp_3.iloc[:, [0]].to_numpy() #volatile acidity, kept 2-D (one column)
y = hyp_3.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Volatile Acidity')
ax_scatter.set_ylabel('Quality Rating')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #stored so it can be handed to the plotting call below

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='orange') #predicted y for every value of x
ax_fit.set_xlabel('Volatile Acidity')
ax_fit.set_ylabel('Quality Rating')
plt.show()

Again, not a strong looking correlation, but this disproved my hypothesis that higher volatile acidity is correlated with higher quality ratings!

In [None]:
#OLS summary for quality regressed on volatile acidity (all wines)
X = hyp_3.loc[:, 'volatile acidity']
y = hyp_3.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

This actually has a much stronger correlation! The p-value is zero (or, at least close to it), and the r^2 value shows that 7.1% of variability in quality ratings is due to the volatile acidity.

I'm curious if this differs by type of wine. Going to test that below. 

In [None]:
#building per-color dataframes for the volatile acidity follow-up
#color is pulled in alongside the two analysis columns so the rows can be split by wine type
hyp_3_cont = wine_data.loc[:, ['volatile acidity', 'quality', 'color']]

#red wines only
hyp_3_is_red = hyp_3_cont['color'].eq('red')
hyp_3a = hyp_3_cont.loc[hyp_3_is_red]

#white wines only
hyp_3_is_white = hyp_3_cont['color'].eq('white')
hyp_3b = hyp_3_cont.loc[hyp_3_is_white]

In [None]:
#red wines: scatterplot, then scatter + fitted regression line
x = hyp_3a.iloc[:, [0]].to_numpy() #volatile acidity, kept 2-D (one column)
y = hyp_3a.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Volatile Acidity')
ax_scatter.set_ylabel('Quality Rating - Red Wine')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #stored so it can be handed to the plotting call below

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='red') #predicted y for every value of x
ax_fit.set_xlabel('Volatile Acidity')
ax_fit.set_ylabel('Quality Rating - Red Wine')
plt.show()

In [None]:
#OLS summary for quality regressed on volatile acidity (red wines only)
X = hyp_3a.loc[:, 'volatile acidity']
y = hyp_3a.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

Super interesting! For red wines, volatile acidity accounts for 15.3% of the variability in quality (with a negative correlation). 

In [None]:
#white wines: scatterplot, then scatter + fitted regression line
x = hyp_3b.iloc[:, [0]].to_numpy() #volatile acidity, kept 2-D (one column)
y = hyp_3b.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Volatile Acidity')
ax_scatter.set_ylabel('Quality Rating - White Wine')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #stored so it can be handed to the plotting call below

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='yellow') #predicted y for every value of x
ax_fit.set_xlabel('Volatile Acidity')
ax_fit.set_ylabel('Quality Rating - White Wine')
plt.show()

In [None]:
#OLS summary for quality regressed on volatile acidity (white wines only)
X = hyp_3b.loc[:, 'volatile acidity']
y = hyp_3b.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

For white wine, volatile acidity is still negatively correlated with quality ratings, but it only accounts for 3.8% of the variability in quality ratings for white wines. While my hypothesis was disproven, this shows that higher volatile acidity in red wines is more likely to lead to lower quality ratings than it is in white wines. 

**Answering Hypothesis 4a: For red wines, a higher alcohol content is associated with higher quality ratings**

In [None]:
#red wines: scatterplot, then scatter + fitted regression line, for alcohol vs. quality
x = hyp_4a.iloc[:, [0]].to_numpy() #alcohol, kept 2-D (one column)
y = hyp_4a.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Alcohol Content')
ax_scatter.set_ylabel('Red Wine Quality Rating')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #stored so it can be handed to the plotting call below

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='pink') #predicted y for every value of x
ax_fit.set_xlabel('Alcohol Content')
ax_fit.set_ylabel('Red Wine Quality Rating')
plt.show()

Looks like my hypothesis was accurate! This line shows that higher alcohol content is associated with higher quality ratings in red wines. 

In [None]:
#OLS summary for quality regressed on alcohol (red wines only)
X = hyp_4a.loc[:, 'alcohol']
y = hyp_4a.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

Cool, the p value is (close to) zero, and the R^2 shows that 22.7% of variability in quality ratings is due to the alcohol content - the highest so far. 

**Answering Hypothesis 4b: For white wines, a higher alcohol content is associated with lower quality ratings**

In [None]:
#white wines: scatterplot, then scatter + fitted regression line, for alcohol vs. quality
x = hyp_4b.iloc[:, [0]].to_numpy() #alcohol, kept 2-D (one column)
y = hyp_4b.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Alcohol Content')
ax_scatter.set_ylabel('White Wine Quality Rating')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #stored so it can be handed to the plotting call below

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='yellow') #predicted y for every value of x
ax_fit.set_xlabel('Alcohol Content')
ax_fit.set_ylabel('White Wine Quality Rating')
plt.show()

Again, a positive correlation is shown. This goes against my hypothesis. 

In [None]:
#OLS summary for quality regressed on alcohol (white wines only)
X = hyp_4b.loc[:, 'alcohol']
y = hyp_4b.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

Although my hypothesis was wrong, this shows that only 19% of variability in quality ratings for white wines is due to the alcohol content, lower than that for red wine. 

**Answering Hypothesis 5: Larger amounts of citric acid lead to higher quality ratings**

In [None]:
#scatterplot, then scatter + fitted regression line, for citric acid vs. quality
x = hyp_5.iloc[:, [0]].to_numpy() #citric acid, kept 2-D (one column)
y = hyp_5.iloc[:, [1]].to_numpy() #quality, kept 2-D (one column)

fig_scatter, ax_scatter = plt.subplots(figsize=(10,10))
ax_scatter.scatter(x, y, s=20)
ax_scatter.set_xlabel('Citric Acid')
ax_scatter.set_ylabel('Quality Rating')

model = LinearRegression().fit(x, y)
y_pred = model.predict(x) #stored so it can be handed to the plotting call below

fig_fit, ax_fit = plt.subplots(figsize=(10,10))
ax_fit.scatter(x, y, s=20)
ax_fit.plot(x, y_pred, color='purple') #predicted y for every value of x
ax_fit.set_xlabel('Citric Acid')
ax_fit.set_ylabel('Quality Rating')
plt.show()

While the correlation does not look strong, this does support the hypothesis that larger amounts of citric acid are associated with higher quality ratings. 

In [None]:
#OLS summary for quality regressed on citric acid
X = hyp_5.loc[:, 'citric acid']
y = hyp_5.loc[:, 'quality']
X2 = sm.add_constant(X) #adds the constant (intercept) column to the predictor
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

As thought, only 0.7% of variability in quality ratings is due to the amount of citric acid. 

Based on these results, I think it is clear that quality ratings cannot be determined accurately by only looking at one variable. 

The paper that was referenced in the dataset section at the top of the notebook discussed the use of Support Vector Machine method to predict quality. The SVM algorithm is useful because it can be used for both regression and classification models, both of which are listed as associated tasks with this dataset on the UCI page. 

I will attempt this to see if it allows for more clear or accurate predictions, using the method found here: https://intellipaat.com/blog/tutorial/machine-learning-tutorial/svm-algorithm-in-python/#Polynomial-SVM-Kernele

In [None]:
#re-examining the dataset
#the bare dataframe name is the cell's last expression, so the notebook renders it as the cell output
wine_data

In [None]:
#checking the data shape
#.shape is a (number of rows, number of columns) tuple
wine_data.shape

In [None]:
#defining the features (X) and the target (y)
#quality is dropped from X because it is the target; color is dropped because it is not numeric
#and is not one of the variables being factored into quality
non_feature_columns = ['quality', 'color']
X = wine_data.drop(columns=non_feature_columns)
y = wine_data.loc[:, 'quality']

In [None]:
#displaying the feature dataframe X (wine_data without the 'quality' and 'color' columns)
X

In [None]:
#displaying the target y (the 'quality' column of wine_data)
y

In [None]:
#splitting the dataset into train and test using sklearn before building the SVM algorithm model
#test_size = 0.20 holds out 20% of the rows for evaluating the model
#random_state pins the shuffle, so a fresh "Restart & Run All" reproduces the same split and
#therefore the same confusion matrix / classification report that the write-up below discusses
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [None]:
#importing the support vector classifier/SVC function, building the SVM model with help of SVC classifier
#the classifier uses a linear kernel and is fitted on the training split only (X_train, y_train)
#NOTE(review): the features are passed in unscaled; a linear-kernel SVC may train slowly on
#unscaled columns - consider standardising the features first (TODO confirm on this dataset)
from sklearn.svm import SVC
svclassifer = SVC(kernel='linear')
#the bare fit call is the cell's last expression, so the notebook shows its return value as output
svclassifer.fit(X_train, y_train)

In [None]:
#predicting values for the held-out test features using the fitted SVM model
#note: this reuses the name y_pred from the regression cells above, replacing those fitted values
y_pred = svclassifer.predict(X_test)

In [None]:
#evaluating the SVM model on the held-out test split
from sklearn.metrics import classification_report, confusion_matrix

#counts of true quality ratings (rows) against predicted quality ratings (columns)
svm_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(svm_confusion)

#text report of the per-rating classification metrics
svm_report = classification_report(y_true=y_test, y_pred=y_pred)
print(svm_report)

The above confusion matrix and classification report show an analysis of the SVM results for both red and white wine combined, by quality rating (3-9). This shows that predictions are most accurate for the middle scores (5-7) and get less accurate as the ratings go lower or higher. 

This is a bit of a black box, however. I'm interested to know the top contributing factors to the SVM model, which are displayed in the academic article referenced earlier in the notebook that the dataset comes from. HOWEVER, I am having a really hard time finding methods to do this which build off of the model I created above. I even read through the scikit-learn documentation for SVM, but could not find a method that would work with my dataframe type, number of variables, previously used code, etc. 

Some attempts in the next few code boxes. I will admit, I could not fully understand where all of these variables and methods were pulling from, but I wanted to show my attempts!

In the end, I think I would adopt a different method to determine top contributing factors, and to do multi-variate regression analysis. However, this was very interesting, and I hadn't heard of SVM models before, so it was cool to try out! And, it did give me some new information. Mainly, that the middle quality ratings (5-7) are better predicted via the variables than the lower or higher end quality ratings. However, this also may be because there are more samples of the middle quality ratings than there are of the higher and lower. 

In [None]:
#method number 1 attempted
#NOTE(review): left commented out on purpose - it cannot run as written in this notebook:
#  - `vectorizer` is never defined in the visible cells (it looks carried over from a
#    text-classification example - TODO confirm); the feature names here would be X.columns
#  - `model.coef_[0]` reads only the first row of coef_; presumably a multi-class linear SVC
#    has more than one row - verify against the scikit-learn SVC documentation
#  - it refits on the full X, y rather than on the X_train / y_train split made earlier

#from sklearn import svm

#model=svm.SVC(kernel='linear')

#a=model.fit(X,y)
#model.score(X,y)

#feature_names = vectorizer.get_feature_names() 
#coefs_with_fns = sorted(zip(model.coef_[0], feature_names)) 
#df=pd.DataFrame(coefs_with_fns)
#df.columns='coefficient','word'
#df.sort_values(by='coefficient')

In [None]:
#method number 2 attempted
#NOTE(review): left commented out on purpose - it cannot run as written in this notebook:
#  - X is a pandas DataFrame here, so `X[2]` looks up a column labelled 2 rather than the
#    third row; a row would need X.iloc[2] (TODO confirm the intended sample)
#  - the hard-coded 30 in reshape(1,30) / reshape((30,)) presumably comes from an example
#    with 30 features; verify against the number of columns in X
#  - `feature_contrib_bar` is never defined in the visible cells, so the final plt.show call
#    has nothing valid to receive

#from sklearn import svm
#import numpy as np
#import matplotlib.pyplot as plt

#lin_clf = svm.SVC(kernel='linear')
#lin_clf.fit(X,y)

#variables = np.dot(X, lin_clf.coef_.T)

#lin_clf.predict(X[2].reshape(1,30))

#contributions = np.multiply(X[2], lin_clf.coef_.reshape((30,)))
#feature_number = np.arange(len(contributions)) +1

#plt.bar(feature_number, contributions, align='center')
#plt.xlabel('feature index')
#plt.ylabel('score contribution')
#plt.title('contribution to classification outcome by feature index')
#plt.show(feature_contrib_bar)