In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red-wine quality CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)

    The source of the data is the UC Irvine Machine Learning Repository, so this dataset comes from a university that provides reliable data for students to practice machine learning techniques. In this data, there are 11 columns of measured indicators that influence overall quality, which is the 12th column. The quality is a discrete (integer-valued) variable in the range [3, 8].
    I chose this dataset because I am interested in which indicators influence the quality of the wine. You always hear about wine critics and how that profession is sometimes not the best, to put it lightly. However, is there some science to the idea that someone can taste wine and understand its quality? Furthermore, can a computer be trained to predict wine quality, so that you can efficiently make the best wine possible?

# Explore the Data

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Show the dtype of every column in the loaded frame.
data.dtypes

In [None]:
# Number of non-null entries per column for each quality score.
data.groupby(by='quality').count()

Since the range of quality is [3,8], the cut off for 'good' wine will be 6.5

In [None]:
# Per-column mean of every measurement, grouped by quality score.
data.groupby(by='quality').mean()

It appears that a roughly linear relationship exists between quality and volatile acidity, citric acid, chlorides, and sulphates. These features will be important in how the algorithms predict quality.

# Transform the Quality Column and Visualize the Data

In [None]:
# Binary target: 1 when the quality score is at least 6.5 ("good"), else 0.
data['QualityB'] = (data['quality'] >= 6.5).astype(int)
# Modelling frame: every measurement plus the binary target, without the raw score.
dataset = data.drop(columns='quality')
# Class balance of the new target.
dataset.groupby(by='QualityB').count()


In [None]:
# Heatmap of the pairwise correlations between all columns of the modelling frame.
correlations = dataset.corr()
sns.heatmap(correlations)

In [None]:
# Same correlations as a table (Pearson is the pandas default).
dataset.corr(method='pearson')

From the heatmap and the correlation table, alcohol has the strongest correlation with quality, with volatile acidity, citric acid, and sulphates close behind.

In [None]:
# Alcohol distribution per quality score, coloured by the binary good/bad label.
sns.catplot(
    data=data,
    x="quality",
    y="alcohol",
    hue="QualityB",
    kind="violin",
)

In [None]:
# Volatile-acidity distribution per quality score, coloured by the binary label.
sns.catplot(
    data=data,
    x="quality",
    y="volatile acidity",
    hue="QualityB",
    kind="violin",
)

In [None]:
# Citric-acid distribution per quality score, coloured by the binary label.
sns.catplot(
    data=data,
    x="quality",
    y="citric acid",
    hue="QualityB",
    kind="violin",
)

In [None]:
# Sulphates distribution per quality score, coloured by the binary label.
sns.catplot(
    data=data,
    x="quality",
    y="sulphates",
    hue="QualityB",
    kind="violin",
)

In [None]:
# Pairwise scatter/density plots of the four highlighted features, split by class.
pairplot_columns = ["alcohol", "volatile acidity", "citric acid", "sulphates"]
sns.pairplot(
    data.drop(columns="quality"),
    hue="QualityB",
    vars=pairplot_columns,
)

The four columns visualized above give insight into where the quality is good (QualityB = 1) or bad (QualityB = 0). These columns have the strongest correlation with quality, so they help in understanding what factors into making a quality wine.

# Split the dataset into train and test sets

In [None]:
# Features: every column except the last one of the modelling frame.
X = dataset.iloc[:, :-1]
# Target: the binary good/bad label.
y = dataset["QualityB"]
X.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

# Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Standardise the features with statistics learned from the training split only.
# The scaled arrays get their own names so the raw X_train / X_test split is not
# overwritten and this cell can be re-run without changing its inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a default logistic regression on the scaled training data.
LogReg = LogisticRegression()
LogReg.fit(X_train_scaled, y_train)
y_pred = LogReg.predict(X_test_scaled)

# Mean accuracy on the held-out (scaled) test split.
print("Accuracy:", LogReg.score(X_test_scaled, y_test))

# Decision Tree Classification:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Recreate the unscaled 75/25 split (same seed as before) for the tree model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

In [None]:
# Entropy-based decision tree with a fixed seed; fit() returns the estimator itself.
clf = DecisionTreeClassifier(criterion='entropy', random_state=99).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Share of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Draw the top three levels of the fitted tree on a large canvas.
fig, ax = plt.subplots(figsize=(25, 15))
tree.plot_tree(
    decision_tree=clf,
    max_depth=3,
    fontsize=12,
    ax=ax,
);

# Random Forest Classifier:

In [None]:
# Same seeded 75/25 split, recreated for the random forest section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees with a fixed seed; fit() returns the estimator itself.
rf_cl = RandomForestClassifier(n_estimators=100, random_state=99).fit(X_train, y_train)
rf_pred = rf_cl.predict(X_test)

# Accuracy = matching predictions divided by the number of test rows.
n_matches = np.sum(rf_pred == y_test)
accuracy = float(n_matches) / y_test.shape[0]
print("accuracy: %f" % (accuracy))

# XGBoost Classification:

In [None]:
import xgboost as xgb

In [None]:
# Same seeded 75/25 split, recreated for the XGBoost section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

In [None]:
# Gradient-boosted trees for the binary target, with a fixed seed.
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    verbosity=0,
    use_label_encoder=False,
    max_depth=10,
    n_estimators=100,
    seed=99,
)
xg_cl.fit(X_train, y_train)
preds = xg_cl.predict(X_test)

# Accuracy = matching predictions divided by the number of test rows.
n_matches = float(np.sum(preds == y_test))
print("accuracy: %f" % (n_matches / y_test.shape[0]))

In [None]:
# Bar chart of feature importances for the fitted booster ('weight' importance type).
xgb.plot_importance(
    xg_cl,
    importance_type='weight',
    grid=False,
    height=0.5,
)

# Neural Network with Keras:

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import BatchNormalization

In [None]:
# Same seeded 75/25 split, recreated for the Keras section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

In [None]:
# Keras draws random initial weights on every run, so without a seed each
# execution yields different numbers. Pin the seeds to make results repeatable.
np.random.seed(99)
tf.random.set_seed(99)

# SGD is sensitive to feature scale: standardise using statistics learned from
# the training split only. Separate names keep the raw split untouched.
nn_scaler = StandardScaler()
X_train_nn = nn_scaler.fit_transform(X_train)
X_test_nn = nn_scaler.transform(X_test)

# Small fully connected network: 11 inputs -> 8 -> 4 -> 2 -> 1.
# Hidden layers use ReLU; softmax there would force each layer's activations
# to sum to one, which is only meaningful for a multi-class output layer.
model=Sequential()
model.add(Dense(8,input_shape=(11,),activation='relu'))
model.add(Dense(4,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(2,activation='relu'))
model.add(BatchNormalization())
# One sigmoid unit outputs P(QualityB == 1). Softmax over a single unit is
# always exactly 1.0, so the network could never predict the negative class.
model.add(Dense(1,activation='sigmoid'))

# Without `metrics`, evaluate() returns only the loss value, which is easy to
# misread as an accuracy. Request accuracy explicitly.
model.compile(optimizer='sgd',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train_nn,y_train,epochs=5,validation_split=0.2)

# Predicted probabilities for the test rows, then [loss, accuracy] on the test split.
preds=model.predict(X_test_nn)
model.evaluate(X_test_nn,y_test)

# Neural Network with Pytorch:

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Same seeded 75/25 split as the earlier sections.
# NOTE(review): the PyTorch cells below iterate over DataLoaders built from
# nnData, not over these arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

In [None]:
class nnData(Dataset):
    """Torch dataset over a tabular frame whose last column is the target.

    All columns except the last become the float32 feature tensor ``x``
    (shape ``(n_samples, n_columns - 1)``); the last column becomes the
    float32 target tensor ``y`` with shape ``(n_samples, 1)``.

    Args:
        frame: Optional DataFrame to wrap. When omitted, the notebook-level
            ``dataset`` frame is used, so ``nnData()`` behaves as before.
    """
    def __init__(self, frame=None):
        # Fall back to the notebook-level frame to stay backward-compatible.
        source = dataset if frame is None else frame
        xy=source.to_numpy(dtype=np.float32)
        self.x=torch.from_numpy(xy[:,:-1])
        # Indexing with [-1] (a list) keeps the target two-dimensional: (n, 1).
        self.y=torch.from_numpy(xy[:,[-1]])
        self.n_samples=xy.shape[0]

    def __getitem__(self,index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.n_samples

In [None]:
class Net(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear with a single output.

    Args:
        n_inputs: Number of input features.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, n_inputs, hidden_size):
        super().__init__()
        # Keep the layer sizes on the instance for later inspection.
        self.inputs = n_inputs
        self.hidden = hidden_size

        self.fc1 = nn.Linear(in_features=self.inputs, out_features=self.hidden)
        self.fc2 = nn.Linear(in_features=self.hidden, out_features=1)

    def forward(self, x):
        """Return the raw (un-activated) output of the second linear layer."""
        hidden_activation = F.relu(self.fc1(x))
        return self.fc2(hidden_activation)

In [None]:
nn_Data=nnData()

# Evaluate on rows the network has not trained on: hold out 25% of the samples
# with a seeded random split instead of wrapping the full dataset in both loaders.
n_test=int(round(0.25*len(nn_Data)))
n_train=len(nn_Data)-n_test
train_subset,test_subset=torch.utils.data.random_split(
    nn_Data,
    [n_train,n_test],
    generator=torch.Generator().manual_seed(99),
)

# Mini-batches of 32 for training; the test loader uses the default batch size of 1.
trainloader=DataLoader(dataset=train_subset,batch_size=32,num_workers=1)
testloader=DataLoader(dataset=test_subset,num_workers=1)

In [None]:
# Seed torch so the initial weights are identical on every run.
torch.manual_seed(99)
net=Net(n_inputs=11,hidden_size=10)

# Net emits one raw logit per sample and the labels are a float column of 0/1,
# so binary cross-entropy on logits is the matching loss. CrossEntropyLoss over
# a single logit is identically zero (softmax of one value is 1), and argmax
# over a one-column label tensor is always 0, so that pairing produces no
# gradient and the weights never change.
criterion= nn.BCEWithLogitsLoss()
optimizer= optim.Adam(params=net.parameters(),lr=3e-4)

net.train()
for epoch in range(1):
    for inputs,labels in trainloader:
        # outputs and labels both have shape (batch, 1).
        outputs=net(inputs)
        loss=criterion(outputs,labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
correct, total=0,0
predictions=[]
net.eval()
# No gradients are needed while scoring.
with torch.no_grad():
    for inputs,labels in testloader:
        outputs=net(inputs)
        # Net has a single output column, so torch.max(outputs, 1) would always
        # return index 0 and every sample would be predicted as class 0.
        # Threshold the logit instead: logit > 0 is equivalent to sigmoid > 0.5.
        predicted=(outputs>0).to(labels.dtype)
        predictions.append(outputs)
        total +=labels.size(0)
        correct +=(predicted==labels).sum().item()

In [None]:
# Fraction of evaluated samples whose prediction matched the label.
print(correct/total)

# Comparison and Discussion 

Through all these techniques, the most accurate model is the Random Forest Classifier at 0.9125. This makes sense, as it uses many decision trees and 'votes'. Since the Decision Tree model was the third most accurate at 0.8825, using many of them and taking the mode of their predictions should show promising results. However, it was somewhat surprising that XGBoost was not the most accurate, at 0.907. It was my belief that it would be, based on less knowledge of the topic and my belief that a tree would not be the strongest predictor to use for the features. Then, the logistic regression model was the fourth most accurate at 0.88. The fifth most accurate model was the Pytorch Neural Network at 0.864. Then finally the Keras Neural Network model at 0.466. I am not surprised that the neural networks were the weakest models. I found that the choice of the optimizers and activation functions made the overall models weak, since I was not sure exactly which of them to use. Possibly neural networks could be the strongest models, as 0.9125 is not the highest accuracy possible; however, using different functions and a better understanding of how to utilize them would be needed.

Furthermore, something I would like to pose as a question (maybe I can get the answer in my feedback for this project): I am confused as to why my Keras Neural Network gives me a different accuracy score every time. All the other models return the same scores, so my intuition that it was due to how the data is split doesn't hold. It is probably easily explained; I am just curious as to why it happens and how it can be made consistent.