In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Read the red-wine quality CSV into the DataFrame used by the rest of the notebook.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first few rows to sanity-check that the file loaded as expected.
data.head()

In [None]:
# Print the frame summary (column names, non-null counts, dtypes).
data.info()

12 columns with no missing values

### Data exploration and visualization

In [None]:
# One histogram per column, to eyeball each variable's distribution.
data.hist(figsize=(12, 10), bins=15)
plt.show()

#### Let us look at the relationship between quality and alcohol: does quality increase as the alcohol percentage increases?

In [None]:
# Mean alcohol content for each quality score.
# Passing the raw columns to plt.bar draws one bar per row, all overlapping at
# the same handful of x positions, so only the tallest bar (the maximum) per
# score stays visible. Aggregating first makes each bar a meaningful summary.
mean_alcohol_by_quality = data.groupby('quality')['alcohol'].mean()

plt.bar(mean_alcohol_by_quality.index, mean_alcohol_by_quality.values)
plt.xlabel('Quality')
plt.ylabel('Mean alcohol')
plt.show()

In [None]:
# Distribution of alcohol content within each quality score.
plt.figure(figsize=(12, 8))
sns.boxplot(data=data, x='quality', y='alcohol')
plt.title('Quality vs Alcohol percent')
plt.show()

#### Higher-quality wines tend to have a higher alcohol percentage

##### Which features have the strongest relationship with quality?

In [None]:
# Pairwise correlations between all columns, drawn on a diverging colour
# scale fixed to [-1, 1] and centred on zero.
correlations = data.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlations,
    annot=True,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    linewidths=3,
    linecolor='grey',
)
plt.title('Wine Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Distribution of volatile acidity within each quality score.
plt.figure(figsize=(12, 8))
sns.boxplot(data=data, x='quality', y='volatile acidity')
# Title and plt.show() added so the figure stands alone and the cell does not
# echo the Axes repr, matching the other plotting cells.
plt.title('Quality vs Volatile acidity')
plt.show()

In [None]:
# Distribution of sulphates within each quality score.
plt.figure(figsize=(12, 8))
sns.boxplot(data=data, x='quality', y='sulphates')
# Title and plt.show() added so the figure stands alone and the cell does not
# echo the Axes repr, matching the other plotting cells.
plt.title('Quality vs Sulphates')
plt.show()

### Data Pre-Processing

In [None]:
# Collapse the numeric quality score into two classes:
# scores 3-5 become 'bad', scores 6-8 become 'good'.
# NOTE: this overwrites the column, so the cell must run exactly once per
# fresh load of `data` (a second run would map the strings to NaN).
quality_to_label = {
    **dict.fromkeys((3, 4, 5), 'bad'),
    **dict.fromkeys((6, 7, 8), 'good'),
}
data['quality'] = data['quality'].map(quality_to_label)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integers. LabelEncoder orders classes by sorted
# label value, so 'bad' -> 0 and 'good' -> 1.
label_encoder = LabelEncoder()
data['quality'] = label_encoder.fit_transform(data['quality'])

# Show the class balance. value_counts has to be *called*; without the
# parentheses the cell only displays the bound-method object.
data['quality'].value_counts()



In [None]:
# Separate the target from the features.
# Both are selected by column name so the split does not silently depend on
# 'quality' happening to be the last column of the frame.
y = data['quality']
X = data.drop(columns='quality')

# Preview a few feature rows rather than rendering the whole frame.
X.head()


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
Scaled_X_train = scaler.transform(X_train)
Scaled_X_test = scaler.transform(X_test)

### Modelling

#### Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score

# Baseline model: logistic regression with default settings, trained on the
# standardised features.
lr = LogisticRegression().fit(Scaled_X_train, y_train)
y_pred = lr.predict(Scaled_X_test)

# Accuracy on both splits; a large gap between them would indicate overfitting.
train_accuracy = lr.score(Scaled_X_train, y_train)
test_accuracy = lr.score(Scaled_X_test, y_test)
print("Training accuracy :", train_accuracy)
print("Testing accuracy :", test_accuracy)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees. random_state is pinned because the bootstrap
# samples and per-split feature subsets are random; without a seed the
# reported scores change on every run.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)

# Train on the standardised training split.
rfc.fit(Scaled_X_train, y_train)

# Predict the held-out test split.
y_pred = rfc.predict(Scaled_X_test)

# Accuracy on both splits; a large gap between them would indicate overfitting.
print("Training accuracy :", rfc.score(Scaled_X_train, y_train))
print("Testing accuracy :", rfc.score(Scaled_X_test, y_test))

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))


### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 7 closest training points.
knn = KNeighborsClassifier(n_neighbors=7).fit(Scaled_X_train, y_train)
y_pred = knn.predict(Scaled_X_test)

# Accuracy on both splits.
train_accuracy = knn.score(Scaled_X_train, y_train)
test_accuracy = knn.score(Scaled_X_test, y_test)
print("Training accuracy :", train_accuracy)
print("Testing accuracy :", test_accuracy)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))



### SVM 

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Linear SVM with hinge loss, wrapped in a pipeline that does its own scaling.
# The pipeline is fed the *raw* feature splits: it already contains a
# StandardScaler, so passing the pre-scaled arrays would standardise the data
# a second time and make the scaler step pointless.
# random_state is pinned because LinearSVC's solver is randomised.
svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge', random_state=42)),
])

svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

# Accuracy on both splits (the pipeline scales inside score() as well).
print("Training accuracy :", svm_clf.score(X_train, y_train))
print("Testing accuracy :", svm_clf.score(X_test, y_test))

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))
