In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory (path is relative to the notebook).
data = pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Correlation of every column with the `quality` score.
data.corr()['quality']

In [None]:
# Heatmap of the full column-by-column correlation matrix.
sns.heatmap(data.corr())

In [None]:
# Count zero-valued entries per column. The boolean mask is summed directly:
# `data[data == 0].sum()` adds up the zeros themselves and is 0 for every column.
(data == 0).sum()

In [None]:
# Missing values per column.
data.isnull().sum()

In [None]:
# Number of distinct values per column.
data.nunique()

# **Pairplot**

In [None]:
# Pairwise relationships between all columns.
sns.pairplot(data)

In [None]:
# Feature columns used for plotting (everything except the `quality` target).
columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Scatterplot

In [None]:
# Plot each feature's values against row position to eyeball outliers.
for feature in columns:
    # A Series passed as `data` is drawn wide-form: the index goes on the
    # x-axis and the values on the y-axis, so the feature name labels y.
    ax = sns.scatterplot(data=data[feature])
    ax.set(xlabel='row index', ylabel=feature)
    plt.show()

In [None]:
# Replace the 4 largest `chlorides` values with the mode of the remaining rows.
outlier_idx = data['chlorides'].sort_values().index[-4:]
# mode() returns a Series (ties give several entries), so take the first entry
# and keep it as-is: int() would discard the fractional part of the value.
replacement = data['chlorides'].drop(outlier_idx).mode().iloc[0]
# Single .loc assignment writes into `data` itself; the chained form
# `data[col][rows] = ...` may assign to a temporary copy instead.
data.loc[outlier_idx, 'chlorides'] = replacement

In [None]:
# Replace the 2 largest `total sulfur dioxide` values with the mode of the remaining rows.
outlier_idx = data['total sulfur dioxide'].sort_values().index[-2:]
# mode() returns a Series (ties give several entries), so take the first entry
# and keep it as-is: int() would discard any fractional part of the value.
replacement = data['total sulfur dioxide'].drop(outlier_idx).mode().iloc[0]
# Single .loc assignment writes into `data` itself; the chained form
# `data[col][rows] = ...` may assign to a temporary copy instead.
data.loc[outlier_idx, 'total sulfur dioxide'] = replacement

In [None]:
# Replace the 7 largest `sulphates` values with the mode of the remaining rows.
outlier_idx = data['sulphates'].sort_values().index[-7:]
# mode() returns a Series (ties give several entries), so take the first entry
# and keep it as-is: int() would discard the fractional part of the value.
replacement = data['sulphates'].drop(outlier_idx).mode().iloc[0]
# Single .loc assignment writes into `data` itself; the chained form
# `data[col][rows] = ...` may assign to a temporary copy instead.
data.loc[outlier_idx, 'sulphates'] = replacement

In [None]:
# Replace the 11 largest `residual sugar` values with the mean of the remaining rows.
outlier_idx = data['residual sugar'].sort_values().index[-11:]
# Keep the mean as a float: int() would discard its fractional part.
replacement = data['residual sugar'].drop(outlier_idx).mean()
# Single .loc assignment writes into `data` itself; the chained form
# `data[col][rows] = ...` may assign to a temporary copy instead.
data.loc[outlier_idx, 'residual sugar'] = replacement

In [None]:
# Re-plot every feature against row position after the outlier replacement.
for feature in columns:
    # A Series passed as `data` is drawn wide-form: the index goes on the
    # x-axis and the values on the y-axis, so the feature name labels y.
    ax = sns.scatterplot(data=data[feature])
    ax.set(xlabel='row index', ylabel=feature)
    plt.show()

# Barplot

In [None]:
# One bar chart per feature: the feature aggregated within each quality score.
for feature in columns:
    sns.barplot(data=data, x='quality', y=feature)
    plt.show()

# Boxplot

In [None]:
# One box plot per feature: its distribution within each quality score.
for feature in columns:
    sns.boxplot(data=data, x='quality', y=feature)
    plt.show()

**Transforming Quality Column**

In [None]:
def quality_index(x):
    """Binarise a quality score: 1 when ``x`` is above 6, otherwise 0."""
    return 1 if x > 6 else 0

In [None]:
# Overwrite `quality` with its 0/1 label. Run this cell once per kernel session:
# a second pass maps every value to 0, because 0 and 1 are never above 6.
data['quality'] = data['quality'].map(quality_index)

In [None]:
# Box plots again, now grouped by the binary quality label.
for feature in columns:
    sns.boxplot(data=data, x='quality', y=feature)
    plt.show()

In [None]:
# Preview the frame after `quality` was replaced by the binary label.
data.head()

In [None]:
# Row count for each value of the binary target.
data.quality.value_counts()

# Modeling

In [None]:
# Separate the dataset into the response variable (y) and the feature variables (X).
y = data['quality']
X = data.drop(columns='quality')

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardiser for the feature matrix; it is fitted in the following cell.
scaler = StandardScaler()

In [None]:
# Standardise the features; `X` becomes the array returned by fit_transform.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling — consider
# fitting on the training split only.
X = scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto their principal components (all kept).
pca = PCA()
x_pca = pca.fit_transform(X)

# Cumulative share of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..k rather than the default 0-based positions, so the x-axis
# reads as "number of components kept".
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(5, 4))
plt.plot(component_counts, cumulative_variance, 'ro-')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance ratio')
plt.title('PCA cumulative explained variance')
plt.grid()
plt.show()

In [None]:
# Hold out 20% of the PCA-transformed rows for testing; the seed is fixed so
# the split is the same on every run.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_pca,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [None]:
# Seed the randomised estimators so a fresh "Restart & Run All" reproduces the
# same scores; 42 matches the random_state used for the train/test split.
SEED = 42

lr = LogisticRegression()

svc = SVC(C=1.2, kernel='rbf')

rfc = RandomForestClassifier(random_state=SEED)

dtc = DecisionTreeClassifier(random_state=SEED)

knn = KNeighborsClassifier()

xgb = XGBClassifier(random_state=SEED)

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score,precision_score
from sklearn.metrics import recall_score,f1_score, confusion_matrix, roc_curve, auc
def train_model(model):
    """Fit ``model`` on the training split and report its test-set performance.

    Reads the notebook globals ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. Prints accuracy, precision, recall, F1 and ROC AUC (each
    multiplied by 100), prints the confusion matrix, and draws the ROC curve.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. ``predict_proba`` or
        ``decision_function`` is used for the ROC computations when available.

    Returns
    -------
    None
    """
    model = model.fit(x_train, y_train)
    pred = model.predict(x_test)

    # ROC analysis needs a continuous score per sample; hard 0/1 predictions
    # collapse the curve to a single operating point. Prefer class-1
    # probabilities, then the decision function, and only then the labels.
    if hasattr(model, 'predict_proba'):
        # Column 1 is the second entry of model.classes_ (label 1 for a 0/1 target).
        scores = model.predict_proba(x_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(x_test)
    else:
        scores = pred

    # Threshold-based metrics use the hard predictions.
    print('accuracy_score',accuracy_score(y_test, pred)*100)
    print('precision_score',precision_score(y_test, pred)*100)
    print('recall_score',recall_score(y_test, pred)*100)
    print('f1_score',f1_score(y_test, pred)*100)
    print('roc_auc_score',roc_auc_score(y_test, scores)*100)
    # confusion matrix
    print('confusion_matrix')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))

    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)*100

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal reference line: a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
# Logistic regression.
train_model(lr)

In [None]:
# Support vector classifier (RBF kernel).
train_model(svc)

In [None]:
# Decision tree.
train_model(dtc)

In [None]:
# k-nearest neighbours.
train_model(knn)

In [None]:
# XGBoost classifier.
train_model(xgb)

In [None]:
# Random forest.
train_model(rfc)