In [1]:
import warnings

warnings.simplefilter("ignore")

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Red-wine quality dataset, read straight from its hosted CSV file.
DATA_URL = "https://raw.githubusercontent.com/dsrscientist/DSData/master/winequality-red.csv"
df = pd.read_csv(DATA_URL)

In [None]:
df

In [None]:
df.shape

In [None]:
# There are a total of 1599 rows and 12 columns in our dataset.


In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
#we do not see any missing values in any of the columns of our dataset

In [None]:
df.info()

In [None]:
# All the feature columns are of float datatype

In [None]:
df.describe()

In [None]:
df.skew() # per-column skewness; values within roughly +/-0.5 are taken here as the acceptable range

In [None]:
#visualization

In [None]:
# How many rows fall under each quality score.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(x='quality', data=df, ax=ax)
ax.set_xlabel('Quality of Red Wine')
ax.set_ylabel('Count of Rows in the dataset')
plt.show()

In [None]:
# The counts for quality scores 5 and 6 are far higher than those of the other scores

In [None]:
# Label column and the feature columns it is plotted against.
labels = df['quality']
features = df.drop('quality', axis=1)

# One figure per feature: the feature's value on the y axis for each quality
# score on the x axis. Iterating over the column names directly replaces the
# former `col[index]` tuple lookup.
for feature_name in features.columns:
    plt.figure(figsize=(10, 5))
    sns.barplot(x=labels, y=feature_name, data=df, color="deeppink")
    # tight_layout only acts on the current figure, so it has to run inside
    # the loop to be applied to every plot rather than just the last one.
    plt.tight_layout()
plt.show()


In [None]:
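# We can conclude that the citric acid, sulphates and alcohol columns are the ones most associated with better quality wine (the original note was cut off here; confirm against the bar plots above)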

In [None]:
# Box plot of every column, one per axis of a 2 x 6 grid.
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(15, 10))
ax = ax.flatten()
for index, col in enumerate(df.columns):
    sns.boxplot(y=col, data=df, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()

In [None]:
# Filled density curve of every column, one per axis of a 2 x 6 grid.
# sns.distplot is deprecated, as is the `shade` KDE option it was given here;
# sns.kdeplot with fill=True is the supported way to draw the same curve.
fig, ax = plt.subplots(ncols=6, nrows=2, figsize=(15,10))
ax = ax.flatten()
for index, (col, value) in enumerate(df.items()):
    sns.kdeplot(x=value, ax=ax[index], color="g", fill=True)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()


In [None]:
#Correlation using a Heatmap

In [None]:
# Pairwise correlations, computed once and reused for the plot.
corr = df.corr()
# Boolean mask that hides the lower triangle and the diagonal. Building it from
# ones (not from the correlation values themselves) keeps a cell masked even if
# its correlation happens to be exactly 0.
lower_triangle = np.tril(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(15,10))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, square=True, fmt='0.3f',
            annot_kws={'size':10}, cmap="Spectral", mask=lower_triangle)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
#Dropping a column

In [None]:
# Remove the 'free sulfur dioxide' feature from the working frame.
# NOTE(review): presumably chosen from the correlation heatmap — confirm the rationale.
df = df.drop(columns=['free sulfur dioxide'])
df

In [None]:
#Outlier removal

In [None]:
df.shape

In [None]:
%matplotlib inline
import scipy.stats as stats
from scipy.stats import zscore

In [None]:
# Z score method: keep only the rows whose absolute z-score is below the
# threshold in every column.
# NOTE(review): the z-scores cover every column of df, which still includes the
# 'quality' label at this point — confirm that is intended.
threshold = 3
z = np.abs(zscore(df))

# Use the named threshold instead of repeating the literal.
df = df[(z < threshold).all(axis=1)]
df

In [None]:
df.shape

In [None]:
# Percentage of rows discarded by the z-score filter.
# `z` was computed on the frame before filtering, so its length is the
# pre-filter row count; `df` is the filtered frame. Deriving both counts from
# the data keeps the figure correct if the dataset or threshold changes.
rows_before = len(z)
rows_after = len(df)
data_loss = (rows_before - rows_after) / rows_before * 100
data_loss

In [None]:
#Splitting the dataset into 2 variables namely 'X' and 'Y' for feature and label

In [None]:
# Label is the 'quality' column; features are every other remaining column.
Y = df['quality']
X = df.drop(columns=['quality'])

In [None]:
Y.value_counts()

In [None]:
pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE so the quality classes are balanced. A fixed
# random_state makes the synthetic samples, and every score computed after this
# cell, reproducible across runs.
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows can end up in the test set — confirm this is acceptable.
oversample = SMOTE(random_state=42)
X, Y = oversample.fit_resample(X, Y)

In [None]:
Y.value_counts()

In [None]:
Y

In [None]:
#Label Binarization

In [None]:
# Collapse the quality score into a binary label:
# 1 is for good quality (score >= 7) and 0 for bad (not good) quality.
Y = Y.ge(7).astype('int64')
Y # Displaying the label after applying label binarization

In [None]:
X

In [None]:
#Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column, keeping the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split below.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X

In [None]:
#Creating the training and testing data sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=21,
)

In [None]:
#Machine Learning Model for Classification and Evaluation Metrics

In [None]:
# Classification Model Function

def classify(model, X, Y, test_size=0.2, random_state=21, cv=5):
    """Fit ``model`` on a hold-out split and print its evaluation metrics.

    The data is split with ``train_test_split``, the model is fitted on the
    training part, and the following are printed: the accuracy on the test
    part (as a percentage), the classification report, the mean
    cross-validation score over the full ``X``/``Y`` (as a percentage), and
    the difference between the accuracy and the cross-validation score.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X : feature matrix.
    Y : label vector.
    test_size : forwarded to ``train_test_split`` (default 0.2).
    random_state : seed forwarded to ``train_test_split`` (default 21).
    cv : forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    None. All results are printed.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Training the model
    model.fit(X_train, Y_train)

    # Predicting Y_test
    pred = model.predict(X_test)

    # Accuracy Score
    acc_score = accuracy_score(Y_test, pred) * 100
    print("Accuracy Score:", acc_score)

    # Classification Report
    class_report = classification_report(Y_test, pred)
    print("\nClassification Report:\n", class_report)

    # Cross Validation Score (computed on the full X/Y, not on the split above)
    cv_score = cross_val_score(model, X, Y, cv=cv).mean() * 100
    print("Cross Validation Score:", cv_score)

    # Result of accuracy minus cv scores
    result = acc_score - cv_score
    print("\nAccuracy Score - Cross Validation Score is", result)

In [None]:

from sklearn.preprocessing import StandardScaler

In [None]:
model=LogisticRegression()
classify(model, X, Y)

In [None]:
model=SVC(C=1.0, kernel='rbf', gamma='auto', random_state=42)
classify(model, X, Y)

In [None]:
model=DecisionTreeClassifier(random_state=21, max_depth=15)
classify(model, X, Y)

In [None]:
model=RandomForestClassifier(max_depth=15, random_state=111)
classify(model, X, Y)

In [None]:
model=KNeighborsClassifier(n_neighbors=15)
classify(model, X, Y)

In [None]:
# Fixed seed so the randomised tree construction gives the same scores on every run.
model=ExtraTreesClassifier(random_state=21)
classify(model, X, Y)

In [None]:

model=xgb.XGBClassifier(verbosity=0)
classify(model, X, Y)

In [None]:
model=lgb.LGBMClassifier()
classify(model, X, Y)

In [None]:
#Hyper parameter tuning on the best ML Model

In [None]:
# Choosing Support Vector Classifier: candidate values for each SVC argument.
svc_param = dict(
    kernel=['poly', 'sigmoid', 'rbf'],
    gamma=['scale', 'auto'],
    shrinking=[True, False],
    random_state=[21, 42, 104],
    probability=[True, False],
    decision_function_shape=['ovo', 'ovr'],
    verbose=[True, False],
)

In [None]:
# Final SVC with explicitly chosen settings, fitted on the training split and
# scored on the held-out split.
final_svc_settings = {
    'decision_function_shape': 'ovo',
    'gamma': 'scale',
    'kernel': 'rbf',
    'probability': True,
    'random_state': 21,
    'shrinking': True,
    'verbose': True,
}
Final_Model = SVC(**final_svc_settings)
Classifier = Final_Model.fit(X_train, Y_train)
fmod_pred = Final_Model.predict(X_test)
fmod_acc = accuracy_score(Y_test, fmod_pred) * 100
print("Accuracy score for the Best Model is:", fmod_acc)

In [None]:
#AUC ROC Curve

In [None]:
# ROC curve of the final model on the held-out split.
# metrics.plot_roc_curve was removed in scikit-learn 1.2; the display class's
# from_estimator constructor is its replacement and returns the same kind of
# display object (with a `figure_` attribute).
disp = metrics.RocCurveDisplay.from_estimator(Final_Model, X_test, Y_test)
disp.figure_.suptitle("ROC Curve")
plt.show()

In [None]:
#Confusion Matrix

In [None]:
class_names = df.columns  # NOTE(review): not used by the plot below
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; the display
# class's from_estimator constructor is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(Classifier, X_test, Y_test, cmap='mako')
# Classifier is the fitted SVC (Final_Model), so the title names that model
# rather than a decision tree.
plt.title('\t Confusion Matrix for Support Vector Classifier \n')
plt.show()