In [None]:
# Import basic libraries 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import Preprocessing Libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,  export_graphviz

# joblib is no longer re-exported by recent scikit-learn releases; import the
# standalone package and only fall back to the legacy location on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Load the red-wine quality dataset from CSV into a pandas DataFrame
df1=pd.read_csv('winequality_red.csv')

In [None]:
# Preview the first 10 rows of the dataset
df1.head(10)

In [None]:
# List the column names available in the dataset
df1.columns

In [None]:
# Number of rows and columns as a (rows, columns) tuple
print(df1.shape)

In [None]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df1.info()

In [None]:
# Count the missing values in each column
null_counts = df1.isnull().sum()
print(null_counts)

In [None]:
# Heat map of the boolean null mask, to visually check for missing values
null_mask = df1.isnull()
sns.heatmap(null_mask)
plt.show()

In [None]:
# Summary statistics for every column
print(df1.describe(include='all'))

In [None]:
# Number of distinct values in each column
print(df1.nunique())

In [None]:
# Number of wines recorded for each quality score
c_count=df1['quality'].value_counts()
print(c_count)

In [None]:
# Bar chart of how many wines received each quality score (the dependent variable)
sns.set(rc={'figure.figsize': (12, 7)})
quality_c = df1['quality']
df1_count = sns.countplot(data=df1, x=quality_c)
df1_count.set_title("Class Distribution")

In [None]:
# Pairwise scatter plots of all columns, coloured by quality score.
# pairplot creates and sizes its own figure, so the plt.figure(figsize=...) call
# that used to follow it could not resize the grid and only produced an extra
# empty figure; it has been replaced by plt.show().
sns.pairplot(df1, hue="quality")
plt.show()

In [None]:
# Analyse the pairwise correlation between all columns of the dataset
corr = df1.corr()
print(corr)

plt.figure(figsize=(15, 10))

# Keep only the strong relationships (|r| >= 0.4); weaker cells are left blank
strong_corr = corr.where(corr.abs() >= 0.4)
sns.heatmap(
    strong_corr,
    vmin=-1.0,
    vmax=1.0,
    cmap='viridis',
    linewidths=0.2,
    square=True,
    annot=True,
    annot_kws={"size": 12},
)

In [None]:
# Summarise how the wines are spread over the three quality bands
quality_scores = df1['quality']
n_wines = len(df1)

# Good quality: rating above 6
quality_above_6 = df1[quality_scores > 6]
n_above_6 = len(quality_above_6)

# Inappropriate (bad) quality: rating below 5
quality_below_5 = df1[quality_scores < 5]
n_below_5 = len(quality_below_5)

# Average quality: rating of 5 or 6 (both bounds inclusive)
quality_between_5 = df1[quality_scores.between(5, 6)]
n_between_5 = len(quality_between_5)

# Share of good-quality wines, as a percentage of all wines
greater_percent = 100 * n_above_6 / n_wines

# Report the results
report_lines = (
    ("Total number of wine data: {}", n_wines),
    ("Wines with rating 7 and above: {}", n_above_6),
    ("Wines with rating less than 5: {}", n_below_5),
    ("Wines with rating 5 and 6: {}", n_between_5),
    ("Percentage of wines with quality 7 and above: {:.2f}%", greater_percent),
)
for template, value in report_lines:
    print(template.format(value))


In [None]:
# Overlay the value distributions of all columns in a single histogram
df = pd.DataFrame(df1)
hist_ax = df.plot.hist(bins=15, alpha=0.5, grid=True, legend=None)
hist_ax.set_xlabel("Feature value")
hist_ax.set_title("Histogram")
plt.show()

In [None]:
# Try to reduce skewness with a square-root transform and plot the result
df_pow = np.sqrt(df1)
sqrt_ax = df_pow.plot.hist(bins=15, alpha=0.5, grid=True, legend=None)
sqrt_ax.set_xlabel("Feature value")
sqrt_ax.set_title("Histogram")
plt.show()

In [None]:
# Bucket a numeric quality score into one of three target classes
def isQuality(quality):
    """Map a wine quality score to a class label.

    Returns 1 for scores above 6, 2 for scores from 5 to 6 inclusive,
    and 0 for everything else.
    """
    if quality > 6:
        return 1
    return 2 if 5 <= quality <= 6 else 0

In [None]:
# Label every wine with its target class derived from the numeric quality score
df1['isQuality'] = df1['quality'].apply(isQuality)
# Read the counts back from df1, the frame the column was just added to.
# `df` is a separate DataFrame object and is not guaranteed to see columns
# inserted into df1 afterwards, so df['isQuality'] can raise KeyError.
print('New Classes are defined for the quality of wines:\n',df1['isQuality'].value_counts())

In [None]:
# Build the feature set. Both the raw score ('quality') and the label derived
# from it ('isQuality') are removed: leaving the label among the features would
# leak the target into the model and inflate every accuracy figure.
df2 = df1.drop(columns=['quality', 'isQuality'])
# Numerical representation of the skewness in the feature set
print(df2.skew())
df2.head()

In [None]:
# Sigmoid function used to deal with skewness
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    Works element-wise on scalars, NumPy arrays and pandas Series/DataFrames;
    pandas inputs keep their type and index.
    """
    # np.exp(-x) evaluates e**(-x) directly, without first building the
    # constant e and then raising it to a power.
    return 1.0 / (1.0 + np.exp(-x))

In [None]:
# Apply the sigmoid transform to every column whose skewness exceeds the threshold.
SKEW_THRESHOLD = 0.55
# Skewness is computed once up front: transforming one column does not change
# the skewness of any other column, so recomputing it per column was wasted work.
skewness = df2.skew()
# NOTE(review): in float64 the sigmoid returns exactly 1.0 for inputs above
# roughly 37, so wide-ranged columns may collapse to a near-constant value;
# a log or power transform may suit such columns better - confirm against the data.
# NOTE(review): re-running this cell transforms already-transformed columns again.
for col in skewness.index[skewness > SKEW_THRESHOLD]:
    df2[col] = sigmoid(df2[col])

In [None]:
# Skewness of each column after the sigmoid transform
df2.skew()

In [None]:
# Separate the data into the feature set and the target set
feature_data = df2
target_class = df1['isQuality']
for part in (feature_data, target_class):
    print(part.shape)

In [None]:
# Standardize the feature values with StandardScaler
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows contribute to the scaling
# statistics - consider fitting on the training split only.
sc = StandardScaler()
feature_data_std = sc.fit_transform(feature_data)

In [None]:
# Extract the target as a 1-D label vector.
# A column vector of shape (n, 1) triggers scikit-learn's column-vector warning
# and, when compared with the 1-D predictions, broadcasts to an (n, n) matrix,
# which breaks the accuracy computation.
target_class = df1['isQuality'].values
print(target_class.shape)

In [None]:
# Split the dataset: 28% of the rows are held out as the test set
x_train, x_test, y_train, y_test = train_test_split(
    feature_data_std,
    target_class,
    test_size=0.28,
    random_state=83,
)
# Show the shape of all four subsets (the list used to repeat x_train and
# y_test and never showed x_test or y_train)
[subset.shape for subset in [x_train, x_test, y_train, y_test]]

In [None]:
# Grid-search helper that finds the best hyper-parameters for the SVM and
# Decision Tree classifiers.
def grid_search(estimator, clf, x_train, x_test, y_train, y_test):
    """Run a grid search for the requested model and return the best estimator.

    Parameters
    ----------
    estimator : str
        Selects the search space: ``'SVM'`` or ``'DecisionTree'``.
    clf : estimator or None
        Base estimator to tune. When ``None``, a default ``SVC()`` or
        ``DecisionTreeClassifier()`` is created to match ``estimator``.
    x_train, y_train : array-like
        Training features and labels the search is fitted on. ``y_train`` may
        be 1-D or a single column; it is flattened before fitting.
    x_test, y_test : array-like
        Not used by the search; kept so existing callers keep working.

    Returns
    -------
    The ``best_estimator_`` found by ``GridSearchCV``.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported names (previously the
        function silently returned ``None``).
    """
    # Display name and parameter grid for each supported model
    search_spaces = {
        'SVM': (
            "SVM",
            {'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
             'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
        ),
        'DecisionTree': (
            "Decision Tree",
            {"criterion": ["gini", "entropy"],
             "max_depth": list(range(2, 30, 1)),
             "min_samples_leaf": list(range(5, 20, 1))},
        ),
    }
    if estimator not in search_spaces:
        raise ValueError(
            "Unknown estimator {!r}; expected one of {}".format(
                estimator, sorted(search_spaces)))
    label, param_grid = search_spaces[estimator]

    # Tune the estimator the caller passed in; build a default one otherwise
    if clf is None:
        clf = SVC() if estimator == 'SVM' else DecisionTreeClassifier()

    # scikit-learn classifiers expect a 1-D label vector
    y_train = np.ravel(y_train)

    grid = GridSearchCV(clf, param_grid)
    grid.fit(x_train, y_train)

    best_model = grid.best_estimator_
    print("Best Parameters for {}: ".format(label), best_model)
    print("Best Score for {}: ".format(label), grid.best_score_)
    print("******************************************")
    return best_model

In [None]:
# Tune, cross-validate and evaluate one classifier end to end
def apply_classification(estimator, clf, x_train, x_test, y_train, y_test):
    """Grid-search a classifier, cross-validate it and report test-set metrics.

    Parameters
    ----------
    estimator : str
        Model name passed on to ``grid_search`` and used in the printed report.
    clf : estimator
        Base estimator passed on to ``grid_search``.
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Held-out features and labels used for the final evaluation.

    Returns
    -------
    The tuned estimator returned by ``grid_search``.
    """
    # Work with 1-D label vectors. With a column vector of shape (n, 1),
    # `y_test == predictions` broadcasts to an (n, n) matrix and the accuracy
    # comes out as an array of meaningless values.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    # Find the best parameters by grid search
    grid_clf = grid_search(estimator, clf, x_train, x_test, y_train, y_test)

    # 10 random splits, each holding out 28% of the training data for validation.
    # This splitter used to be created but never passed to cross_val_score.
    cv = ShuffleSplit(n_splits=10, test_size=0.28, random_state=83)

    scores = cross_val_score(grid_clf, x_train, y_train, cv=cv)
    print("Mean Accuracy of Cross Validation: %", round(scores.mean()*100,2))
    # Two decimals, matching the mean (rounding to an integer hid small spreads)
    print("Std of Accuracy of Cross Validation: %", round(scores.std()*100,2))
    print("==============================================")

    # Predict the test data with the tuned classifier
    clf_prediction = grid_clf.predict(x_test)
    _accuracy = np.mean(y_test == clf_prediction)
    print("Accuracy of",estimator,":",_accuracy*100)

    # Confusion matrix of the tuned classifier on the test set
    _conf_matrix = confusion_matrix(y_test, clf_prediction)
    print("Confusion matrix of",estimator,":\n", _conf_matrix)
    print("==========================================")

    print("Classification Report: \n {}".format (classification_report(y_test, clf_prediction)))
    return grid_clf

In [None]:
# Tune and evaluate a Support Vector Machine; the returned estimator is shown as the cell output
svm = SVC()
apply_classification('SVM', svm, x_train, x_test, y_train, y_test)

In [None]:
# Tune and evaluate a Decision Tree and keep the tuned estimator for saving
dt = DecisionTreeClassifier()
dt_classifier = apply_classification('DecisionTree', dt, x_train, x_test, y_train, y_test)

In [None]:
# Lastly, save the tuned Decision Tree model to disk for further use
joblib.dump(dt_classifier, 'wine_quality.pkl')