# RED WINE QUALITY

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the red-wine quality CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
# Preview the first few rows to check that the columns loaded as expected.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

# Exploratory Data Analysis

In [None]:
# Bar chart of how many rows fall under each 'quality' value.
sns.countplot(x='quality',data=df)

In [None]:
# The same distribution as exact counts per 'quality' value.
df['quality'].value_counts()

In [None]:
# Heatmap of the pairwise correlations between all columns;
# vmin=-1 anchors the lower end of the colour scale at -1.
sns.heatmap(df.corr(),vmin=-1,cmap='viridis')

In [None]:
# Correlation of every column with 'quality', keeping only those whose
# absolute correlation exceeds 0.25. The correlation matrix is computed
# once and reused for both the mask and the selection.
quality_corr = df.corr()['quality']
quality_corr[quality_corr.abs() > 0.25]

### adding a new column 'binary quality'

In [None]:
# Binary target: 1 when quality >= 6, otherwise 0.
# A vectorised comparison replaces the row-by-row apply/lambda; the cast
# to int64 matches the dtype that the apply-based version produced.
df['binary quality'] = (df['quality'] >= 6).astype('int64')
df.head()

In [None]:
# Histogram of the 'fixed acidity' column over all rows.
sns.histplot(x='fixed acidity',data=df)

* The majority of wines have a fixed acidity of 7–8 (fixed acidity is a concentration in g/dm³, not a pH value)

In [None]:
# Residual sugar against density with a fitted regression line,
# drawn separately for each 'binary quality' class.
sns.lmplot(x='density',y='residual sugar',data=df,hue='binary quality')

In [None]:
# Histogram of 'alcohol', coloured by 'binary quality' class.
sns.histplot(x='alcohol',data=df,hue='binary quality')

* The higher the alcohol concentration, the better the quality tends to be

In [None]:
# Joint plot of free vs. total sulfur dioxide, coloured by 'binary quality' class.
sns.jointplot(x='free sulfur dioxide',y='total sulfur dioxide',data=df,hue='binary quality')

* The lower the total sulfur dioxide, the better the quality tends to be

### Grouping by the 'quality' column

In [None]:
# Group the rows by their 'quality' value and show the mean of every
# other column within each group.
by_quality = df.groupby('quality')
by_quality.mean()

In [None]:
# 2x2 grid of line plots: per-quality mean of four selected columns.
# The group means are computed once and reused for every panel, and the
# panels are driven by a (column, colour) table instead of four
# copy-pasted plotting stanzas.
quality_means = by_quality.mean()
panels = [
    ('total sulfur dioxide', 'r'),
    ('alcohol', 'k'),
    ('volatile acidity', 'y'),
    ('sulphates', 'b'),
]

fig, axes = plt.subplots(2, 2, figsize=(18, 10))
fig.suptitle('Groupby quality mean')

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, (column, color) in zip(axes.flat, panels):
    ax.plot(quality_means.index.values,
            quality_means[column].values,
            color=color, linestyle='dashed', marker='o',
            markersize=8)
    ax.title.set_text(column + ' VS quality')

# Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set, then take 25% of the remaining
# 80% as the validation set, i.e. 60/20/20 train/validation/test overall.
# random_state is fixed so both splits are reproducible.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [None]:
# List the column names (used to pick the model inputs and target below).
df.columns

In [None]:
# Feature columns fed to the models. 'quality' is deliberately absent:
# the target below is derived from it.
input_cols = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Column the models are trained to predict.
target_col = 'binary quality'

In [None]:

# Separate the feature columns from the target for each of the three splits.
# .copy() gives independent objects, so later in-place edits do not touch
# train_df / val_df / test_df.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_col].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_col].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_col].copy()


In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training split only. Fitting on the full `df` would
# let the min/max of validation and test rows leak into preprocessing.
# train_df is used (rather than train_inputs) because it is never modified
# in place, so re-running this cell always fits on unscaled values.
# Consequence: scaled validation/test values may fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(train_df[input_cols])

In [None]:
# Apply the already-fitted scaler to each split, overwriting the feature
# columns in place.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[input_cols] = scaler.transform(split_inputs[input_cols])

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# L1-penalised logistic regression using the liblinear solver,
# with the iteration cap set to 180.
logmodel = LogisticRegression(penalty='l1',solver='liblinear',max_iter=180 )

In [None]:
# Train on the (scaled) training split.
logmodel.fit(train_inputs,train_targets)

In [None]:
# Accuracy on the training split.
logmodel.score(train_inputs,train_targets)

In [None]:
# Accuracy on the validation split.
logmodel.score(val_inputs,val_targets)

## tuning optimiser, solver and penalty for Logistic Regression

penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2'
    Used to specify the norm used in the penalization. The 'newton-cg',
    'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is
    only supported by the 'saga' solver. If 'none' (not supported by the
    liblinear solver), no regularization is applied.


solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'

    Algorithm to use in the optimization problem.

    - For small datasets, 'liblinear' is a good choice, whereas 'sag' and
      'saga' are faster for large ones.
    - For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs'
      handle multinomial loss; 'liblinear' is limited to one-versus-rest
      schemes.
    - 'newton-cg', 'lbfgs', 'sag' and 'saga' handle L2 or no penalty
    - 'liblinear' and 'saga' also handle L1 penalty
    - 'saga' also supports 'elasticnet' penalty
    - 'liblinear' does not support setting ``penalty='none'``


In [None]:
# Train/validation scores for several penalty/solver combinations.
# The numbers are hard-coded in this cell, not recomputed here.
# Each entry: (penalty, solver, train_score, val_score, extra keys).
_RECORDED_RUNS = [
    ('l2', 'lbfgs', 0.7382690302398331, 0.753125, {}),
    ('l1', 'liblinear', 0.7476538060479666, 0.775, {}),
    ('elasticnet', 'saga', 0.7382690302398331, 0.759375, {'l1_ratio': 0.5}),
    ('elasticnet', 'saga', 0.7393117831074035, 0.771875, {'l1_ratio': 0.7}),
    ('elasticnet', 'saga', 0.7413972888425443, 0.771875, {'l1_ratio': 0.8}),
    ('elasticnet', 'saga', 0.7434827945776851, 0.775, {'l1_ratio': 0.9}),
    ('l1', 'saga', 0.7434827945776851, 0.775, {}),
    ('l2', 'newton-cg', 0.7372262773722628, 0.753125, {}),
]

# Key order is kept as penalty, train_score, solver, val_score[, l1_ratio],
# since it determines the column order when the list is turned into a DataFrame.
list_of_reg_optm = [
    {'penalty': penalty, 'train_score': train_score, 'solver': solver,
     'val_score': val_score, **extra}
    for penalty, solver, train_score, val_score, extra in _RECORDED_RUNS
]


In [None]:
# Tabulate the recorded runs; entries without an 'l1_ratio' key get NaN in that column.
var_optm = pd.DataFrame(list_of_reg_optm)
var_optm

## Model's Predictions on test data

In [None]:
# Predict the binary quality label for the test split with the fitted logistic regression.
preds = logmodel.predict(test_inputs)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the test split.
print(classification_report(test_targets,preds))

# Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree limited to depth 7 and at most 32 leaf nodes;
# random_state is fixed for reproducibility.
tree = DecisionTreeClassifier(max_leaf_nodes=32,max_depth=7,max_features=None,random_state=42)

In [None]:
# Train the tree on the training split.
tree.fit(train_inputs, train_targets)

In [None]:
# Accuracy on the training split.
tree.score(train_inputs, train_targets)

In [None]:
# Accuracy on the validation split.
tree.score(val_inputs, val_targets)

In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
# Draw the top of the fitted tree (plot limited to depth 2) on a very wide canvas.
# feature_names is passed as a plain list: some scikit-learn releases
# reject a pandas Index for this parameter during parameter validation.
plt.figure(figsize=(80,20))
plot_tree(tree, feature_names=list(train_inputs.columns), max_depth=2, filled=True);

In [None]:
%%time
# Sweep max_depth from 1 to 20, recording the training and validation
# error rate (1 - accuracy) of a fresh tree at each depth.
# The intermediate values are named *_error because they hold error
# rates, not accuracies.
lst = []

for depth in range(1, 21):
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(train_inputs, train_targets)
    train_error = 1 - model.score(train_inputs, train_targets)
    val_error = 1 - model.score(val_inputs, val_targets)
    lst.append({'Max Depth': depth, 'Training Error': train_error, 'Validation Error': val_error})

In [None]:
# One row per max_depth value tried; preview the first few rows.
error_df = pd.DataFrame(lst)
error_df.head()

In [None]:
# Training and validation error curves as a function of max_depth.
plt.figure()
for error_column in ('Training Error', 'Validation Error'):
    plt.plot(error_df['Max Depth'], error_df[error_column])

plt.xlabel('Max. Depth')
plt.ylabel('Prediction Error (1 - Accuracy)')
plt.xticks(range(0,21, 2))
plt.title('Training vs. Validation Error')
# Legend entries follow the order in which the curves were drawn.
plt.legend(['Training', 'Validation'])

In [None]:
%%time
# Sweep max_leaf_nodes at a fixed max_depth of 7, recording the training
# and validation error rate (1 - accuracy) for each setting.
# The intermediate values are named *_error because they hold error
# rates, not accuracies.
# NOTE(review): a binary tree of depth 7 has at most 2**7 = 128 leaves, so
# the settings 150, 175 and 200 cannot act as the binding limit here.
lst = []
l3 = [50,75,100,125,150,175,200]
for leaf_limit in l3:
    model = DecisionTreeClassifier(max_depth=7, max_leaf_nodes=leaf_limit, random_state=42)
    model.fit(train_inputs, train_targets)
    train_error = 1 - model.score(train_inputs, train_targets)
    val_error = 1 - model.score(val_inputs, val_targets)
    lst.append({'max leaf nodes': leaf_limit, 'Training Error': train_error, 'Validation Error': val_error})

In [None]:
# One row per max_leaf_nodes value tried (this rebinds error_df from the
# earlier depth sweep); preview the first few rows.
error_df = pd.DataFrame(lst)
error_df.head()

In [None]:
# Training and validation error curves as a function of max_leaf_nodes.
plt.figure()
for error_column in ('Training Error', 'Validation Error'):
    plt.plot(error_df['max leaf nodes'], error_df[error_column])

plt.xlabel('max leaf nodes')
plt.ylabel('Prediction Error (1 - Accuracy)')
# Put a tick at exactly the leaf-node limits that were tried.
plt.xticks(l3)
plt.title('Training vs. Validation Error')
# Legend entries follow the order in which the curves were drawn.
plt.legend(['Training', 'Validation'])

### Model's Predictions on Test Data

In [None]:
# Evaluate the fitted decision tree `tree` on the test split
# (not the `model` objects created in the sweeps above).
predictions = tree.predict(test_inputs)

# classification_report was already imported in an earlier cell; confusion_matrix is new here.
from sklearn.metrics import classification_report,confusion_matrix

# Per-class precision/recall/F1, followed by the confusion matrix.
print(classification_report(test_targets,predictions))
print(confusion_matrix(test_targets,predictions))

## Future Work
* Try a random forest classifier
* Tune hyperparameters for the random forest classifier

