In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the red-wine quality CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)

# Bare last expression so the notebook renders the frame.
df

In [None]:
# Show the data type of every column in the loaded frame.
df.dtypes

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics of the columns.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Convert target values to a binary classification task:
# scores in (0, 6] become label 0, scores in (6, 9] become label 1.
bins = (0, 6, 9)
labels = [0, 1]

# Only bin while the column still holds the raw scores. pd.cut returns a
# categorical column, so after the first run `quality` no longer contains the
# original scores and cutting it again would not be meaningful. The guard makes
# this cell safe to re-run.
if df['quality'].dtype.name != 'category':
    df['quality'] = pd.cut(df['quality'], bins=bins, labels=labels)

# Class balance of the binarized target.
df.quality.value_counts()

## Data visualization


In [None]:
import seaborn as sns

# Heatmap of the pairwise column correlations of `df`
# (trailing semicolon hides the Axes repr).
sns.heatmap(data=df.corr());

In [None]:
# pyplot is imported here because this is the first cell that uses `plt`;
# without it a fresh-kernel "Run All" fails on plt.show() with a NameError.
from matplotlib import pyplot as plt

# Plot a histogram for all variables
df.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
from matplotlib import pyplot as plt 

# Fixed vs. volatile acidity, coloured by the quality label.
scatter = plt.scatter(
    x=df['fixed acidity'], y=df['volatile acidity'], c=df.quality,
    alpha=0.3
)
plt.xlabel('Fixed acidity')
plt.ylabel('Volatile acidity')
# A bare plt.legend() finds no labelled artists and draws an empty legend;
# build the entries from the colour-mapped values of the scatter instead.
plt.legend(*scatter.legend_elements(), title='Quality')
plt.show()

In [None]:
# Density vs. alcohol, coloured by the quality label.
scatter = plt.scatter(
    x=df['density'], y=df['alcohol'], c=df.quality,
    alpha=0.3
)
plt.xlabel('Density')
plt.ylabel('Alcohol')
# A bare plt.legend() finds no labelled artists and draws an empty legend;
# build the entries from the colour-mapped values of the scatter instead.
plt.legend(*scatter.legend_elements(), title='Quality')
plt.show()

In [None]:
# Density vs. pH, coloured by the quality label.
scatter = plt.scatter(
    x=df['density'], y=df['pH'], c=df.quality,
    alpha=0.3
)
plt.xlabel('Density')
plt.ylabel('pH')
# A bare plt.legend() finds no labelled artists and draws an empty legend;
# build the entries from the colour-mapped values of the scatter instead.
plt.legend(*scatter.legend_elements(), title='Quality')
plt.show()

In [None]:
# Alcohol vs. pH, coloured by the quality label.
scatter = plt.scatter(
    x=df['alcohol'], y=df['pH'], c=df.quality,
    alpha=0.3
)
plt.xlabel('Alcohol')
plt.ylabel('pH')
# A bare plt.legend() finds no labelled artists and draws an empty legend;
# build the entries from the colour-mapped values of the scatter instead.
plt.legend(*scatter.legend_elements(), title='Quality')
plt.show()

In [None]:
# Bar plot of fixed acidity grouped by the quality label, on a 10x6 figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='fixed acidity')

In [None]:
# Bar plot of volatile acidity grouped by the quality label, on a 10x6 figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='volatile acidity')

## Train Machine Learning models

In [None]:
# Separate the target column from the feature matrix.
y = df['quality']
X = df.drop(columns=['quality'])

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows (data
# leakage). `random_state` makes the split reproducible; `stratify=y` keeps the
# class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Scale data: learn mean/std from the training part only, then apply the same
# transformation to the test part. The original row index is kept so the
# features stay aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    data=scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    data=scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# Summary statistics of the scaled training features.
X_train.describe()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score

# Fit a default logistic regression on the training split and score it on the
# test split.
logistic = LogisticRegression().fit(X_train, y_train)
preds = logistic.predict(X_test)

for template, scorer in (("Accuracy: {}", accuracy_score), ("F1 score: {}", f1_score)):
    print(template.format(scorer(y_test, preds)))

### Support Vector Classifier

In [None]:
from sklearn.svm import SVC 

# Fit a default support vector classifier on the training split and score it
# on the test split.
svc = SVC().fit(X_train, y_train)
preds = svc.predict(X_test)

for template, scorer in (("Accuracy: {}", accuracy_score), ("F1 score: {}", f1_score)):
    print(template.format(scorer(y_test, preds)))

In [None]:
# Finding best parameters for our SVC model.
# `gamma` is a kernel coefficient that the linear kernel does not use, so it is
# searched for 'rbf' only. Crossing it with 'linear' (as a single dict grid
# would) refits the same linear model once per gamma value.
C_values = [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
param = [
    {'kernel': ['linear'], 'C': C_values},
    {
        'kernel': ['rbf'],
        'C': C_values,
        'gamma': [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4],
    },
]

# 3-fold cross-validated search scored on accuracy.
grid_svc = GridSearchCV(SVC(), param_grid=param, scoring='accuracy', cv=3)

grid_svc.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the SVC grid search.
grid_svc.best_params_

In [None]:
# Score the best estimator found by the SVC grid search on the test split.
model = grid_svc.best_estimator_
preds = model.predict(X_test)

for template, scorer in (("Accuracy: {}", accuracy_score), ("F1 score: {}", f1_score)):
    print(template.format(scorer(y_test, preds)))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. The forest is a randomized estimator, so a fixed
# `random_state` is needed for the reported scores to be reproducible between
# runs.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

preds = forest.predict(X_test)

print("Accuracy: {}".format(accuracy_score(y_test, preds)))
print("F1 score: {}".format(f1_score(y_test, preds)))

In [None]:
# Grid of forest sizes (50, 70, ..., 190 trees) and maximum depths (10..50).
params = {
    'n_estimators': np.arange(50, 201, 20), 
    'max_depth': np.arange(10, 51, 10)
}

# Seed the forest so that differences between candidates come from the
# hyper-parameters rather than from run-to-run randomness, and so the selected
# parameters are reproducible.
grid_forest = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=params,
    scoring='accuracy',
    cv=3,
)

grid_forest.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the random-forest grid search.
grid_forest.best_params_

In [None]:
# Score the best estimator found by the random-forest grid search on the test
# split.
model = grid_forest.best_estimator_
preds = model.predict(X_test)

for template, scorer in (("Accuracy: {}", accuracy_score), ("F1 score: {}", f1_score)):
    print(template.format(scorer(y_test, preds)))